urllib.py revision 6980342c348a37b80c27c3552574a112e8c29650
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol.  All you know is that is has methods read(), readline(),
18readlines(), fileno(), close() and info().  The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
import string
import socket
import os
import time
import sys
from urlparse import urljoin as basejoin

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "getproxies"]

__version__ = '1.17'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from."""
    from warnings import warnpy3k
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)

    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
    _safemaps.clear()
    ftpcache.clear()

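# Illustrative usage sketch of the module-level shortcuts above (comment
# only; the URL is just an example, not a tested endpoint):
#
#     >>> f = urlopen('http://www.python.org/')
#     >>> headers = f.info()          # a mimetools.Message instance
#     >>> data = f.read()
#     >>> f.close()
#
#     # urlretrieve() saves the data to a (temporary) file instead;
#     # reporthook, if given, is called as
#     # reporthook(blocknum, blocksize, totalsize).
#     >>> filename, headers = urlretrieve('http://www.python.org/')
#     >>> urlcleanup()                # remove temporary files created so far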
# Check for SSL support.
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# Exception raised when the downloaded size does not match the
# Content-Length header.
class ContentTooShortError(IOError):
    def __init__(self, message, content):
        IOError.__init__(self, message)
        self.content = content

ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors such as 302 (relocated) and 401
    (authorization required)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only,
        e.g. u.addheader('Accept', 'sound/basic')."""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        # Percent-encode the URL, working around lame servers that send
        # URLs containing characters such as spaces within URL parts.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if block == "":
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError("retrieval incomplete: got only %i out "
                                       "of %i bytes" % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == -1:
            if fp: fp.close()
            # something went wrong with the HTTP status line
            raise IOError, ('http protocol error', 0,
                            'got a bad status line', None)
        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if (200 <= errcode < 300):
            return addinfourl(fp, headers, "http:" + url, errcode)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if _have_ssl:
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""

            import httplib
            user_passwd = None
            proxy_passwd = None
            if isinstance(url, str):
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                # check whether the proxy contains authorization information
                proxy_passwd, host = splituser(host)
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if proxy_passwd:
                import base64
                proxy_auth = base64.b64encode(proxy_passwd).strip()
            else:
                proxy_auth = None
            if user_passwd:
                import base64
                auth = base64.b64encode(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-Type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-Length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: h.putheader(*args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == -1:
                if fp: fp.close()
                # something went wrong with the HTTP status line
                raise IOError, ('http protocol error', 0,
                                'got a bad status line', None)
            # According to RFC 2616, "2xx" code indicates that the client's
            # request was successfully received, understood, and accepted.
            if (200 <= errcode < 300):
                return addinfourl(fp, headers, "https:" + url, errcode)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, email.utils
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError, e:
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
        import mimetypes, mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO(msg)
        headers = mimetools.Message(f, 0)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)

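# Illustrative sketch only (comment, not executed): open_data() handles
# RFC 2397 style data URLs reached through open(), e.g.
#
#     >>> f = URLopener().open('data:text/plain;base64,SGVsbG8sIHdvcmxkIQ==')
#     >>> f.read()
#     'Hello, world!'
#
# The base64 payload above is a made-up example value, not taken from this
# module's tests.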

class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
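
    # A minimal sketch of how an application might supply credentials
    # without a terminal prompt (the subclass name and values below are
    # hypothetical, not part of this module):
    #
    #     class MyOpener(FancyURLopener):
    #         def prompt_user_passwd(self, host, realm):
    #             # e.g. pop up a dialog instead of reading from stdin
    #             return 'user', 'secret'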


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if _noheaders is None:
        import mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        _noheaders = mimetools.Message(StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs,
                 timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm, reason:
                        raise IOError, ('ftp error', reason), sys.exc_info()[2]
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            self.fileno = lambda: None
        if hasattr(self.fp, "__iter__"):
            self.__iter__ = self.fp.__iter__
            if hasattr(self.fp, "next"):
                self.next = self.fp.next

    def __repr__(self):
        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
                                             id(self), self.fp)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            self.closehook(*self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """Class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """Class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url, code=None):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url
        self.code = code

    def info(self):
        return self.headers

    def getcode(self):
        return self.code

    def geturl(self):
        return self.url


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
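#
# A few illustrative round trips through these helpers (doctest-style
# comment only; the example URL is made up):
#
#     >>> splittype('http://www.python.org:80/~guido/index.html#top')
#     ('http', '//www.python.org:80/~guido/index.html#top')
#     >>> splithost('//www.python.org:80/~guido/index.html#top')
#     ('www.python.org:80', '/~guido/index.html#top')
#     >>> splitport('www.python.org:80')
#     ('www.python.org', '80')
#     >>> splittag('/~guido/index.html#top')
#     ('/~guido/index.html', 'top')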

try:
    unicode
except NameError:
    def _is_unicode(x):
        return 0
else:
    def _is_unicode(x):
        return isinstance(x, unicode)

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if _is_unicode(url):
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return the given default port if no ':' is found; defaults to -1.
    Return the numerical port if a valid number is found after ':'.
    Return None if ':' is present but not followed by a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    for i in xrange(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            res[i] = '%' + item
        except UnicodeDecodeError:
            res[i] = unichr(int(item[:2], 16)) + item[2:]
    return "".join(res)

def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'."""
    s = s.replace('+', ' ')
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
_safemaps = {}

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    cachekey = (safe, always_safe)
    try:
        safe_map = _safemaps[cachekey]
    except KeyError:
        safe += always_safe
        safe_map = {}
        for i in range(256):
            c = chr(i)
            safe_map[c] = (c in safe) and c or ('%%%02X' % i)
        _safemaps[cachekey] = safe_map
    res = map(safe_map.__getitem__, s)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL, replacing ' ' with '+'."""
    if ' ' in s:
        s = quote(s, safe + ' ')
        return s.replace(' ', '+')
    return quote(s, safe)
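
# Doctest-style illustration of the quoting helpers above (comment only):
#
#     >>> quote('/~connolly/')
#     '/%7Econnolly/'
#     >>> quote_plus('a few words')
#     'a+few+words'
#     >>> unquote('abc%20def')
#     'abc def'
#     >>> unquote_plus('a+few+words')
#     'a few words'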

def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
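
# Doctest-style illustration of urlencode() (comment only; the key/value
# pairs are made-up examples):
#
#     >>> urlencode([('spam', 1), ('eggs', 'ham & cheese')])
#     'spam=1&eggs=ham+%26+cheese'
#     >>> urlencode({'key': ['v1', 'v2']}, doseq=1)
#     'key=v1&key=v2'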

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is a special case meaning "always bypass"
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
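
# Illustrative sketch of the environment convention handled above (the
# values shown are made up):
#
#     http_proxy=http://proxy.example.com:3128/   -> proxies['http']
#     ftp_proxy=http://proxy.example.com:3128/    -> proxies['ftp']
#     no_proxy=localhost,.internal.example.com    -> bypass these hosts
#
# With the no_proxy value above, for example,
# proxy_bypass_environment('db.internal.example.com') would return 1.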


if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """
        Return True iff this host shouldn't be accessed using a proxy

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        import re
        import socket
        from fnmatch import fnmatch

        hostonly, port = splitport(host)

        def ip2num(ipAddr):
            parts = ipAddr.split('.')
            parts = map(int, parts)
            if len(parts) != 4:
                parts = (parts + [0, 0, 0, 0])[:4]
            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

        proxy_settings = _get_proxy_settings()

        # Check for simple host names:
        if '.' not in host:
            if proxy_settings['exclude_simple']:
                return True

        hostIP = None

        for value in proxy_settings.get('exceptions', ()):
            # Items in the list are strings like these: *.local, 169.254/16
            if not value: continue

            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
            if m is not None:
                if hostIP is None:
                    try:
                        hostIP = socket.gethostbyname(hostonly)
                        hostIP = ip2num(hostIP)
                    except socket.error:
                        continue

                base = ip2num(m.group(1))
                mask = m.group(2)
                if mask is None:
                    mask = 8 * (m.group(1).count('.') + 1)

                else:
                    mask = int(mask[1:])
                    mask = 32 - mask

                if (hostIP >> mask) == (base >> mask):
                    return True

            elif fnmatch(host, value):
                return True

        return False


    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode, but causes problems if not converted
                # to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either the registry key wasn't found, or the value was in an
            # unexpected format.
            # proxies is already set up to be empty, so nothing to do.
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return a true value if the proxy should be bypassed for this host.

        Uses settings gathered from the environment, if specified,
        or the registry.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment

# Test and time quote() and unquote()
def test1():
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print repr(s)
    print repr(qs)
    print repr(uqs)
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
        print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()