urllib.py revision ec3dfdee6a09e1ad8d4e319876d0cf7691397245
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol.  All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info().  The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
25import string
26import socket
27import os
28import sys
29import types
30
31__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
32           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
33           "urlencode", "url2pathname", "pathname2url", "splittag",
34           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
35           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
36           "splitnport", "splitquery", "splitattr", "splitvalue",
37           "splitgophertype", "getproxies"]
38
39__version__ = '1.15'    # XXX This version is not always updated :-(
40
41MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
42
43# Helper for non-unix systems
if os.name not in ('mac', 'nt', 'riscos'):
    # Generic fallback: a path and the path part of a URL differ only
    # in quoting, so plain quote()/unquote() is enough.
    def url2pathname(pathname):
        """Convert the path part of a URL to a local path by unquoting it."""
        return unquote(pathname)
    def pathname2url(pathname):
        """Convert a local path to the path part of a URL by quoting it."""
        return quote(pathname)
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    # Only 'riscos' is left at this point.
    from rourl2path import url2pathname, pathname2url
55
56# This really consists of two pieces:
57# (1) a class which handles opening of all sorts of URLs
58#     (plus assorted utilities etc.)
59# (2) a set of functions for parsing URLs
60# XXX Should these be separated out into different modules?
61
62
63# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    The request is served by a single module-level FancyURLopener that
    is created the first time it is needed.  data is only passed on to
    the opener's open() when it is not None.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url through the shared module-level opener.

    The FancyURLopener is created on first use; all arguments are
    handed unchanged to its retrieve() method, whose result is returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Call cleanup() on the shared opener; do nothing if none exists yet."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
82
83
84ftpcache = {}
85class URLopener:
86    """Class to open URLs.
87    This is a class rather than just a subroutine because we may need
88    more than one set of global protocol-specific options.
89    Note -- this is a base class for those who don't want the
90    automatic handling of errors type 302 (relocated) and 401
91    (authorization needed)."""
92
93    __tempfiles = None
94
95    version = "Python-urllib/%s" % __version__
96
97    # Constructor
98    def __init__(self, proxies=None, **x509):
99        if proxies is None:
100            proxies = getproxies()
101        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
102        self.proxies = proxies
103        self.key_file = x509.get('key_file')
104        self.cert_file = x509.get('cert_file')
105        self.addheaders = [('User-agent', self.version)]
106        self.__tempfiles = []
107        self.__unlink = os.unlink # See cleanup()
108        self.tempcache = None
109        # Undocumented feature: if you assign {} to tempcache,
110        # it is used to cache files retrieved with
111        # self.retrieve().  This is not enabled by default
112        # since it does not work for changing documents (and I
113        # haven't got the logic to check expiration headers
114        # yet).
115        self.ftpcache = ftpcache
116        # Undocumented feature: you can use a different
117        # ftp cache by assigning to the .ftpcache member;
118        # in case you want logically independent URL openers
119        # XXX This is not threadsafe.  Bah.
120
121    def __del__(self):
122        self.close()
123
124    def close(self):
125        self.cleanup()
126
127    def cleanup(self):
128        # This code sometimes runs when the rest of this module
129        # has already been deleted, so it can't use any globals
130        # or import anything.
131        if self.__tempfiles:
132            for file in self.__tempfiles:
133                try:
134                    self.__unlink(file)
135                except:
136                    pass
137            del self.__tempfiles[:]
138        if self.tempcache:
139            self.tempcache.clear()
140
141    def addheader(self, *args):
142        """Add a header to be used by the HTTP interface only
143        e.g. u.addheader('Accept', 'sound/basic')"""
144        self.addheaders.append(args)
145
146    # External interface
147    def open(self, fullurl, data=None):
148        """Use URLopener().open(file) instead of open(file, 'r')."""
149        fullurl = unwrap(toBytes(fullurl))
150        if self.tempcache and self.tempcache.has_key(fullurl):
151            filename, headers = self.tempcache[fullurl]
152            fp = open(filename, 'rb')
153            return addinfourl(fp, headers, fullurl)
154        urltype, url = splittype(fullurl)
155        if not urltype:
156            urltype = 'file'
157        if self.proxies.has_key(urltype):
158            proxy = self.proxies[urltype]
159            urltype, proxyhost = splittype(proxy)
160            host, selector = splithost(proxyhost)
161            url = (host, fullurl) # Signal special case to open_*()
162        else:
163            proxy = None
164        name = 'open_' + urltype
165        self.type = urltype
166        if '-' in name:
167            # replace - with _
168            name = '_'.join(name.split('-'))
169        if not hasattr(self, name):
170            if proxy:
171                return self.open_unknown_proxy(proxy, fullurl, data)
172            else:
173                return self.open_unknown(fullurl, data)
174        try:
175            if data is None:
176                return getattr(self, name)(url)
177            else:
178                return getattr(self, name)(url, data)
179        except socket.error, msg:
180            raise IOError, ('socket error', msg), sys.exc_info()[2]
181
182    def open_unknown(self, fullurl, data=None):
183        """Overridable interface to open unknown URL type."""
184        type, url = splittype(fullurl)
185        raise IOError, ('url error', 'unknown url type', type)
186
187    def open_unknown_proxy(self, proxy, fullurl, data=None):
188        """Overridable interface to open unknown URL type."""
189        type, url = splittype(fullurl)
190        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
191
192    # External interface
193    def retrieve(self, url, filename=None, reporthook=None, data=None):
194        """retrieve(url) returns (filename, None) for a local object
195        or (tempfilename, headers) for a remote object."""
196        url = unwrap(toBytes(url))
197        if self.tempcache and self.tempcache.has_key(url):
198            return self.tempcache[url]
199        type, url1 = splittype(url)
200        if not filename and (not type or type == 'file'):
201            try:
202                fp = self.open_local_file(url1)
203                hdrs = fp.info()
204                del fp
205                return url2pathname(splithost(url1)[1]), hdrs
206            except IOError, msg:
207                pass
208        fp = self.open(url, data)
209        headers = fp.info()
210        if not filename:
211            import tempfile
212            garbage, path = splittype(url)
213            garbage, path = splithost(path or "")
214            path, garbage = splitquery(path or "")
215            path, garbage = splitattr(path or "")
216            suffix = os.path.splitext(path)[1]
217            filename = tempfile.mktemp(suffix)
218            self.__tempfiles.append(filename)
219        result = filename, headers
220        if self.tempcache is not None:
221            self.tempcache[url] = result
222        tfp = open(filename, 'wb')
223        bs = 1024*8
224        size = -1
225        blocknum = 1
226        if reporthook:
227            if headers.has_key("content-length"):
228                size = int(headers["Content-Length"])
229            reporthook(0, bs, size)
230        block = fp.read(bs)
231        if reporthook:
232            reporthook(1, bs, size)
233        while block:
234            tfp.write(block)
235            block = fp.read(bs)
236            blocknum = blocknum + 1
237            if reporthook:
238                reporthook(blocknum, bs, size)
239        fp.close()
240        tfp.close()
241        del fp
242        del tfp
243        return result
244
245    # Each method named open_<type> knows how to open that type of URL
246
247    def open_http(self, url, data=None):
248        """Use HTTP protocol."""
249        import httplib
250        user_passwd = None
251        if type(url) is types.StringType:
252            host, selector = splithost(url)
253            if host:
254                user_passwd, host = splituser(host)
255                host = unquote(host)
256            realhost = host
257        else:
258            host, selector = url
259            urltype, rest = splittype(selector)
260            url = rest
261            user_passwd = None
262            if urltype.lower() != 'http':
263                realhost = None
264            else:
265                realhost, rest = splithost(rest)
266                if realhost:
267                    user_passwd, realhost = splituser(realhost)
268                if user_passwd:
269                    selector = "%s://%s%s" % (urltype, realhost, rest)
270            #print "proxy via http:", host, selector
271        if not host: raise IOError, ('http error', 'no host given')
272        if user_passwd:
273            import base64
274            auth = base64.encodestring(user_passwd).strip()
275        else:
276            auth = None
277        h = httplib.HTTP(host)
278        if data is not None:
279            h.putrequest('POST', selector)
280            h.putheader('Content-type', 'application/x-www-form-urlencoded')
281            h.putheader('Content-length', '%d' % len(data))
282        else:
283            h.putrequest('GET', selector)
284        if auth: h.putheader('Authorization', 'Basic %s' % auth)
285        if realhost: h.putheader('Host', realhost)
286        for args in self.addheaders: apply(h.putheader, args)
287        h.endheaders()
288        if data is not None:
289            h.send(data)
290        errcode, errmsg, headers = h.getreply()
291        fp = h.getfile()
292        if errcode == 200:
293            return addinfourl(fp, headers, "http:" + url)
294        else:
295            if data is None:
296                return self.http_error(url, fp, errcode, errmsg, headers)
297            else:
298                return self.http_error(url, fp, errcode, errmsg, headers, data)
299
300    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
301        """Handle http errors.
302        Derived class can override this, or provide specific handlers
303        named http_error_DDD where DDD is the 3-digit error code."""
304        # First check if there's a specific handler for this error
305        name = 'http_error_%d' % errcode
306        if hasattr(self, name):
307            method = getattr(self, name)
308            if data is None:
309                result = method(url, fp, errcode, errmsg, headers)
310            else:
311                result = method(url, fp, errcode, errmsg, headers, data)
312            if result: return result
313        return self.http_error_default(url, fp, errcode, errmsg, headers)
314
315    def http_error_default(self, url, fp, errcode, errmsg, headers):
316        """Default error handler: close the connection and raise IOError."""
317        void = fp.read()
318        fp.close()
319        raise IOError, ('http error', errcode, errmsg, headers)
320
321    if hasattr(socket, "ssl"):
322        def open_https(self, url, data=None):
323            """Use HTTPS protocol."""
324            import httplib
325            user_passwd = None
326            if type(url) is types.StringType:
327                host, selector = splithost(url)
328                if host:
329                    user_passwd, host = splituser(host)
330                    host = unquote(host)
331                realhost = host
332            else:
333                host, selector = url
334                urltype, rest = splittype(selector)
335                url = rest
336                user_passwd = None
337                if urltype.lower() != 'https':
338                    realhost = None
339                else:
340                    realhost, rest = splithost(rest)
341                    if realhost:
342                        user_passwd, realhost = splituser(realhost)
343                    if user_passwd:
344                        selector = "%s://%s%s" % (urltype, realhost, rest)
345                #print "proxy via https:", host, selector
346            if not host: raise IOError, ('https error', 'no host given')
347            if user_passwd:
348                import base64
349                auth = base64.encodestring(user_passwd).strip()
350            else:
351                auth = None
352            h = httplib.HTTPS(host, 0,
353                              key_file=self.key_file,
354                              cert_file=self.cert_file)
355            if data is not None:
356                h.putrequest('POST', selector)
357                h.putheader('Content-type',
358                            'application/x-www-form-urlencoded')
359                h.putheader('Content-length', '%d' % len(data))
360            else:
361                h.putrequest('GET', selector)
362            if auth: h.putheader('Authorization: Basic %s' % auth)
363            if realhost: h.putheader('Host', realhost)
364            for args in self.addheaders: apply(h.putheader, args)
365            h.endheaders()
366            if data is not None:
367                h.send(data)
368            errcode, errmsg, headers = h.getreply()
369            fp = h.getfile()
370            if errcode == 200:
371                return addinfourl(fp, headers, url)
372            else:
373                if data is None:
374                    return self.http_error(url, fp, errcode, errmsg, headers)
375                else:
376                    return self.http_error(url, fp, errcode, errmsg, headers,
377                                           data)
378
379    def open_gopher(self, url):
380        """Use Gopher protocol."""
381        import gopherlib
382        host, selector = splithost(url)
383        if not host: raise IOError, ('gopher error', 'no host given')
384        host = unquote(host)
385        type, selector = splitgophertype(selector)
386        selector, query = splitquery(selector)
387        selector = unquote(selector)
388        if query:
389            query = unquote(query)
390            fp = gopherlib.send_query(selector, query, host)
391        else:
392            fp = gopherlib.send_selector(selector, host)
393        return addinfourl(fp, noheaders(), "gopher:" + url)
394
395    def open_file(self, url):
396        """Use local file or FTP depending on form of URL."""
397        if url[:2] == '//' and url[2:3] != '/':
398            return self.open_ftp(url)
399        else:
400            return self.open_local_file(url)
401
402    def open_local_file(self, url):
403        """Use local file."""
404        import mimetypes, mimetools, StringIO
405        mtype = mimetypes.guess_type(url)[0]
406        headers = mimetools.Message(StringIO.StringIO(
407            'Content-Type: %s\n' % (mtype or 'text/plain')))
408        host, file = splithost(url)
409        if not host:
410            urlfile = file
411            if file[:1] == '/':
412                urlfile = 'file://' + file
413            return addinfourl(open(url2pathname(file), 'rb'),
414                              headers, urlfile)
415        host, port = splitport(host)
416        if not port \
417           and socket.gethostbyname(host) in (localhost(), thishost()):
418            urlfile = file
419            if file[:1] == '/':
420                urlfile = 'file://' + file
421            return addinfourl(open(url2pathname(file), 'rb'),
422                              headers, urlfile)
423        raise IOError, ('local file error', 'not on local host')
424
425    def open_ftp(self, url):
426        """Use FTP protocol."""
427        host, path = splithost(url)
428        if not host: raise IOError, ('ftp error', 'no host given')
429        host, port = splitport(host)
430        user, host = splituser(host)
431        if user: user, passwd = splitpasswd(user)
432        else: passwd = None
433        host = unquote(host)
434        user = unquote(user or '')
435        passwd = unquote(passwd or '')
436        host = socket.gethostbyname(host)
437        if not port:
438            import ftplib
439            port = ftplib.FTP_PORT
440        else:
441            port = int(port)
442        path, attrs = splitattr(path)
443        path = unquote(path)
444        dirs = path.split('/')
445        dirs, file = dirs[:-1], dirs[-1]
446        if dirs and not dirs[0]: dirs = dirs[1:]
447        if dirs and not dirs[0]: dirs[0] = '/'
448        key = user, host, port, '/'.join(dirs)
449        # XXX thread unsafe!
450        if len(self.ftpcache) > MAXFTPCACHE:
451            # Prune the cache, rather arbitrarily
452            for k in self.ftpcache.keys():
453                if k != key:
454                    v = self.ftpcache[k]
455                    del self.ftpcache[k]
456                    v.close()
457        try:
458            if not self.ftpcache.has_key(key):
459                self.ftpcache[key] = \
460                    ftpwrapper(user, passwd, host, port, dirs)
461            if not file: type = 'D'
462            else: type = 'I'
463            for attr in attrs:
464                attr, value = splitvalue(attr)
465                if attr.lower() == 'type' and \
466                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
467                    type = value.upper()
468            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
469            if retrlen is not None and retrlen >= 0:
470                import mimetools, StringIO
471                headers = mimetools.Message(StringIO.StringIO(
472                    'Content-Length: %d\n' % retrlen))
473            else:
474                headers = noheaders()
475            return addinfourl(fp, headers, "ftp:" + url)
476        except ftperrors(), msg:
477            raise IOError, ('ftp error', msg), sys.exc_info()[2]
478
479    def open_data(self, url, data=None):
480        """Use "data" URL."""
481        # ignore POSTed data
482        #
483        # syntax of data URLs:
484        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
485        # mediatype := [ type "/" subtype ] *( ";" parameter )
486        # data      := *urlchar
487        # parameter := attribute "=" value
488        import StringIO, mimetools, time
489        try:
490            [type, data] = url.split(',', 1)
491        except ValueError:
492            raise IOError, ('data error', 'bad data URL')
493        if not type:
494            type = 'text/plain;charset=US-ASCII'
495        semi = type.rfind(';')
496        if semi >= 0 and '=' not in type[semi:]:
497            encoding = type[semi+1:]
498            type = type[:semi]
499        else:
500            encoding = ''
501        msg = []
502        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
503                                            time.gmtime(time.time())))
504        msg.append('Content-type: %s' % type)
505        if encoding == 'base64':
506            import base64
507            data = base64.decodestring(data)
508        else:
509            data = unquote(data)
510        msg.append('Content-length: %d' % len(data))
511        msg.append('')
512        msg.append(data)
513        msg = '\n'.join(msg)
514        f = StringIO.StringIO(msg)
515        headers = mimetools.Message(f, 0)
516        f.fileno = None     # needed for addinfourl
517        return addinfourl(f, headers, url)
518
519
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Redirects (301, 302) are followed up to maxtries times, and a 401
    reply asking for basic authentication is retried with a user name
    and password from prompt_user_passwd().
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener.

        Keyword arguments (such as key_file and cert_file) are passed
        on to URLopener as well.
        """
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # realm@host -> (user, passwd)
        self.tries = 0          # redirects followed for the current request
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many redirects in a row: report it as a 500 error.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the Location (or URI) header.

        Returns None when the reply has neither header.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Only the "basic" scheme is handled; every other case is given
        to URLopener.http_error_default(), which raises IOError.
        """
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the scheme of the request being retried.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with a user name and password in the URL.

        Returns None when no user name or password is available.
        """
        host, selector = splithost(url)
        # i is non-zero when the URL already had credentials that the
        # server rejected; get_user_passwd() then drops its cached pair.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with a user name and password in the URL.

        Returns None when no user name or password is available.
        """
        host, selector = splithost(url)
        # See retry_http_basic_auth() for the meaning of i.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        A cached pair is returned unless clear_cache is true, in which
        case it is discarded and the user is prompted again.  A new
        answer is cached only if the user name or password is non-empty.
        """
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Ask for user name and password on the terminal; returns
        (None, None) if the user interrupts the prompt.
        """
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
638
639
640# Utility functions
641
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is looked up once and then remembered.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
649
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address is looked up once and then remembered.
    """
    global _thishost
    if _thishost:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
657
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported only on the first call; the value returned is
    ftplib.all_errors.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
666
667_noheaders = None
668def noheaders():
669    """Return an empty mimetools.Message object."""
670    global _noheaders
671    if not _noheaders:
672        import mimetools
673        import StringIO
674        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
675        _noheaders.fp.close()   # Recycle file descriptor
676    return _noheaders
677
678
679# Utility classes
680
681class ftpwrapper:
682    """Class used by open_ftp() for cache of open FTP connections."""
683
684    def __init__(self, user, passwd, host, port, dirs):
685        self.user = user
686        self.passwd = passwd
687        self.host = host
688        self.port = port
689        self.dirs = dirs
690        self.init()
691
692    def init(self):
693        import ftplib
694        self.busy = 0
695        self.ftp = ftplib.FTP()
696        self.ftp.connect(self.host, self.port)
697        self.ftp.login(self.user, self.passwd)
698        for dir in self.dirs:
699            self.ftp.cwd(dir)
700
701    def retrfile(self, file, type):
702        import ftplib
703        self.endtransfer()
704        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
705        else: cmd = 'TYPE ' + type; isdir = 0
706        try:
707            self.ftp.voidcmd(cmd)
708        except ftplib.all_errors:
709            self.init()
710            self.ftp.voidcmd(cmd)
711        conn = None
712        if file and not isdir:
713            # Use nlst to see if the file exists at all
714            try:
715                self.ftp.nlst(file)
716            except ftplib.error_perm, reason:
717                raise IOError, ('ftp error', reason), sys.exc_info()[2]
718            # Restore the transfer mode!
719            self.ftp.voidcmd(cmd)
720            # Try to retrieve as a file
721            try:
722                cmd = 'RETR ' + file
723                conn = self.ftp.ntransfercmd(cmd)
724            except ftplib.error_perm, reason:
725                if str(reason)[:3] != '550':
726                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
727        if not conn:
728            # Set transfer mode to ASCII!
729            self.ftp.voidcmd('TYPE A')
730            # Try a directory listing
731            if file: cmd = 'LIST ' + file
732            else: cmd = 'LIST'
733            conn = self.ftp.ntransfercmd(cmd)
734        self.busy = 1
735        # Pass back both a suitably decorated object and a retrieval length
736        return (addclosehook(conn[0].makefile('rb'),
737                             self.endtransfer), conn[1])
738    def endtransfer(self):
739        if not self.busy:
740            return
741        self.busy = 0
742        try:
743            self.ftp.voidresp()
744        except ftperrors():
745            pass
746
747    def close(self):
748        self.endtransfer()
749        try:
750            self.ftp.close()
751        except ftperrors():
752            pass
753
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps an open file object and exposes its read() and readline()
    methods, plus readlines() and fileno() when the file has them.
    """

    def __init__(self, fp):
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # These two are optional on the wrapped object.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        details = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % details

    def close(self):
        """Drop the borrowed methods and close the wrapped file."""
        for borrowed in ("read", "readline", "readlines", "fileno"):
            setattr(self, borrowed, None)
        if self.fp:
            self.fp.close()
        self.fp = None
775
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    closehook(*hookargs) is called when the object is closed, after the
    file itself has been closed, and at most once.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook, self.hookargs = closehook, hookargs

    def close(self):
        addbase.close(self)
        if not self.closehook:
            return
        self.closehook(*self.hookargs)
        # Forget the hook so that another close() does not call it again.
        self.closehook = None
        self.hookargs = None
790
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        headers = self.headers
        return headers
800
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers, self.url = headers, url

    def info(self):
        """Return the headers object given to the constructor."""
        headers = self.headers
        return headers

    def geturl(self):
        """Return the URL given to the constructor."""
        url = self.url
        return url
814
815
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    A url that carries its own type is returned unchanged.  Otherwise
    the type, and when url has no host also the host and the directory
    part of the path, are taken from base.
    """
    scheme, path = splittype(url)
    if scheme:
        # url is complete (it names its own type): nothing to combine
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base)  # the type always comes from base
    if host:
        # url brought its own host, so only the type is inherited
        if not scheme:
            # no type to inherit: url started with // and is kept as is
            return url
        return scheme + '://' + host + path
    host, basepath = splithost(basepath)        # inherit the host
    basepath, basetag = splittag(basepath)      # the base's tag is dropped
    basepath, basequery = splitquery(basepath)  # and so is its query
    if path[:1] != '/':
        # path is relative to the base path
        if path[:1] in ('#', '?'):
            # only a tag or a query: it is appended to the whole basepath
            cut = len(basepath)
        else:
            # otherwise the last component of basepath gets replaced
            cut = basepath.rfind('/')
        if cut >= 0:
            # keep everything up to and including the slash
            basepath = basepath[:cut+1]
        elif host:
            # basepath has no slash but a host is present: make absolute
            basepath = '/'
        else:
            # no slash and no host: keep the result non-absolute
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut > 0:
                basepath = basepath[:cut+1]
            elif cut == 0:
                # reached the root; remaining ../ are left in path
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if scheme and host:
        return scheme + '://' + host + path
    if scheme:
        return scheme + ':' + path
    if host:
        return '//' + host + path   # don't know what this means
    return path
872
873
874# Utilities to parse URLs (most of these return None for missing parts):
875# unwrap('<URL:type://host/path>') --> 'type://host/path'
876# splittype('type:opaquestring') --> 'type', 'opaquestring'
877# splithost('//host[:port]/path') --> 'host[:port]', '/path'
878# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
879# splitpasswd('user:passwd') -> 'user', 'passwd'
880# splitport('host:port') --> 'host', 'port'
881# splitquery('/path?query') --> '/path', 'query'
882# splittag('/path#tag') --> '/path', 'tag'
883# splitattr('/path;attr1=value1;attr2=value2;...') ->
884#   '/path', ['attr1=value1', 'attr2=value2', ...]
885# splitvalue('attr=value') --> 'attr', 'value'
886# splitgophertype('/Xselector') --> 'X', 'selector'
887# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
889
890def toBytes(url):
891    """toBytes(u"URL") --> 'URL'."""
892    # Most URL schemes require ASCII. If that changes, the conversion
893    # can be relaxed
894    if type(url) is types.UnicodeType:
895        try:
896            url = url.encode("ASCII")
897        except UnicodeError:
898            raise UnicodeError("URL " + repr(url) +
899                               " contains non-ASCII characters")
900    return url
901
902def unwrap(url):
903    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
904    url = url.strip()
905    if url[:1] == '<' and url[-1:] == '>':
906        url = url[1:-1].strip()
907    if url[:4] == 'URL:': url = url[4:].strip()
908    return url
909
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url has no leading
    'type:' part the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # found.end() is the position just after the colon
    return found.group(1).lower(), url[found.end():]
923
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#', so a URL without
    a path such as '//host?query' yields ('host', '/?query') instead
    of treating the query as part of the host.  A non-empty remainder
    that does not start with '/' gets one prepended.  When url does
    not start with '//' the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _hostprog = re.compile('^//([^/?#]*)(.*)$')

    match = _hostprog.match(url)
    if match is None:
        return None, url
    host, path = match.group(1, 2)
    if path and path[:1] != '/':
        # remainder is a bare query or tag: root it at '/'
        path = '/' + path
    return host, path
935
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the first '@' and both parts are passed
    through unquote(); they come back as a two-element list.  Without
    an '@' the result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
947
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
959
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and is only recognised when
    everything after the last ':' consists of digits; otherwise the
    result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
972
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.groups()
    nport = None
    if port:
        # an empty port stays None, just like one that int() rejects
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
994
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.groups()
1006
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
1018
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    attrs = url.split(';')
    path = attrs.pop(0)
    return path, attrs
1024
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without a '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # the pattern is compiled on first use and then kept
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
1036
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The result is (None, selector) unless selector starts with '/'
    followed by at least one more character.
    """
    if len(selector) < 2 or selector[0] != '/':
        return None, selector
    return selector[1], selector[2:]
1042
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced
    by the character with that code.  A '%' followed by anything else
    (too few characters, or non-hex characters such as '%zz', '%+1'
    or '% 1') is left in the result unchanged.
    """
    hexdigits = '0123456789ABCDEFabcdef'
    pieces = s.split('%')
    # Text before the first '%' never needs decoding.
    res = [pieces[0]]
    for item in pieces[1:]:
        decoded = None
        # Check the digits explicitly: int(x, 16) alone would also accept
        # signs and surrounding whitespace.
        if item[1:2] and item[0] in hexdigits and item[1] in hexdigits:
            try:
                decoded = chr(int(item[:2], 16)) + item[2:]
            except ValueError:
                # Unicode errors are ValueErrors too: the decoded
                # character could not be joined with the rest of the
                # item, so the escape is kept as it was written.
                decoded = None
        if decoded is None:
            decoded = '%' + item
        res.append(decoded)
    return "".join(res)
1061
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    # every '+' becomes a space before the %-escapes are decoded
    return unquote(s.replace('+', ' '))
1068
1069always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1070               'abcdefghijklmnopqrstuvwxyz'
1071               '0123456789' '_.-')
1072
1073_fast_safe_test = always_safe + '/'
1074_fast_safe = None
1075
1076def _fast_quote(s):
1077    global _fast_safe
1078    if _fast_safe is None:
1079        _fast_safe = {}
1080        for c in _fast_safe_test:
1081            _fast_safe[c] = c
1082    res = list(s)
1083    for i in range(len(res)):
1084        c = res[i]
1085        if not _fast_safe.has_key(c):
1086            res[i] = '%%%02X' % ord(c)
1087    return ''.join(res)
1088
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters listed in the safe argument are left alone in addition
    to letters, digits and '_.-'.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        # default safe set: use the dictionary based helper
        return _fast_quote(s)
    pieces = []
    for c in s:
        if c in safe:
            pieces.append(c)
        else:
            pieces.append('%%%02X' % ord(c))
    return ''.join(pieces)
1119
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    # Quote the space-separated pieces one by one and glue them back
    # together with '+'; a string without spaces is a single piece.
    return '+'.join([quote(piece, safe) for piece in s.split(' ')])
1129
1130def urlencode(query,doseq=0):
1131    """Encode a sequence of two-element tuples or dictionary into a URL query string.
1132
1133    If any values in the query arg are sequences and doseq is true, each
1134    sequence element is converted to a separate parameter.
1135
1136    If the query arg is a sequence of two-element tuples, the order of the
1137    parameters in the output will match the order of parameters in the
1138    input.
1139    """
1140
1141    if hasattr(query,"items"):
1142        # mapping objects
1143        query = query.items()
1144    else:
1145        # it's a bother at times that strings and string-like objects are
1146        # sequences...
1147        try:
1148            # non-sequence items should not work with len()
1149            x = len(query)
1150            # non-empty strings will fail this
1151            if len(query) and type(query[0]) != types.TupleType:
1152                raise TypeError
1153            # zero-length sequences of all types will get here and succeed,
1154            # but that's a minor nit - since the original implementation
1155            # allowed empty dicts that type of behavior probably should be
1156            # preserved for consistency
1157        except TypeError:
1158            ty,va,tb = sys.exc_info()
1159            raise TypeError, "not a valid non-string sequence or mapping object", tb
1160
1161    l = []
1162    if not doseq:
1163        # preserve old behavior
1164        for k, v in query:
1165            k = quote_plus(str(k))
1166            v = quote_plus(str(v))
1167            l.append(k + '=' + v)
1168    else:
1169        for k, v in query:
1170            k = quote_plus(str(k))
1171            if type(v) == types.StringType:
1172                v = quote_plus(v)
1173                l.append(k + '=' + v)
1174            elif type(v) == types.UnicodeType:
1175                # is there a reasonable way to convert to ASCII?
1176                # encode generates a string, but "replace" or "ignore"
1177                # lose information and "strict" can raise UnicodeError
1178                v = quote_plus(v.encode("ASCII","replace"))
1179                l.append(k + '=' + v)
1180            else:
1181                try:
1182                    # is this a sufficient test for sequence-ness?
1183                    x = len(v)
1184                except TypeError:
1185                    # not a sequence
1186                    v = quote_plus(str(v))
1187                    l.append(k + '=' + v)
1188                else:
1189                    # loop over the sequence
1190                    for elt in v:
1191                        l.append(k + '=' + quote_plus(str(elt)))
1192    return '&'.join(l)
1193
1194# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy (in any
    letter case); this seems to be the standard convention.  If you
    need a different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    When REQUEST_METHOD is set the process is taken to be a CGI
    script.  There HTTP_PROXY may have been created by the web server
    from a client's "Proxy:" request header, so the 'http' entry is
    then only taken from a variable spelled exactly 'http_proxy'.

    """
    environ = os.environ.items()
    proxies = {}
    for name, value in environ:
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    if os.environ.get('REQUEST_METHOD') is not None:
        # Running as CGI: forget whatever was collected for http ...
        try:
            del proxies['http']
        except KeyError:
            pass
        # ... and accept only the all-lowercase spelling, which a
        # request header cannot produce.
        for name, value in environ:
            if value and name == 'http_proxy':
                proxies['http'] = value
    return proxies
1210
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module cannot be
        imported or ic.IC() fails.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        The dictionary is empty when _winreg is unavailable or proxies
        are disabled.  If a registry error or a malformed value is hit
        part-way, whatever was collected up to that point is returned.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            # Nested try: the key must be closed even when a query or
            # the parsing below raises.
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted
                    # to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            # Only add a scheme when the address does
                            # not already carry one.
                            if address.find('://') < 0:
                                address = '%s://%s' % (protocol, address)
                            proxies[protocol] = address
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # Nothing more to do: proxies holds what was read so far.
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1295
1296
1297# Test and time quote() and unquote()
1298def test1():
1299    import time
1300    s = ''
1301    for i in range(256): s = s + chr(i)
1302    s = s*4
1303    t0 = time.time()
1304    qs = quote(s)
1305    uqs = unquote(qs)
1306    t1 = time.time()
1307    if uqs != s:
1308        print 'Wrong!'
1309    print `s`
1310    print `qs`
1311    print `uqs`
1312    print round(t1 - t0, 3), 'sec'
1313
1314
1315def reporthook(blocknum, blocksize, totalsize):
1316    # Report during remote transfers
1317    print "Block number: %d, Block size: %d, Total size: %d" % (
1318        blocknum, blocksize, totalsize)
1319
1320# Test program
1321def test(args=[]):
1322    if not args:
1323        args = [
1324            '/etc/passwd',
1325            'file:/etc/passwd',
1326            'file://localhost/etc/passwd',
1327            'ftp://ftp.python.org/etc/passwd',
1328##          'gopher://gopher.micro.umn.edu/1/',
1329            'http://www.python.org/index.html',
1330            ]
1331        if hasattr(URLopener, "open_https"):
1332            args.append('https://synergy.as.cmu.edu/~geek/')
1333    try:
1334        for url in args:
1335            print '-'*10, url, '-'*10
1336            fn, h = urlretrieve(url, None, reporthook)
1337            print fn
1338            if h:
1339                print '======'
1340                for k in h.keys(): print k + ':', h[k]
1341                print '======'
1342            fp = open(fn, 'rb')
1343            data = fp.read()
1344            del fp
1345            if '\r' in data:
1346                table = string.maketrans("", "")
1347                data = data.translate(table, "\r")
1348            print data
1349            fn, h = None, None
1350        print '-'*40
1351    finally:
1352        urlcleanup()
1353
1354def main():
1355    import getopt, sys
1356    try:
1357        opts, args = getopt.getopt(sys.argv[1:], "th")
1358    except getopt.error, msg:
1359        print msg
1360        print "Use -h for help"
1361        return
1362    t = 0
1363    for o, a in opts:
1364        if o == '-t':
1365            t = t + 1
1366        if o == '-h':
1367            print "Usage: python urllib.py [-t] [url ...]"
1368            print "-t runs self-test;",
1369            print "otherwise, contents of urls are printed"
1370            return
1371    if t:
1372        if t > 1:
1373            test1()
1374        test(args)
1375    else:
1376        if not args:
1377            print "Use -h for help"
1378        for url in args:
1379            print urlopen(url).read(),
1380
1381# Run test program when run as a script
1382if __name__ == '__main__':
1383    main()
1384