urllib.py revision 49985638fa33230fdf1ef95613d918fe5e385f5e
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info().  The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
25import string
26import socket
27import os
28import sys
29import types
30
# Names exported by "from urllib import *".
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url"]

# Interpolated into URLopener.version, which becomes the default
# User-agent header.
__version__ = '1.15'    # XXX This version is not always updated :-(

# Compared against len(ftpcache) in URLopener.open_ftp() before a new
# connection is cached.
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
38
39# Helper for non-unix systems
if os.name not in ('mac', 'nt', 'riscos'):
    # Default (Unix-like) case: a path and the path part of a URL differ
    # only in %-quoting.
    def url2pathname(pathname):
        """Turn the path part of a URL into a local path by unquoting it."""
        return unquote(pathname)
    def pathname2url(pathname):
        """Turn a local path into the path part of a URL by quoting it."""
        return quote(pathname)
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    # os.name == 'riscos'
    from rourl2path import url2pathname, pathname2url
51
52# This really consists of two pieces:
53# (1) a class which handles opening of all sorts of URLs
54#     (plus assorted utilities etc.)
55# (2) a set of functions for parsing URLs
56# XXX Should these be separated out into different modules?
57
58
59# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object

    Uses the module-wide FancyURLopener, creating it on first use.
    The data argument is only passed on to open() when it is not None.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Call retrieve() on the module-wide opener and return its result.

    The opener (a FancyURLopener) is created on first use; all four
    arguments are handed to its retrieve() method unchanged.
    """
    global _urlopener
    if _urlopener:
        opener = _urlopener
    else:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the module-wide opener, if one has been created."""
    opener = _urlopener
    if not opener:
        return
    opener.cleanup()
78
79
80ftpcache = {}
81class URLopener:
82    """Class to open URLs.
83    This is a class rather than just a subroutine because we may need
84    more than one set of global protocol-specific options.
85    Note -- this is a base class for those who don't want the
86    automatic handling of errors type 302 (relocated) and 401
87    (authorization needed)."""
88
89    __tempfiles = None
90
91    version = "Python-urllib/%s" % __version__
92
93    # Constructor
94    def __init__(self, proxies=None, **x509):
95        if proxies is None:
96            proxies = getproxies()
97        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
98        self.proxies = proxies
99        self.key_file = x509.get('key_file')
100        self.cert_file = x509.get('cert_file')
101        self.addheaders = [('User-agent', self.version)]
102        self.__tempfiles = []
103        self.__unlink = os.unlink # See cleanup()
104        self.tempcache = None
105        # Undocumented feature: if you assign {} to tempcache,
106        # it is used to cache files retrieved with
107        # self.retrieve().  This is not enabled by default
108        # since it does not work for changing documents (and I
109        # haven't got the logic to check expiration headers
110        # yet).
111        self.ftpcache = ftpcache
112        # Undocumented feature: you can use a different
113        # ftp cache by assigning to the .ftpcache member;
114        # in case you want logically independent URL openers
115        # XXX This is not threadsafe.  Bah.
116
117    def __del__(self):
118        self.close()
119
120    def close(self):
121        self.cleanup()
122
123    def cleanup(self):
124        # This code sometimes runs when the rest of this module
125        # has already been deleted, so it can't use any globals
126        # or import anything.
127        if self.__tempfiles:
128            for file in self.__tempfiles:
129                try:
130                    self.__unlink(file)
131                except:
132                    pass
133            del self.__tempfiles[:]
134        if self.tempcache:
135            self.tempcache.clear()
136
137    def addheader(self, *args):
138        """Add a header to be used by the HTTP interface only
139        e.g. u.addheader('Accept', 'sound/basic')"""
140        self.addheaders.append(args)
141
142    # External interface
143    def open(self, fullurl, data=None):
144        """Use URLopener().open(file) instead of open(file, 'r')."""
145        fullurl = unwrap(toBytes(fullurl))
146        if self.tempcache and self.tempcache.has_key(fullurl):
147            filename, headers = self.tempcache[fullurl]
148            fp = open(filename, 'rb')
149            return addinfourl(fp, headers, fullurl)
150        urltype, url = splittype(fullurl)
151        if not urltype:
152            urltype = 'file'
153        if self.proxies.has_key(urltype):
154            proxy = self.proxies[urltype]
155            urltype, proxyhost = splittype(proxy)
156            host, selector = splithost(proxyhost)
157            url = (host, fullurl) # Signal special case to open_*()
158        else:
159            proxy = None
160        name = 'open_' + urltype
161        self.type = urltype
162        if '-' in name:
163            # replace - with _
164            name = '_'.join(name.split('-'))
165        if not hasattr(self, name):
166            if proxy:
167                return self.open_unknown_proxy(proxy, fullurl, data)
168            else:
169                return self.open_unknown(fullurl, data)
170        try:
171            if data is None:
172                return getattr(self, name)(url)
173            else:
174                return getattr(self, name)(url, data)
175        except socket.error, msg:
176            raise IOError, ('socket error', msg), sys.exc_info()[2]
177
178    def open_unknown(self, fullurl, data=None):
179        """Overridable interface to open unknown URL type."""
180        type, url = splittype(fullurl)
181        raise IOError, ('url error', 'unknown url type', type)
182
183    def open_unknown_proxy(self, proxy, fullurl, data=None):
184        """Overridable interface to open unknown URL type."""
185        type, url = splittype(fullurl)
186        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
187
188    # External interface
189    def retrieve(self, url, filename=None, reporthook=None, data=None):
190        """retrieve(url) returns (filename, None) for a local object
191        or (tempfilename, headers) for a remote object."""
192        url = unwrap(toBytes(url))
193        if self.tempcache and self.tempcache.has_key(url):
194            return self.tempcache[url]
195        type, url1 = splittype(url)
196        if not filename and (not type or type == 'file'):
197            try:
198                fp = self.open_local_file(url1)
199                hdrs = fp.info()
200                del fp
201                return url2pathname(splithost(url1)[1]), hdrs
202            except IOError, msg:
203                pass
204        fp = self.open(url, data)
205        headers = fp.info()
206        if not filename:
207            import tempfile
208            garbage, path = splittype(url)
209            garbage, path = splithost(path or "")
210            path, garbage = splitquery(path or "")
211            path, garbage = splitattr(path or "")
212            suffix = os.path.splitext(path)[1]
213            filename = tempfile.mktemp(suffix)
214            self.__tempfiles.append(filename)
215        result = filename, headers
216        if self.tempcache is not None:
217            self.tempcache[url] = result
218        tfp = open(filename, 'wb')
219        bs = 1024*8
220        size = -1
221        blocknum = 1
222        if reporthook:
223            if headers.has_key("content-length"):
224                size = int(headers["Content-Length"])
225            reporthook(0, bs, size)
226        block = fp.read(bs)
227        if reporthook:
228            reporthook(1, bs, size)
229        while block:
230            tfp.write(block)
231            block = fp.read(bs)
232            blocknum = blocknum + 1
233            if reporthook:
234                reporthook(blocknum, bs, size)
235        fp.close()
236        tfp.close()
237        del fp
238        del tfp
239        return result
240
241    # Each method named open_<type> knows how to open that type of URL
242
243    def open_http(self, url, data=None):
244        """Use HTTP protocol."""
245        import httplib
246        user_passwd = None
247        if type(url) is types.StringType:
248            host, selector = splithost(url)
249            if host:
250                user_passwd, host = splituser(host)
251                host = unquote(host)
252            realhost = host
253        else:
254            host, selector = url
255            urltype, rest = splittype(selector)
256            url = rest
257            user_passwd = None
258            if urltype.lower() != 'http':
259                realhost = None
260            else:
261                realhost, rest = splithost(rest)
262                if realhost:
263                    user_passwd, realhost = splituser(realhost)
264                if user_passwd:
265                    selector = "%s://%s%s" % (urltype, realhost, rest)
266            #print "proxy via http:", host, selector
267        if not host: raise IOError, ('http error', 'no host given')
268        if user_passwd:
269            import base64
270            auth = base64.encodestring(user_passwd).strip()
271        else:
272            auth = None
273        h = httplib.HTTP(host)
274        if data is not None:
275            h.putrequest('POST', selector)
276            h.putheader('Content-type', 'application/x-www-form-urlencoded')
277            h.putheader('Content-length', '%d' % len(data))
278        else:
279            h.putrequest('GET', selector)
280        if auth: h.putheader('Authorization', 'Basic %s' % auth)
281        if realhost: h.putheader('Host', realhost)
282        for args in self.addheaders: apply(h.putheader, args)
283        h.endheaders()
284        if data is not None:
285            h.send(data + '\r\n')
286        errcode, errmsg, headers = h.getreply()
287        fp = h.getfile()
288        if errcode == 200:
289            return addinfourl(fp, headers, "http:" + url)
290        else:
291            if data is None:
292                return self.http_error(url, fp, errcode, errmsg, headers)
293            else:
294                return self.http_error(url, fp, errcode, errmsg, headers, data)
295
296    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
297        """Handle http errors.
298        Derived class can override this, or provide specific handlers
299        named http_error_DDD where DDD is the 3-digit error code."""
300        # First check if there's a specific handler for this error
301        name = 'http_error_%d' % errcode
302        if hasattr(self, name):
303            method = getattr(self, name)
304            if data is None:
305                result = method(url, fp, errcode, errmsg, headers)
306            else:
307                result = method(url, fp, errcode, errmsg, headers, data)
308            if result: return result
309        return self.http_error_default(url, fp, errcode, errmsg, headers)
310
311    def http_error_default(self, url, fp, errcode, errmsg, headers):
312        """Default error handler: close the connection and raise IOError."""
313        void = fp.read()
314        fp.close()
315        raise IOError, ('http error', errcode, errmsg, headers)
316
317    if hasattr(socket, "ssl"):
318        def open_https(self, url, data=None):
319            """Use HTTPS protocol."""
320            import httplib
321            user_passwd = None
322            if type(url) is types.StringType:
323                host, selector = splithost(url)
324                if host:
325                    user_passwd, host = splituser(host)
326                    host = unquote(host)
327                realhost = host
328            else:
329                host, selector = url
330                urltype, rest = splittype(selector)
331                url = rest
332                user_passwd = None
333                if urltype.lower() != 'https':
334                    realhost = None
335                else:
336                    realhost, rest = splithost(rest)
337                    if realhost:
338                        user_passwd, realhost = splituser(realhost)
339                    if user_passwd:
340                        selector = "%s://%s%s" % (urltype, realhost, rest)
341                #print "proxy via https:", host, selector
342            if not host: raise IOError, ('https error', 'no host given')
343            if user_passwd:
344                import base64
345                auth = base64.encodestring(user_passwd).strip()
346            else:
347                auth = None
348            h = httplib.HTTPS(host, 0,
349                              key_file=self.key_file,
350                              cert_file=self.cert_file)
351            if data is not None:
352                h.putrequest('POST', selector)
353                h.putheader('Content-type',
354                            'application/x-www-form-urlencoded')
355                h.putheader('Content-length', '%d' % len(data))
356            else:
357                h.putrequest('GET', selector)
358            if auth: h.putheader('Authorization: Basic %s' % auth)
359            if realhost: h.putheader('Host', realhost)
360            for args in self.addheaders: apply(h.putheader, args)
361            h.endheaders()
362            if data is not None:
363                h.send(data + '\r\n')
364            errcode, errmsg, headers = h.getreply()
365            fp = h.getfile()
366            if errcode == 200:
367                return addinfourl(fp, headers, url)
368            else:
369                if data is None:
370                    return self.http_error(url, fp, errcode, errmsg, headers)
371                else:
372                    return self.http_error(url, fp, errcode, errmsg, headers,
373                                           data)
374
375    def open_gopher(self, url):
376        """Use Gopher protocol."""
377        import gopherlib
378        host, selector = splithost(url)
379        if not host: raise IOError, ('gopher error', 'no host given')
380        host = unquote(host)
381        type, selector = splitgophertype(selector)
382        selector, query = splitquery(selector)
383        selector = unquote(selector)
384        if query:
385            query = unquote(query)
386            fp = gopherlib.send_query(selector, query, host)
387        else:
388            fp = gopherlib.send_selector(selector, host)
389        return addinfourl(fp, noheaders(), "gopher:" + url)
390
391    def open_file(self, url):
392        """Use local file or FTP depending on form of URL."""
393        if url[:2] == '//' and url[2:3] != '/':
394            return self.open_ftp(url)
395        else:
396            return self.open_local_file(url)
397
398    def open_local_file(self, url):
399        """Use local file."""
400        import mimetypes, mimetools, StringIO
401        mtype = mimetypes.guess_type(url)[0]
402        headers = mimetools.Message(StringIO.StringIO(
403            'Content-Type: %s\n' % (mtype or 'text/plain')))
404        host, file = splithost(url)
405        if not host:
406            urlfile = file
407            if file[:1] == '/':
408                urlfile = 'file://' + file
409            return addinfourl(open(url2pathname(file), 'rb'),
410                              headers, urlfile)
411        host, port = splitport(host)
412        if not port \
413           and socket.gethostbyname(host) in (localhost(), thishost()):
414            urlfile = file
415            if file[:1] == '/':
416                urlfile = 'file://' + file
417            return addinfourl(open(url2pathname(file), 'rb'),
418                              headers, urlfile)
419        raise IOError, ('local file error', 'not on local host')
420
421    def open_ftp(self, url):
422        """Use FTP protocol."""
423        host, path = splithost(url)
424        if not host: raise IOError, ('ftp error', 'no host given')
425        host, port = splitport(host)
426        user, host = splituser(host)
427        if user: user, passwd = splitpasswd(user)
428        else: passwd = None
429        host = unquote(host)
430        user = unquote(user or '')
431        passwd = unquote(passwd or '')
432        host = socket.gethostbyname(host)
433        if not port:
434            import ftplib
435            port = ftplib.FTP_PORT
436        else:
437            port = int(port)
438        path, attrs = splitattr(path)
439        path = unquote(path)
440        dirs = path.split('/')
441        dirs, file = dirs[:-1], dirs[-1]
442        if dirs and not dirs[0]: dirs = dirs[1:]
443        if dirs and not dirs[0]: dirs[0] = '/'
444        key = user, host, port, '/'.join(dirs)
445        # XXX thread unsafe!
446        if len(self.ftpcache) > MAXFTPCACHE:
447            # Prune the cache, rather arbitrarily
448            for k in self.ftpcache.keys():
449                if k != key:
450                    v = self.ftpcache[k]
451                    del self.ftpcache[k]
452                    v.close()
453        try:
454            if not self.ftpcache.has_key(key):
455                self.ftpcache[key] = \
456                    ftpwrapper(user, passwd, host, port, dirs)
457            if not file: type = 'D'
458            else: type = 'I'
459            for attr in attrs:
460                attr, value = splitvalue(attr)
461                if attr.lower() == 'type' and \
462                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
463                    type = value.upper()
464            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
465            if retrlen is not None and retrlen >= 0:
466                import mimetools, StringIO
467                headers = mimetools.Message(StringIO.StringIO(
468                    'Content-Length: %d\n' % retrlen))
469            else:
470                headers = noheaders()
471            return addinfourl(fp, headers, "ftp:" + url)
472        except ftperrors(), msg:
473            raise IOError, ('ftp error', msg), sys.exc_info()[2]
474
475    def open_data(self, url, data=None):
476        """Use "data" URL."""
477        # ignore POSTed data
478        #
479        # syntax of data URLs:
480        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
481        # mediatype := [ type "/" subtype ] *( ";" parameter )
482        # data      := *urlchar
483        # parameter := attribute "=" value
484        import StringIO, mimetools, time
485        try:
486            [type, data] = url.split(',', 1)
487        except ValueError:
488            raise IOError, ('data error', 'bad data URL')
489        if not type:
490            type = 'text/plain;charset=US-ASCII'
491        semi = type.rfind(';')
492        if semi >= 0 and '=' not in type[semi:]:
493            encoding = type[semi+1:]
494            type = type[:semi]
495        else:
496            encoding = ''
497        msg = []
498        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
499                                            time.gmtime(time.time())))
500        msg.append('Content-type: %s' % type)
501        if encoding == 'base64':
502            import base64
503            data = base64.decodestring(data)
504        else:
505            data = unquote(data)
506        msg.append('Content-length: %d' % len(data))
507        msg.append('')
508        msg.append(data)
509        msg = '\n'.join(msg)
510        f = StringIO.StringIO(msg)
511        headers = mimetools.Message(f, 0)
512        f.fileno = None     # needed for addinfourl
513        return addinfourl(f, headers, url)
514
515
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds redirect handling for 301/302 and basic authentication for
    401; other HTTP errors are returned as ordinary file-like objects
    instead of raising IOError.
    """

    def __init__(self, *args, **kwargs):
        """Accept the same positional and keyword arguments as
        URLopener.__init__() and set up redirect/auth bookkeeping."""
        apply(URLopener.__init__, (self,) + args, kwargs)
        self.auth_cache = {}    # maps 'realm@host' to (user, passwd)
        self.tries = 0          # redirects followed so far
        self.maxtries = 10      # redirect limit; a false value disables it

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        After maxtries consecutive redirects the reply is reported as
        a 500 error instead of being followed."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the target named by the Location (or URI) header.

        Returns None when the reply carries neither header."""
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original.
        # Use the type recorded by open() so a relative redirect keeps
        # its scheme (e.g. https); fall back to 'http' when open() was
        # not used.
        newurl = basejoin(getattr(self, 'type', 'http') + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Only the 'basic' scheme is retried; anything else is passed to
        URLopener.http_error_default(), which raises IOError."""
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # self.type was recorded by open(): 'http' or 'https'.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open an http URL with user:passwd added to the host part.

        Returns None when no user name or password is available."""
        host, selector = splithost(url)
        # i is non-zero when the failed URL already carried credentials;
        # get_user_passwd() then discards its cached entry.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-open an https URL with user:passwd added to the host part.

        Returns None when no user name or password is available."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        # Go through open(), like retry_http_basic_auth(), so the retry
        # is dispatched the same way as the original request.
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host.

        A cached pair is returned unless clear_cache is true, in which
        case the entry is dropped and prompt_user_passwd() is asked
        again.  A new non-empty answer is cached."""
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks on the terminal; returns (None, None) on KeyboardInterrupt."""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
634
635
636# Utility functions
637
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup result is remembered in the module-level _localhost.
    """
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
645
_thishost = None
def thishost():
    """Return the IP address of the current host.

    The lookup result is remembered in the module-level _thishost.
    """
    global _thishost
    if _thishost:
        return _thishost
    own_name = socket.gethostname()
    _thishost = socket.gethostbyname(own_name)
    return _thishost
653
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on the first call only; the value of
    ftplib.all_errors is then kept in the module-level _ftperrors.
    """
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
662
663_noheaders = None
664def noheaders():
665    """Return an empty mimetools.Message object."""
666    global _noheaders
667    if not _noheaders:
668        import mimetools
669        import StringIO
670        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
671        _noheaders.fp.close()   # Recycle file descriptor
672    return _noheaders
673
674
675# Utility classes
676
677class ftpwrapper:
678    """Class used by open_ftp() for cache of open FTP connections."""
679
680    def __init__(self, user, passwd, host, port, dirs):
681        self.user = user
682        self.passwd = passwd
683        self.host = host
684        self.port = port
685        self.dirs = dirs
686        self.init()
687
688    def init(self):
689        import ftplib
690        self.busy = 0
691        self.ftp = ftplib.FTP()
692        self.ftp.connect(self.host, self.port)
693        self.ftp.login(self.user, self.passwd)
694        for dir in self.dirs:
695            self.ftp.cwd(dir)
696
697    def retrfile(self, file, type):
698        import ftplib
699        self.endtransfer()
700        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
701        else: cmd = 'TYPE ' + type; isdir = 0
702        try:
703            self.ftp.voidcmd(cmd)
704        except ftplib.all_errors:
705            self.init()
706            self.ftp.voidcmd(cmd)
707        conn = None
708        if file and not isdir:
709            # Use nlst to see if the file exists at all
710            try:
711                self.ftp.nlst(file)
712            except ftplib.error_perm, reason:
713                raise IOError, ('ftp error', reason), sys.exc_info()[2]
714            # Restore the transfer mode!
715            self.ftp.voidcmd(cmd)
716            # Try to retrieve as a file
717            try:
718                cmd = 'RETR ' + file
719                conn = self.ftp.ntransfercmd(cmd)
720            except ftplib.error_perm, reason:
721                if str(reason)[:3] != '550':
722                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
723        if not conn:
724            # Set transfer mode to ASCII!
725            self.ftp.voidcmd('TYPE A')
726            # Try a directory listing
727            if file: cmd = 'LIST ' + file
728            else: cmd = 'LIST'
729            conn = self.ftp.ntransfercmd(cmd)
730        self.busy = 1
731        # Pass back both a suitably decorated object and a retrieval length
732        return (addclosehook(conn[0].makefile('rb'),
733                             self.endtransfer), conn[1])
734    def endtransfer(self):
735        if not self.busy:
736            return
737        self.busy = 0
738        try:
739            self.ftp.voidresp()
740        except ftperrors():
741            pass
742
743    def close(self):
744        self.endtransfer()
745        try:
746            self.ftp.close()
747        except ftperrors():
748            pass
749
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its read methods as
    instance attributes.
    """

    def __init__(self, fp):
        """Wrap fp; readlines and fileno are copied only if fp has them."""
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             repr(id(self)), repr(self.fp))

    def close(self):
        """Drop the forwarded methods and close the wrapped file."""
        for forwarded in ('read', 'readline', 'readlines', 'fileno'):
            setattr(self, forwarded, None)
        wrapped = self.fp
        if wrapped: wrapped.close()
        self.fp = None
771
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    The hook is called with hookargs the first time close() is called.
    """

    def __init__(self, fp, closehook, *hookargs):
        """Wrap fp and remember the hook and its arguments."""
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        """Close the wrapped file, then run the hook once."""
        addbase.close(self)
        if not self.closehook:
            return
        self.closehook(*self.hookargs)
        # Forget the hook so a second close() does not run it again.
        self.closehook = None
        self.hookargs = None
786
class addinfo(addbase):
    """Open-file wrapper that also carries a headers object."""

    def __init__(self, fp, headers):
        """Wrap fp and keep headers for info()."""
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        hdrs = self.headers
        return hdrs
796
class addinfourl(addbase):
    """Open-file wrapper that also carries a headers object and a URL."""

    def __init__(self, fp, headers, url):
        """Wrap fp and keep headers and url for info() and geturl()."""
        addbase.__init__(self, fp)
        self.headers, self.url = headers, url

    def info(self):
        """Return the headers object given to the constructor."""
        hdrs = self.headers
        return hdrs

    def geturl(self):
        """Return the URL given to the constructor."""
        location = self.url
        return location
810
811
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    A url that already has a type is returned unchanged.  Otherwise
    the type, and if necessary the host and directory, are taken from
    base; any tag or query on base is dropped.
    """
    scheme, rel = splittype(url)
    if scheme:
        # url is complete (it contains a type): nothing to join
        return url
    netloc, rel = splithost(rel)
    scheme, base_rest = splittype(base) # inherit type from base
    if netloc:
        # url names its own host, so only the type can be inherited
        if not scheme:
            # no type inherited, so url must have started with //
            return url
        return scheme + '://' + netloc + rel
    netloc, base_dir = splithost(base_rest) # inherit host
    base_dir = splittag(base_dir)[0] # remove extraneous cruft
    base_dir = splitquery(base_dir)[0] # idem
    if rel[:1] != '/':
        # non-absolute path name
        lead = rel[:1]
        if lead == '#' or lead == '?':
            # rel is just a tag or query, attach to the whole base path
            cut = len(base_dir)
        else:
            # else replace last component
            cut = base_dir.rfind('/')
        if cut >= 0:
            # remove last file component
            base_dir = base_dir[:cut+1]
        elif netloc:
            # base path not absolute but host present, make absolute
            base_dir = '/'
        else:
            # else keep non-absolute
            base_dir = ''
        # Interpret ../ (important because of symlinks)
        while base_dir and rel[:3] == '../':
            rel = rel[3:]
            cut = base_dir[:-1].rfind('/')
            if cut == 0:
                base_dir = '/'
                break
            if cut > 0:
                base_dir = base_dir[:cut+1]
            else:
                base_dir = ''

        rel = base_dir + rel
    if scheme:
        if netloc:
            return scheme + '://' + netloc + rel
        return scheme + ':' + rel
    if netloc:
        return '//' + netloc + rel # don't know what this means
    return rel
866
867
868# Utilities to parse URLs (most of these return None for missing parts):
869# unwrap('<URL:type://host/path>') --> 'type://host/path'
870# splittype('type:opaquestring') --> 'type', 'opaquestring'
871# splithost('//host[:port]/path') --> 'host[:port]', '/path'
872# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
873# splitpasswd('user:passwd') -> 'user', 'passwd'
874# splitport('host:port') --> 'host', 'port'
875# splitquery('/path?query') --> '/path', 'query'
876# splittag('/path#tag') --> '/path', 'tag'
877# splitattr('/path;attr1=value1;attr2=value2;...') ->
878#   '/path', ['attr1=value1', 'attr2=value2', ...]
879# splitvalue('attr=value') --> 'attr', 'value'
880# splitgophertype('/Xselector') --> 'X', 'selector'
881# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
883
884def toBytes(url):
885    """toBytes(u"URL") --> 'URL'."""
886    # Most URL schemes require ASCII. If that changes, the conversion
887    # can be relaxed
888    if type(url) is types.UnicodeType:
889        try:
890            url = url.encode("ASCII")
891        except UnicodeError:
892            raise UnicodeError("URL " + repr(url) +
893                               " contains non-ASCII characters")
894    return url
895
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are removed, in that order.
    """
    text = url.strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[len('URL:'):].strip()
    return text
903
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned in lower case.  When url does not start with
    'scheme:' (a run of characters other than '/' and ':' followed by a
    colon), the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # compile the pattern on first use only
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    rest = url[len(scheme) + 1:]
    return scheme.lower(), rest
917
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not begin with '//'.
    """
    global _hostprog
    if _hostprog is None:
        # compile the pattern on first use only
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1, 2)
929
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the first '@'.  On a match both parts are passed
    through unquote() and returned as a two-item list; without an '@' the
    result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # compile the pattern on first use only
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
941
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; without one the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # compile the pattern on first use only
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
953
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and must consist of one or more
    digits at the very end; otherwise the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # compile the pattern on first use only
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
966
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning the port as a number.

    Return (host, defport) if there is no ':' in host; defport defaults
    to -1.  Otherwise split at the last ':' and return the host part with
    the numeric port, or with None when the text after the ':' is empty
    or is not a valid number.
    """
    global _nportprog
    if _nportprog is None:
        # compile the pattern on first use only
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1, 2)
    nport = None
    if port:
        try:
            nport = int(port)
        except ValueError:
            # not a number: report the port as None
            pass
    return host, nport
988
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; without one the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # compile the pattern on first use only
        import re
        # Raw string: '\?' is a regex escape, not a string escape, and a
        # non-raw literal containing it is flagged by newer interpreters.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
1000
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; without one the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # compile the pattern on first use only
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
1012
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    attrs = url.split(';')
    path = attrs[0]
    del attrs[0]
    return path, attrs
1018
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; without one the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # compile the pattern on first use only
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
1030
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    Returns (None, selector) unless selector starts with '/' followed by
    at least one more character.
    """
    leading, kind = selector[:1], selector[1:2]
    if leading != '/' or not kind:
        return None, selector
    return kind, selector[2:]
1036
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced by
    the character with that code.  Any other '%' (too few characters, or
    characters such as a sign or blank that int() would tolerate but that
    are not hex digits) is left in the result unchanged.
    """
    hexdigits = string.hexdigits
    pieces = s.split('%')
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            try:
                res.append(chr(int(code, 16)) + item[2:])
            except ValueError:
                # Kept as a narrow safety net instead of the former bare
                # except; presumably this can still fire when joining the
                # decoded byte with a unicode remainder -- TODO confirm.
                res.append('%' + item)
        else:
            # not a valid escape: keep the text as it was
            res.append('%' + item)
    return "".join(res)
1055
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Every '+' is turned into a space before the %xx escapes are decoded,
    so an escaped plus sign ('%2B') survives as '+'.
    """
    return unquote(s.replace('+', ' '))
1062
# Characters that quote() never escapes, whatever 'safe' argument is given.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# The safe set produced by quote()'s default argument, and a lookup table
# for it that _fast_quote() fills in on first use.
_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    """Quote s using the default safe set, via a dictionary lookup."""
    global _fast_safe
    if _fast_safe is None:
        table = {}
        for c in _fast_safe_test:
            table[c] = c
        _fast_safe = table
    keep = _fast_safe.get
    pieces = []
    for c in s:
        # safe characters map to themselves; anything else becomes %XX
        pieces.append(keep(c) or '%%%02X' % ord(c))
    return ''.join(pieces)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Replace every character of s that is neither in always_safe nor in
    the 'safe' argument by a '%XX' escape (upper-case hex of its code).

    Which characters need quoting depends on the part of the URL being
    built (path, query, ...).  RFC 2396 Uniform Resource Identifiers
    (URI): Generic Syntax lists these reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    The default is meant for the path section of a URL and therefore
    leaves '/' alone: although reserved, the slashes of a path passed to
    quote are normally already acting as separators.
    """
    allowed = always_safe + safe
    if allowed == _fast_safe_test:
        # default safe set: use the precomputed table
        return _fast_quote(s)
    pieces = []
    for c in s:
        if c in allowed:
            pieces.append(c)
        else:
            pieces.append('%%%02X' % ord(c))
    return ''.join(pieces)
1113
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Each space-separated piece of s is passed to quote() with the given
    'safe' characters and the pieces are rejoined with '+'.
    """
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
1123
1124def urlencode(query,doseq=0):
1125    """Encode a sequence of two-element tuples or dictionary into a URL query string.
1126
1127    If any values in the query arg are sequences and doseq is true, each
1128    sequence element is converted to a separate parameter.
1129
1130    If the query arg is a sequence of two-element tuples, the order of the
1131    parameters in the output will match the order of parameters in the
1132    input.
1133    """
1134
1135    if hasattr(query,"items"):
1136        # mapping objects
1137        query = query.items()
1138    else:
1139        # it's a bother at times that strings and string-like objects are
1140        # sequences...
1141        try:
1142            # non-sequence items should not work with len()
1143            x = len(query)
1144            # non-empty strings will fail this
1145            if len(query) and type(query[0]) != types.TupleType:
1146                raise TypeError
1147            # zero-length sequences of all types will get here and succeed,
1148            # but that's a minor nit - since the original implementation
1149            # allowed empty dicts that type of behavior probably should be
1150            # preserved for consistency
1151        except TypeError:
1152            ty,va,tb = sys.exc_info()
1153            raise TypeError, "not a valid non-string sequence or mapping object", tb
1154
1155    l = []
1156    if not doseq:
1157        # preserve old behavior
1158        for k, v in query:
1159            k = quote_plus(str(k))
1160            v = quote_plus(str(v))
1161            l.append(k + '=' + v)
1162    else:
1163        for k, v in query:
1164            k = quote_plus(str(k))
1165            if type(v) == types.StringType:
1166                v = quote_plus(v)
1167                l.append(k + '=' + v)
1168            elif type(v) == types.UnicodeType:
1169                # is there a reasonable way to convert to ASCII?
1170                # encode generates a string, but "replace" or "ignore"
1171                # lose information and "strict" can raise UnicodeError
1172                v = quote_plus(v.encode("ASCII","replace"))
1173                l.append(k + '=' + v)
1174            else:
1175                try:
1176                    # is this a sufficient test for sequence-ness?
1177                    x = len(v)
1178                except TypeError:
1179                    # not a sequence
1180                    v = quote_plus(str(v))
1181                    l.append(k + '=' + v)
1182                else:
1183                    # loop over the sequence
1184                    for elt in v:
1185                        l.append(k + '=' + quote_plus(str(elt)))
1186    return '&'.join(l)
1187
1188# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The environment is scanned for non-empty variables named
    <scheme>_proxy (compared in lower case); this seems to be the
    standard convention.  If you need a different way, you can pass a
    proxies dictionary to the [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        name = name.lower()
        if name.endswith(suffix):
            proxies[name[:-len(suffix)]] = value
    return proxies
1204
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        An empty dictionary is returned when the ic module cannot be
        imported or its configuration cannot be opened.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.  The dictionary is
        empty when _winreg is unavailable, when the proxy is disabled,
        or when the registry values are missing or malformed.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            # Close the key even when a query or the parsing below fails;
            # previously the handle was only closed on the success path.
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            proxies[protocol] = '%s://%s' % (protocol, address)
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # Whatever was gathered so far (possibly nothing) is returned.
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
else:
    # By default use environment variables
    getproxies = getproxies_environment
1289
1290
1291# Test and time quote() and unquote()
def test1():
    """Round-trip all 256 character codes through quote()/unquote() and time it.

    Prints 'Wrong!' when the round trip does not give back the input,
    then the reprs of the input, the quoted form and the unquoted form,
    and finally the elapsed time in seconds.
    """
    import time
    s = ''.join([chr(i) for i in range(256)]) * 4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print('Wrong!')
    print(repr(s))
    print(repr(qs))
    print(repr(uqs))
    print('%s sec' % round(t1 - t0, 3))
1307
1308
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line; test() passes this hook to urlretrieve()."""
    # Report during remote transfers
    message = "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
    print(message)
1313
1314# Test program
def test(args=[]):
    """Retrieve each URL in args and print its headers and contents.

    When args is empty a built-in list of sample URLs is used (with an
    https one added if URLopener has an open_https method).  The default
    list object is never modified: it is only replaced by a fresh list.
    urlcleanup() is always called at the end, even after an error.
    """
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print('%s %s %s' % ('-'*10, url, '-'*10))
            fn, h = urlretrieve(url, None, reporthook)
            print(fn)
            if h:
                print('======')
                for k in h.keys():
                    print('%s: %s' % (k, h[k]))
                print('======')
            # Close the file explicitly instead of relying on 'del fp'
            # and the garbage collector to release the handle.
            fp = open(fn, 'rb')
            try:
                data = fp.read()
            finally:
                fp.close()
            if '\r' in data:
                # strip carriage returns before printing
                data = data.replace('\r', '')
            print(data)
        print('-'*40)
    finally:
        urlcleanup()
1347
def main():
    """Command-line entry point: python urllib.py [-t] [url ...].

    -h prints a usage summary.  -t runs test() on the given URLs (and
    test1() first when -t is given more than once).  Without -t the
    contents of each URL are written to standard output.
    """
    import getopt
    import sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error:
        print(sys.exc_info()[1])
        print("Use -h for help")
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print("Usage: python urllib.py [-t] [url ...]")
            print("-t runs self-test; otherwise, contents of urls are printed")
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print("Use -h for help")
        for url in args:
            # Write the data verbatim: 'print data,' slipped separator
            # blanks and a final newline into the output.  Also close the
            # response instead of leaving it to the garbage collector.
            fp = urlopen(url)
            try:
                sys.stdout.write(fp.read())
            finally:
                fp.close()

# Run the command-line program when executed as a script
if __name__ == '__main__':
    main()
1378