urllib.py revision c680ae8002e955ef616741ae59338f0cde0f2ff8
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738: the "URL standard". (authoritative status)
14- RFC1630: the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol.  All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info().  The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
25import string
26import socket
27import os
28import stat
29import time
30import sys
31import types
32
33__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
34           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
35           "urlencode", "url2pathname", "pathname2url", "splittag",
36           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
37           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
38           "splitnport", "splitquery", "splitattr", "splitvalue",
39           "splitgophertype", "getproxies"]
40
41__version__ = '1.15'    # XXX This version is not always updated :-(
42
43MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
44
45# Helper for non-unix systems
46if os.name == 'mac':
47    from macurl2path import url2pathname, pathname2url
48elif os.name == 'nt':
49    from nturl2path import url2pathname, pathname2url
50elif os.name == 'riscos':
51    from rourl2path import url2pathname, pathname2url
52else:
53    def url2pathname(pathname):
54        return unquote(pathname)
55    def pathname2url(pathname):
56        return quote(pathname)
57
58# This really consists of two pieces:
59# (1) a class which handles opening of all sorts of URLs
60#     (plus assorted utilities etc.)
61# (2) a set of functions for parsing URLs
62# XXX Should these be separated out into different modules?
63
64
65# Shortcut for basic usage
66_urlopener = None
67def urlopen(url, data=None):
68    """urlopen(url [, data]) -> open file-like object"""
69    global _urlopener
70    if not _urlopener:
71        _urlopener = FancyURLopener()
72    if data is None:
73        return _urlopener.open(url)
74    else:
75        return _urlopener.open(url, data)
76def urlretrieve(url, filename=None, reporthook=None, data=None):
77    global _urlopener
78    if not _urlopener:
79        _urlopener = FancyURLopener()
80    return _urlopener.retrieve(url, filename, reporthook, data)
81def urlcleanup():
82    if _urlopener:
83        _urlopener.cleanup()
84
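# Example usage of the shortcut functions above (illustrative only; the URL is
# a placeholder):
#
#     f = urlopen('http://www.example.com/index.html')
#     print f.info().getheader('Content-Type')   # e.g. 'text/html'
#     data = f.read()
#     f.close()
#
#     filename, headers = urlretrieve('http://www.example.com/index.html')
#     # ... use the local copy in `filename' ...
#     urlcleanup()        # delete any temporary files made by urlretrieve()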
85
86ftpcache = {}
87class URLopener:
88    """Class to open URLs.
89    This is a class rather than just a subroutine because we may need
90    more than one set of global protocol-specific options.
91    Note -- this is a base class for those who don't want the
92    automatic handling of error types 302 (relocated) and 401
93    (authorization needed)."""
94
95    __tempfiles = None
96
97    version = "Python-urllib/%s" % __version__
98
99    # Constructor
100    def __init__(self, proxies=None, **x509):
101        if proxies is None:
102            proxies = getproxies()
103        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
104        self.proxies = proxies
105        self.key_file = x509.get('key_file')
106        self.cert_file = x509.get('cert_file')
107        self.addheaders = [('User-agent', self.version)]
108        self.__tempfiles = []
109        self.__unlink = os.unlink # See cleanup()
110        self.tempcache = None
111        # Undocumented feature: if you assign {} to tempcache,
112        # it is used to cache files retrieved with
113        # self.retrieve().  This is not enabled by default
114        # since it does not work for changing documents (and I
115        # haven't got the logic to check expiration headers
116        # yet).
117        self.ftpcache = ftpcache
118        # Undocumented feature: you can use a different
119        # ftp cache by assigning to the .ftpcache member;
120        # in case you want logically independent URL openers
121        # XXX This is not threadsafe.  Bah.
122
123    def __del__(self):
124        self.close()
125
126    def close(self):
127        self.cleanup()
128
129    def cleanup(self):
130        # This code sometimes runs when the rest of this module
131        # has already been deleted, so it can't use any globals
132        # or import anything.
133        if self.__tempfiles:
134            for file in self.__tempfiles:
135                try:
136                    self.__unlink(file)
137                except OSError:
138                    pass
139            del self.__tempfiles[:]
140        if self.tempcache:
141            self.tempcache.clear()
142
143    def addheader(self, *args):
144        """Add a header to be used by the HTTP interface only,
145        e.g. u.addheader('Accept', 'sound/basic')"""
146        self.addheaders.append(args)
147
148    # External interface
149    def open(self, fullurl, data=None):
150        """Use URLopener().open(file) instead of open(file, 'r')."""
151        fullurl = unwrap(toBytes(fullurl))
152        if self.tempcache and self.tempcache.has_key(fullurl):
153            filename, headers = self.tempcache[fullurl]
154            fp = open(filename, 'rb')
155            return addinfourl(fp, headers, fullurl)
156        urltype, url = splittype(fullurl)
157        if not urltype:
158            urltype = 'file'
159        if self.proxies.has_key(urltype):
160            proxy = self.proxies[urltype]
161            urltype, proxyhost = splittype(proxy)
162            host, selector = splithost(proxyhost)
163            url = (host, fullurl) # Signal special case to open_*()
164        else:
165            proxy = None
166        name = 'open_' + urltype
167        self.type = urltype
168        if '-' in name:
169            # replace - with _
170            name = '_'.join(name.split('-'))
171        if not hasattr(self, name):
172            if proxy:
173                return self.open_unknown_proxy(proxy, fullurl, data)
174            else:
175                return self.open_unknown(fullurl, data)
176        try:
177            if data is None:
178                return getattr(self, name)(url)
179            else:
180                return getattr(self, name)(url, data)
181        except socket.error, msg:
182            raise IOError, ('socket error', msg), sys.exc_info()[2]
183
184    def open_unknown(self, fullurl, data=None):
185        """Overridable interface to open unknown URL type."""
186        type, url = splittype(fullurl)
187        raise IOError, ('url error', 'unknown url type', type)
188
189    def open_unknown_proxy(self, proxy, fullurl, data=None):
190        """Overridable interface to open unknown URL type."""
191        type, url = splittype(fullurl)
192        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
193
194    # External interface
195    def retrieve(self, url, filename=None, reporthook=None, data=None):
196        """retrieve(url) returns (filename, headers) for a local object
197        or (tempfilename, headers) for a remote object."""
198        url = unwrap(toBytes(url))
199        if self.tempcache and self.tempcache.has_key(url):
200            return self.tempcache[url]
201        type, url1 = splittype(url)
202        if not filename and (not type or type == 'file'):
203            try:
204                fp = self.open_local_file(url1)
205                hdrs = fp.info()
206                del fp
207                return url2pathname(splithost(url1)[1]), hdrs
208            except IOError, msg:
209                pass
210        fp = self.open(url, data)
211        headers = fp.info()
212        if not filename:
213            import tempfile
214            garbage, path = splittype(url)
215            garbage, path = splithost(path or "")
216            path, garbage = splitquery(path or "")
217            path, garbage = splitattr(path or "")
218            suffix = os.path.splitext(path)[1]
219            filename = tempfile.mktemp(suffix)
220            self.__tempfiles.append(filename)
221        result = filename, headers
222        if self.tempcache is not None:
223            self.tempcache[url] = result
224        tfp = open(filename, 'wb')
225        bs = 1024*8
226        size = -1
227        blocknum = 1
228        if reporthook:
229            if headers.has_key("content-length"):
230                size = int(headers["content-length"])
231            reporthook(0, bs, size)
232        block = fp.read(bs)
233        if reporthook:
234            reporthook(1, bs, size)
235        while block:
236            tfp.write(block)
237            block = fp.read(bs)
238            blocknum = blocknum + 1
239            if reporthook:
240                reporthook(blocknum, bs, size)
241        fp.close()
242        tfp.close()
243        del fp
244        del tfp
245        return result
246
247    # Each method named open_<type> knows how to open that type of URL
248
249    def open_http(self, url, data=None):
250        """Use HTTP protocol."""
251        import httplib
252        user_passwd = None
253        if type(url) is types.StringType:
254            host, selector = splithost(url)
255            if host:
256                user_passwd, host = splituser(host)
257                host = unquote(host)
258            realhost = host
259        else:
260            host, selector = url
261            urltype, rest = splittype(selector)
262            url = rest
263            user_passwd = None
264            if urltype.lower() != 'http':
265                realhost = None
266            else:
267                realhost, rest = splithost(rest)
268                if realhost:
269                    user_passwd, realhost = splituser(realhost)
270                if user_passwd:
271                    selector = "%s://%s%s" % (urltype, realhost, rest)
272                if proxy_bypass(realhost):
273                    host = realhost
274
275            #print "proxy via http:", host, selector
276        if not host: raise IOError, ('http error', 'no host given')
277        if user_passwd:
278            import base64
279            auth = base64.encodestring(user_passwd).strip()
280        else:
281            auth = None
282        h = httplib.HTTP(host)
283        if data is not None:
284            h.putrequest('POST', selector)
285            h.putheader('Content-type', 'application/x-www-form-urlencoded')
286            h.putheader('Content-length', '%d' % len(data))
287        else:
288            h.putrequest('GET', selector)
289        if auth: h.putheader('Authorization', 'Basic %s' % auth)
290        if realhost: h.putheader('Host', realhost)
291        for args in self.addheaders: apply(h.putheader, args)
292        h.endheaders()
293        if data is not None:
294            h.send(data)
295        errcode, errmsg, headers = h.getreply()
296        fp = h.getfile()
297        if errcode == 200:
298            return addinfourl(fp, headers, "http:" + url)
299        else:
300            if data is None:
301                return self.http_error(url, fp, errcode, errmsg, headers)
302            else:
303                return self.http_error(url, fp, errcode, errmsg, headers, data)
304
305    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
306        """Handle http errors.
307        Derived class can override this, or provide specific handlers
308        named http_error_DDD where DDD is the 3-digit error code."""
309        # First check if there's a specific handler for this error
310        name = 'http_error_%d' % errcode
311        if hasattr(self, name):
312            method = getattr(self, name)
313            if data is None:
314                result = method(url, fp, errcode, errmsg, headers)
315            else:
316                result = method(url, fp, errcode, errmsg, headers, data)
317            if result: return result
318        return self.http_error_default(url, fp, errcode, errmsg, headers)
319
320    def http_error_default(self, url, fp, errcode, errmsg, headers):
321        """Default error handler: close the connection and raise IOError."""
322        void = fp.read()
323        fp.close()
324        raise IOError, ('http error', errcode, errmsg, headers)
325
326    if hasattr(socket, "ssl"):
327        def open_https(self, url, data=None):
328            """Use HTTPS protocol."""
329            import httplib
330            user_passwd = None
331            if type(url) is types.StringType:
332                host, selector = splithost(url)
333                if host:
334                    user_passwd, host = splituser(host)
335                    host = unquote(host)
336                realhost = host
337            else:
338                host, selector = url
339                urltype, rest = splittype(selector)
340                url = rest
341                user_passwd = None
342                if urltype.lower() != 'https':
343                    realhost = None
344                else:
345                    realhost, rest = splithost(rest)
346                    if realhost:
347                        user_passwd, realhost = splituser(realhost)
348                    if user_passwd:
349                        selector = "%s://%s%s" % (urltype, realhost, rest)
350                #print "proxy via https:", host, selector
351            if not host: raise IOError, ('https error', 'no host given')
352            if user_passwd:
353                import base64
354                auth = base64.encodestring(user_passwd).strip()
355            else:
356                auth = None
357            h = httplib.HTTPS(host, 0,
358                              key_file=self.key_file,
359                              cert_file=self.cert_file)
360            if data is not None:
361                h.putrequest('POST', selector)
362                h.putheader('Content-type',
363                            'application/x-www-form-urlencoded')
364                h.putheader('Content-length', '%d' % len(data))
365            else:
366                h.putrequest('GET', selector)
367            if auth: h.putheader('Authorization', 'Basic %s' % auth)
368            if realhost: h.putheader('Host', realhost)
369            for args in self.addheaders: apply(h.putheader, args)
370            h.endheaders()
371            if data is not None:
372                h.send(data)
373            errcode, errmsg, headers = h.getreply()
374            fp = h.getfile()
375            if errcode == 200:
376                return addinfourl(fp, headers, url)
377            else:
378                if data is None:
379                    return self.http_error(url, fp, errcode, errmsg, headers)
380                else:
381                    return self.http_error(url, fp, errcode, errmsg, headers,
382                                           data)
383
384    def open_gopher(self, url):
385        """Use Gopher protocol."""
386        import gopherlib
387        host, selector = splithost(url)
388        if not host: raise IOError, ('gopher error', 'no host given')
389        host = unquote(host)
390        type, selector = splitgophertype(selector)
391        selector, query = splitquery(selector)
392        selector = unquote(selector)
393        if query:
394            query = unquote(query)
395            fp = gopherlib.send_query(selector, query, host)
396        else:
397            fp = gopherlib.send_selector(selector, host)
398        return addinfourl(fp, noheaders(), "gopher:" + url)
399
400    def open_file(self, url):
401        """Use local file or FTP depending on form of URL."""
402        if url[:2] == '//' and url[2:3] != '/':
403            return self.open_ftp(url)
404        else:
405            return self.open_local_file(url)
406
407    def open_local_file(self, url):
408        """Use local file."""
409        import mimetypes, mimetools, rfc822, StringIO
410        host, file = splithost(url)
411        localname = url2pathname(file)
412        stats = os.stat(localname)
413        size = stats[stat.ST_SIZE]
414        modified = rfc822.formatdate(stats[stat.ST_MTIME])
415        mtype = mimetypes.guess_type(url)[0]
416        headers = mimetools.Message(StringIO.StringIO(
417            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
418            (mtype or 'text/plain', size, modified)))
419        if not host:
420            urlfile = file
421            if file[:1] == '/':
422                urlfile = 'file://' + file
423            return addinfourl(open(localname, 'rb'),
424                              headers, urlfile)
425        host, port = splitport(host)
426        if not port \
427           and socket.gethostbyname(host) in (localhost(), thishost()):
428            urlfile = file
429            if file[:1] == '/':
430                urlfile = 'file://' + file
431            return addinfourl(open(localname, 'rb'),
432                              headers, urlfile)
433        raise IOError, ('local file error', 'not on local host')
434
435    def open_ftp(self, url):
436        """Use FTP protocol."""
437        import mimetypes, mimetools, StringIO
438        host, path = splithost(url)
439        if not host: raise IOError, ('ftp error', 'no host given')
440        host, port = splitport(host)
441        user, host = splituser(host)
442        if user: user, passwd = splitpasswd(user)
443        else: passwd = None
444        host = unquote(host)
445        user = unquote(user or '')
446        passwd = unquote(passwd or '')
447        host = socket.gethostbyname(host)
448        if not port:
449            import ftplib
450            port = ftplib.FTP_PORT
451        else:
452            port = int(port)
453        path, attrs = splitattr(path)
454        path = unquote(path)
455        dirs = path.split('/')
456        dirs, file = dirs[:-1], dirs[-1]
457        if dirs and not dirs[0]: dirs = dirs[1:]
458        if dirs and not dirs[0]: dirs[0] = '/'
459        key = user, host, port, '/'.join(dirs)
460        # XXX thread unsafe!
461        if len(self.ftpcache) > MAXFTPCACHE:
462            # Prune the cache, rather arbitrarily
463            for k in self.ftpcache.keys():
464                if k != key:
465                    v = self.ftpcache[k]
466                    del self.ftpcache[k]
467                    v.close()
468        try:
469            if not self.ftpcache.has_key(key):
470                self.ftpcache[key] = \
471                    ftpwrapper(user, passwd, host, port, dirs)
472            if not file: type = 'D'
473            else: type = 'I'
474            for attr in attrs:
475                attr, value = splitvalue(attr)
476                if attr.lower() == 'type' and \
477                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
478                    type = value.upper()
479            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
480            mtype = mimetypes.guess_type("ftp:" + url)[0]
481            headers = ""
482            if mtype:
483                headers += "Content-Type: %s\n" % mtype
484            if retrlen is not None and retrlen >= 0:
485                headers += "Content-Length: %d\n" % retrlen
486            headers = mimetools.Message(StringIO.StringIO(headers))
487            return addinfourl(fp, headers, "ftp:" + url)
488        except ftperrors(), msg:
489            raise IOError, ('ftp error', msg), sys.exc_info()[2]
490
491    def open_data(self, url, data=None):
492        """Use "data" URL."""
493        # ignore POSTed data
494        #
495        # syntax of data URLs:
496        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
497        # mediatype := [ type "/" subtype ] *( ";" parameter )
498        # data      := *urlchar
499        # parameter := attribute "=" value
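        # For example (illustrative URL only),
        #   'data:text/plain;base64,SGVsbG8sIHdvcmxkIQ=='
        # yields a file-like object whose read() returns 'Hello, world!' and
        # whose info() reports Content-type: text/plain.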
500        import StringIO, mimetools, time
501        try:
502            [type, data] = url.split(',', 1)
503        except ValueError:
504            raise IOError, ('data error', 'bad data URL')
505        if not type:
506            type = 'text/plain;charset=US-ASCII'
507        semi = type.rfind(';')
508        if semi >= 0 and '=' not in type[semi:]:
509            encoding = type[semi+1:]
510            type = type[:semi]
511        else:
512            encoding = ''
513        msg = []
514        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
515                                            time.gmtime(time.time())))
516        msg.append('Content-type: %s' % type)
517        if encoding == 'base64':
518            import base64
519            data = base64.decodestring(data)
520        else:
521            data = unquote(data)
522        msg.append('Content-length: %d' % len(data))
523        msg.append('')
524        msg.append(data)
525        msg = '\n'.join(msg)
526        f = StringIO.StringIO(msg)
527        headers = mimetools.Message(f, 0)
528        f.fileno = None     # needed for addinfourl
529        return addinfourl(f, headers, url)
530
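# Illustrative sketch (not part of the original module): open() dispatches on
# the URL scheme to a method named open_<scheme>, so support for a new scheme
# can be added in a subclass simply by defining such a method.  The 'echo'
# scheme and class name below are invented for demonstration purposes.
#
#     import StringIO
#
#     class EchoURLopener(URLopener):
#         def open_echo(self, url):
#             # open() has already stripped the 'echo:' type prefix from url
#             return addinfourl(StringIO.StringIO(url), noheaders(),
#                               'echo:' + url)
#
#     print EchoURLopener().open('echo:hello').read()    # prints 'hello'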
531
532class FancyURLopener(URLopener):
533    """Derived class with handlers for errors we can handle (perhaps)."""
534
535    def __init__(self, *args):
536        apply(URLopener.__init__, (self,) + args)
537        self.auth_cache = {}
538        self.tries = 0
539        self.maxtries = 10
540
541    def http_error_default(self, url, fp, errcode, errmsg, headers):
542        """Default error handling -- don't raise an exception."""
543        return addinfourl(fp, headers, "http:" + url)
544
545    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
546        """Error 302 -- relocated (temporarily)."""
547        self.tries += 1
548        if self.maxtries and self.tries >= self.maxtries:
549            if hasattr(self, "http_error_500"):
550                meth = self.http_error_500
551            else:
552                meth = self.http_error_default
553            self.tries = 0
554            return meth(url, fp, 500,
555                        "Internal Server Error: Redirect Recursion", headers)
556        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
557                                        data)
558        self.tries = 0
559        return result
560
561    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
562        if headers.has_key('location'):
563            newurl = headers['location']
564        elif headers.has_key('uri'):
565            newurl = headers['uri']
566        else:
567            return
568        void = fp.read()
569        fp.close()
570        # In case the server sent a relative URL, join with original:
571        newurl = basejoin(self.type + ":" + url, newurl)
572        if data is None:
573            return self.open(newurl)
574        else:
575            return self.open(newurl, data)
576
577    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
578        """Error 301 -- also relocated (permanently)."""
579        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
580
581    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
582        """Error 401 -- authentication required.
583        See this URL for a description of the basic authentication scheme:
584        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
585        if not headers.has_key('www-authenticate'):
586            URLopener.http_error_default(self, url, fp,
587                                         errcode, errmsg, headers)
588        stuff = headers['www-authenticate']
589        import re
590        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
591        if not match:
592            URLopener.http_error_default(self, url, fp,
593                                         errcode, errmsg, headers)
594        scheme, realm = match.groups()
595        if scheme.lower() != 'basic':
596            URLopener.http_error_default(self, url, fp,
597                                         errcode, errmsg, headers)
598        name = 'retry_' + self.type + '_basic_auth'
599        if data is None:
600            return getattr(self,name)(url, realm)
601        else:
602            return getattr(self,name)(url, realm, data)
603
604    def retry_http_basic_auth(self, url, realm, data=None):
605        host, selector = splithost(url)
606        i = host.find('@') + 1
607        host = host[i:]
608        user, passwd = self.get_user_passwd(host, realm, i)
609        if not (user or passwd): return None
610        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
611        newurl = 'http://' + host + selector
612        if data is None:
613            return self.open(newurl)
614        else:
615            return self.open(newurl, data)
616
617    def retry_https_basic_auth(self, url, realm, data=None):
618        host, selector = splithost(url)
619        i = host.find('@') + 1
620        host = host[i:]
621        user, passwd = self.get_user_passwd(host, realm, i)
622        if not (user or passwd): return None
623        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
624        newurl = '//' + host + selector
625        return self.open_https(newurl, data)
626
627    def get_user_passwd(self, host, realm, clear_cache = 0):
628        key = realm + '@' + host.lower()
629        if self.auth_cache.has_key(key):
630            if clear_cache:
631                del self.auth_cache[key]
632            else:
633                return self.auth_cache[key]
634        user, passwd = self.prompt_user_passwd(host, realm)
635        if user or passwd: self.auth_cache[key] = (user, passwd)
636        return user, passwd
637
638    def prompt_user_passwd(self, host, realm):
639        """Override this in a GUI environment!"""
640        import getpass
641        try:
642            user = raw_input("Enter username for %s at %s: " % (realm,
643                                                                host))
644            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
645                (user, realm, host))
646            return user, passwd
647        except KeyboardInterrupt:
648            print
649            return None, None
650
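# Illustrative sketch (not part of the original module): a non-interactive
# program would typically subclass FancyURLopener, overriding
# prompt_user_passwd() to supply stored credentials and adding http_error_DDD
# handlers as needed.  The class name and credentials below are placeholders.
#
#     class NonInteractiveURLopener(FancyURLopener):
#         def __init__(self, user, passwd, *args):
#             FancyURLopener.__init__(self, *args)
#             self.user, self.passwd = user, passwd
#         def prompt_user_passwd(self, host, realm):
#             return self.user, self.passwd
#         def http_error_404(self, url, fp, errcode, errmsg, headers, data=None):
#             fp.close()
#             raise IOError, ('http error', errcode, errmsg, headers)
#
#     opener = NonInteractiveURLopener('someuser', 'somepassword')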
651
652# Utility functions
653
654_localhost = None
655def localhost():
656    """Return the IP address of the magic hostname 'localhost'."""
657    global _localhost
658    if not _localhost:
659        _localhost = socket.gethostbyname('localhost')
660    return _localhost
661
662_thishost = None
663def thishost():
664    """Return the IP address of the current host."""
665    global _thishost
666    if not _thishost:
667        _thishost = socket.gethostbyname(socket.gethostname())
668    return _thishost
669
670_ftperrors = None
671def ftperrors():
672    """Return the set of errors raised by the FTP class."""
673    global _ftperrors
674    if not _ftperrors:
675        import ftplib
676        _ftperrors = ftplib.all_errors
677    return _ftperrors
678
679_noheaders = None
680def noheaders():
681    """Return an empty mimetools.Message object."""
682    global _noheaders
683    if not _noheaders:
684        import mimetools
685        import StringIO
686        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
687        _noheaders.fp.close()   # Recycle file descriptor
688    return _noheaders
689
690
691# Utility classes
692
693class ftpwrapper:
694    """Class used by open_ftp() for cache of open FTP connections."""
695
696    def __init__(self, user, passwd, host, port, dirs):
697        self.user = user
698        self.passwd = passwd
699        self.host = host
700        self.port = port
701        self.dirs = dirs
702        self.init()
703
704    def init(self):
705        import ftplib
706        self.busy = 0
707        self.ftp = ftplib.FTP()
708        self.ftp.connect(self.host, self.port)
709        self.ftp.login(self.user, self.passwd)
710        for dir in self.dirs:
711            self.ftp.cwd(dir)
712
713    def retrfile(self, file, type):
714        import ftplib
715        self.endtransfer()
716        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
717        else: cmd = 'TYPE ' + type; isdir = 0
718        try:
719            self.ftp.voidcmd(cmd)
720        except ftplib.all_errors:
721            self.init()
722            self.ftp.voidcmd(cmd)
723        conn = None
724        if file and not isdir:
725            # Use nlst to see if the file exists at all
726            try:
727                self.ftp.nlst(file)
728            except ftplib.error_perm, reason:
729                raise IOError, ('ftp error', reason), sys.exc_info()[2]
730            # Restore the transfer mode!
731            self.ftp.voidcmd(cmd)
732            # Try to retrieve as a file
733            try:
734                cmd = 'RETR ' + file
735                conn = self.ftp.ntransfercmd(cmd)
736            except ftplib.error_perm, reason:
737                if str(reason)[:3] != '550':
738                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
739        if not conn:
740            # Set transfer mode to ASCII!
741            self.ftp.voidcmd('TYPE A')
742            # Try a directory listing
743            if file: cmd = 'LIST ' + file
744            else: cmd = 'LIST'
745            conn = self.ftp.ntransfercmd(cmd)
746        self.busy = 1
747        # Pass back both a suitably decorated object and a retrieval length
748        return (addclosehook(conn[0].makefile('rb'),
749                             self.endtransfer), conn[1])
750    def endtransfer(self):
751        if not self.busy:
752            return
753        self.busy = 0
754        try:
755            self.ftp.voidresp()
756        except ftperrors():
757            pass
758
759    def close(self):
760        self.endtransfer()
761        try:
762            self.ftp.close()
763        except ftperrors():
764            pass
765
766class addbase:
767    """Base class for addinfo and addclosehook."""
768
769    def __init__(self, fp):
770        self.fp = fp
771        self.read = self.fp.read
772        self.readline = self.fp.readline
773        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
774        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno
775
776    def __repr__(self):
777        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
778                                             `id(self)`, `self.fp`)
779
780    def close(self):
781        self.read = None
782        self.readline = None
783        self.readlines = None
784        self.fileno = None
785        if self.fp: self.fp.close()
786        self.fp = None
787
788class addclosehook(addbase):
789    """Class to add a close hook to an open file."""
790
791    def __init__(self, fp, closehook, *hookargs):
792        addbase.__init__(self, fp)
793        self.closehook = closehook
794        self.hookargs = hookargs
795
796    def close(self):
797        addbase.close(self)
798        if self.closehook:
799            apply(self.closehook, self.hookargs)
800            self.closehook = None
801            self.hookargs = None
802
803class addinfo(addbase):
804    """class to add an info() method to an open file."""
805
806    def __init__(self, fp, headers):
807        addbase.__init__(self, fp)
808        self.headers = headers
809
810    def info(self):
811        return self.headers
812
813class addinfourl(addbase):
814    """class to add info() and geturl() methods to an open file."""
815
816    def __init__(self, fp, headers, url):
817        addbase.__init__(self, fp)
818        self.headers = headers
819        self.url = url
820
821    def info(self):
822        return self.headers
823
824    def geturl(self):
825        return self.url
826
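# Illustrative sketch (not part of the original module): the add* wrappers
# above are what the open_*() methods return to callers.  Any file-like object
# can be dressed up to look like a urlopen() result (the values below are
# invented):
#
#     import StringIO
#     f = addinfourl(StringIO.StringIO('hello'), noheaders(), 'test:dummy')
#     f.read()      # -> 'hello'
#     f.info()      # -> the (empty) mimetools.Message from noheaders()
#     f.geturl()    # -> 'test:dummy'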
827
828def basejoin(base, url):
829    """Utility to combine a URL with a base URL to form a new URL."""
830    type, path = splittype(url)
831    if type:
832        # if url is complete (i.e., it contains a type), return it
833        return url
834    host, path = splithost(path)
835    type, basepath = splittype(base) # inherit type from base
836    if host:
837        # if url contains host, just inherit type
838        if type: return type + '://' + host + path
839        else:
840            # no type inherited, so url must have started with //
841            # just return it
842            return url
843    host, basepath = splithost(basepath) # inherit host
844    basepath, basetag = splittag(basepath) # remove extraneous cruft
845    basepath, basequery = splitquery(basepath) # idem
846    if path[:1] != '/':
847        # non-absolute path name
848        if path[:1] in ('#', '?'):
849            # path is just a tag or query, attach to basepath
850            i = len(basepath)
851        else:
852            # else replace last component
853            i = basepath.rfind('/')
854        if i < 0:
855            # basepath not absolute
856            if host:
857                # host present, make absolute
858                basepath = '/'
859            else:
860                # else keep non-absolute
861                basepath = ''
862        else:
863            # remove last file component
864            basepath = basepath[:i+1]
865        # Interpret ../ (important because of symlinks)
866        while basepath and path[:3] == '../':
867            path = path[3:]
868            i = basepath[:-1].rfind('/')
869            if i > 0:
870                basepath = basepath[:i+1]
871            elif i == 0:
872                basepath = '/'
873                break
874            else:
875                basepath = ''
876
877        path = basepath + path
878    if host and path and path[0] != '/':
879        path = '/' + path
880    if type and host: return type + '://' + host + path
881    elif type: return type + ':' + path
882    elif host: return '//' + host + path # don't know what this means
883    else: return path
884
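# Some illustrative cases for basejoin() above (the host names are
# placeholders):
#
#     basejoin('http://www.example.com/a/b.html', 'c.html')
#         --> 'http://www.example.com/a/c.html'
#     basejoin('http://www.example.com/a/b.html', '../c.html')
#         --> 'http://www.example.com/c.html'
#     basejoin('http://www.example.com/a/b.html', '#frag')
#         --> 'http://www.example.com/a/b.html#frag'
#     basejoin('http://www.example.com/a/b.html', 'ftp://ftp.example.com/x')
#         --> 'ftp://ftp.example.com/x'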
885
886# Utilities to parse URLs (most of these return None for missing parts):
887# unwrap('<URL:type://host/path>') --> 'type://host/path'
888# splittype('type:opaquestring') --> 'type', 'opaquestring'
889# splithost('//host[:port]/path') --> 'host[:port]', '/path'
890# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
891# splitpasswd('user:passwd') -> 'user', 'passwd'
892# splitport('host:port') --> 'host', 'port'
893# splitquery('/path?query') --> '/path', 'query'
894# splittag('/path#tag') --> '/path', 'tag'
895# splitattr('/path;attr1=value1;attr2=value2;...') ->
896#   '/path', ['attr1=value1', 'attr2=value2', ...]
897# splitvalue('attr=value') --> 'attr', 'value'
898# splitgophertype('/Xselector') --> 'X', 'selector'
899# unquote('abc%20def') -> 'abc def'
900# quote('abc def') -> 'abc%20def'
901
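# An illustrative walk-through of the helpers below (every value is a
# placeholder):
#
#     url = 'http://joe:secret@www.example.com:8080/docs/index.html;type=a?x=1#top'
#     splittype(url)  --> 'http', '//joe:secret@www.example.com:8080/docs/index.html;type=a?x=1#top'
#     splithost('//joe:secret@www.example.com:8080/docs/index.html;type=a?x=1#top')
#                     --> 'joe:secret@www.example.com:8080', '/docs/index.html;type=a?x=1#top'
#     splituser('joe:secret@www.example.com:8080') --> 'joe:secret', 'www.example.com:8080'
#     splitpasswd('joe:secret') --> 'joe', 'secret'
#     splitport('www.example.com:8080') --> 'www.example.com', '8080'
#     splittag('/docs/index.html;type=a?x=1#top') --> '/docs/index.html;type=a?x=1', 'top'
#     splitquery('/docs/index.html;type=a?x=1') --> '/docs/index.html;type=a', 'x=1'
#     splitattr('/docs/index.html;type=a') --> '/docs/index.html', ['type=a']
#     splitvalue('type=a') --> 'type', 'a'
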
902def toBytes(url):
903    """toBytes(u"URL") --> 'URL'."""
904    # Most URL schemes require ASCII. If that changes, the conversion
905    # can be relaxed
906    if type(url) is types.UnicodeType:
907        try:
908            url = url.encode("ASCII")
909        except UnicodeError:
910            raise UnicodeError("URL " + repr(url) +
911                               " contains non-ASCII characters")
912    return url
913
914def unwrap(url):
915    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
916    url = url.strip()
917    if url[:1] == '<' and url[-1:] == '>':
918        url = url[1:-1].strip()
919    if url[:4] == 'URL:': url = url[4:].strip()
920    return url
921
922_typeprog = None
923def splittype(url):
924    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
925    global _typeprog
926    if _typeprog is None:
927        import re
928        _typeprog = re.compile('^([^/:]+):')
929
930    match = _typeprog.match(url)
931    if match:
932        scheme = match.group(1)
933        return scheme.lower(), url[len(scheme) + 1:]
934    return None, url
935
936_hostprog = None
937def splithost(url):
938    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
939    global _hostprog
940    if _hostprog is None:
941        import re
942        _hostprog = re.compile('^//([^/]*)(.*)$')
943
944    match = _hostprog.match(url)
945    if match: return match.group(1, 2)
946    return None, url
947
948_userprog = None
949def splituser(host):
950    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
951    global _userprog
952    if _userprog is None:
953        import re
954        _userprog = re.compile('^([^@]*)@(.*)$')
955
956    match = _userprog.match(host)
957    if match: return map(unquote, match.group(1, 2))
958    return None, host
959
960_passwdprog = None
961def splitpasswd(user):
962    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
963    global _passwdprog
964    if _passwdprog is None:
965        import re
966        _passwdprog = re.compile('^([^:]*):(.*)$')
967
968    match = _passwdprog.match(user)
969    if match: return match.group(1, 2)
970    return user, None
971
972# splittag('/path#tag') --> '/path', 'tag'
973_portprog = None
974def splitport(host):
975    """splitport('host:port') --> 'host', 'port'."""
976    global _portprog
977    if _portprog is None:
978        import re
979        _portprog = re.compile('^(.*):([0-9]+)$')
980
981    match = _portprog.match(host)
982    if match: return match.group(1, 2)
983    return host, None
984
985_nportprog = None
986def splitnport(host, defport=-1):
987    """Split host and port, returning numeric port.
988    Return given default port if no ':' found; defaults to -1.
989    Return numerical port if a valid number is found after ':'.
990    Return None if ':' but not a valid number."""
991    global _nportprog
992    if _nportprog is None:
993        import re
994        _nportprog = re.compile('^(.*):(.*)$')
995
996    match = _nportprog.match(host)
997    if match:
998        host, port = match.group(1, 2)
999        try:
1000            if not port: raise ValueError, "no digits"
1001            nport = int(port)
1002        except ValueError:
1003            nport = None
1004        return host, nport
1005    return host, defport
1006
1007_queryprog = None
1008def splitquery(url):
1009    """splitquery('/path?query') --> '/path', 'query'."""
1010    global _queryprog
1011    if _queryprog is None:
1012        import re
1013        _queryprog = re.compile('^(.*)\?([^?]*)$')
1014
1015    match = _queryprog.match(url)
1016    if match: return match.group(1, 2)
1017    return url, None
1018
1019_tagprog = None
1020def splittag(url):
1021    """splittag('/path#tag') --> '/path', 'tag'."""
1022    global _tagprog
1023    if _tagprog is None:
1024        import re
1025        _tagprog = re.compile('^(.*)#([^#]*)$')
1026
1027    match = _tagprog.match(url)
1028    if match: return match.group(1, 2)
1029    return url, None
1030
1031def splitattr(url):
1032    """splitattr('/path;attr1=value1;attr2=value2;...') ->
1033        '/path', ['attr1=value1', 'attr2=value2', ...]."""
1034    words = url.split(';')
1035    return words[0], words[1:]
1036
1037_valueprog = None
1038def splitvalue(attr):
1039    """splitvalue('attr=value') --> 'attr', 'value'."""
1040    global _valueprog
1041    if _valueprog is None:
1042        import re
1043        _valueprog = re.compile('^([^=]*)=(.*)$')
1044
1045    match = _valueprog.match(attr)
1046    if match: return match.group(1, 2)
1047    return attr, None
1048
1049def splitgophertype(selector):
1050    """splitgophertype('/Xselector') --> 'X', 'selector'."""
1051    if selector[:1] == '/' and selector[1:2]:
1052        return selector[1], selector[2:]
1053    return None, selector
1054
1055def unquote(s):
1056    """unquote('abc%20def') -> 'abc def'."""
1057    mychr = chr
1058    myatoi = int
1059    list = s.split('%')
1060    res = [list[0]]
1061    myappend = res.append
1062    del list[0]
1063    for item in list:
1064        if item[1:2]:
1065            try:
1066                myappend(mychr(myatoi(item[:2], 16))
1067                     + item[2:])
1068            except ValueError:
1069                myappend('%' + item)
1070        else:
1071            myappend('%' + item)
1072    return "".join(res)
1073
1074def unquote_plus(s):
1075    """unquote_plus('%7e/abc+def') -> '~/abc def'."""
1076    if '+' in s:
1077        # replace '+' with ' '
1078        s = ' '.join(s.split('+'))
1079    return unquote(s)
1080
1081always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1082               'abcdefghijklmnopqrstuvwxyz'
1083               '0123456789' '_.-')
1084
1085_fast_safe_test = always_safe + '/'
1086_fast_safe = None
1087
1088def _fast_quote(s):
1089    global _fast_safe
1090    if _fast_safe is None:
1091        _fast_safe = {}
1092        for c in _fast_safe_test:
1093            _fast_safe[c] = c
1094    res = list(s)
1095    for i in range(len(res)):
1096        c = res[i]
1097        if not _fast_safe.has_key(c):
1098            res[i] = '%%%02X' % ord(c)
1099    return ''.join(res)
1100
1101def quote(s, safe = '/'):
1102    """quote('abc def') -> 'abc%20def'
1103
1104    Each part of a URL, e.g. the path info, the query, etc., has a
1105    different set of reserved characters that must be quoted.
1106
1107    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1108    the following reserved characters.
1109
1110    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1111                  "$" | ","
1112
1113    Each of these characters is reserved in some component of a URL,
1114    but not necessarily in all of them.
1115
1116    By default, the quote function is intended for quoting the path
1117    section of a URL.  Thus, it will not encode '/'.  This character
1118    is reserved, but in typical usage the quote function is being
1119    called on a path where the existing slash characters are used as
1120    reserved characters.
1121    """
1122    safe = always_safe + safe
1123    if _fast_safe_test == safe:
1124        return _fast_quote(s)
1125    res = list(s)
1126    for i in range(len(res)):
1127        c = res[i]
1128        if c not in safe:
1129            res[i] = '%%%02X' % ord(c)
1130    return ''.join(res)
1131
1132def quote_plus(s, safe = ''):
1133    """Quote the query fragment of a URL, replacing ' ' with '+'."""
1134    if ' ' in s:
1135        l = s.split(' ')
1136        for i in range(len(l)):
1137            l[i] = quote(l[i], safe)
1138        return '+'.join(l)
1139    else:
1140        return quote(s, safe)
1141
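# A few illustrative values for quote(), quote_plus() and unquote() above
# (for documentation only):
#
#     quote('abc def/ghi')          --> 'abc%20def/ghi'    ('/' is safe by default)
#     quote('/~user name', safe='') --> '%2F%7Euser%20name'
#     quote_plus('a b&c')           --> 'a+b%26c'
#     unquote_plus('a+b%26c')       --> 'a b&c'
#     unquote('abc%20def%XY')       --> 'abc def%XY'       (bad escapes are left alone)
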
1142def urlencode(query,doseq=0):
1143    """Encode a sequence of two-element tuples or a dictionary into a URL query string.
1144
1145    If any values in the query arg are sequences and doseq is true, each
1146    sequence element is converted to a separate parameter.
1147
1148    If the query arg is a sequence of two-element tuples, the order of the
1149    parameters in the output will match the order of parameters in the
1150    input.
1151    """
1152
1153    if hasattr(query,"items"):
1154        # mapping objects
1155        query = query.items()
1156    else:
1157        # it's a bother at times that strings and string-like objects are
1158        # sequences...
1159        try:
1160            # non-sequence items should not work with len()
1161            x = len(query)
1162            # non-empty strings will fail this
1163            if len(query) and type(query[0]) != types.TupleType:
1164                raise TypeError
1165            # zero-length sequences of all types will get here and succeed,
1166            # but that's a minor nit - since the original implementation
1167            # allowed empty dicts that type of behavior probably should be
1168            # preserved for consistency
1169        except TypeError:
1170            ty,va,tb = sys.exc_info()
1171            raise TypeError, "not a valid non-string sequence or mapping object", tb
1172
1173    l = []
1174    if not doseq:
1175        # preserve old behavior
1176        for k, v in query:
1177            k = quote_plus(str(k))
1178            v = quote_plus(str(v))
1179            l.append(k + '=' + v)
1180    else:
1181        for k, v in query:
1182            k = quote_plus(str(k))
1183            if type(v) == types.StringType:
1184                v = quote_plus(v)
1185                l.append(k + '=' + v)
1186            elif type(v) == types.UnicodeType:
1187                # is there a reasonable way to convert to ASCII?
1188                # encode generates a string, but "replace" or "ignore"
1189                # lose information and "strict" can raise UnicodeError
1190                v = quote_plus(v.encode("ASCII","replace"))
1191                l.append(k + '=' + v)
1192            else:
1193                try:
1194                    # is this a sufficient test for sequence-ness?
1195                    x = len(v)
1196                except TypeError:
1197                    # not a sequence
1198                    v = quote_plus(str(v))
1199                    l.append(k + '=' + v)
1200                else:
1201                    # loop over the sequence
1202                    for elt in v:
1203                        l.append(k + '=' + quote_plus(str(elt)))
1204    return '&'.join(l)
1205
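# Illustrative examples for urlencode() (the data is made up):
#
#     urlencode([('q', 'python urllib'), ('page', 2)]) --> 'q=python+urllib&page=2'
#     urlencode({'q': 'python urllib'})                --> 'q=python+urllib'
#     urlencode([('tag', ['a', 'b c'])], doseq=1)      --> 'tag=a&tag=b+c'
#     # without doseq the sequence is str()'d and quoted as a single value:
#     urlencode([('tag', ['a', 'b c'])])               --> 'tag=%5B%27a%27%2C+%27b+c%27%5D'
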
1206# Proxy handling
1207def getproxies_environment():
1208    """Return a dictionary of scheme -> proxy server URL mappings.
1209
1210    Scan the environment for variables named <scheme>_proxy;
1211    this seems to be the standard convention.  If you need a
1212    different way, you can pass a proxies dictionary to the
1213    [Fancy]URLopener constructor.
1214
1215    """
1216    proxies = {}
1217    for name, value in os.environ.items():
1218        name = name.lower()
1219        if value and name[-6:] == '_proxy':
1220            proxies[name[:-6]] = value
1221    return proxies
1222
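# For example (hypothetical settings), with http_proxy and ftp_proxy both set
# to 'http://proxy.example.com:3128/' in the environment,
# getproxies_environment() returns:
#
#     {'http': 'http://proxy.example.com:3128/',
#      'ftp': 'http://proxy.example.com:3128/'}
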
1223if os.name == 'mac':
1224    def getproxies():
1225        """Return a dictionary of scheme -> proxy server URL mappings.
1226
1227        By convention the mac uses Internet Config to store
1228        proxies.  An HTTP proxy, for instance, is stored under
1229        the HttpProxy key.
1230
1231        """
1232        try:
1233            import ic
1234        except ImportError:
1235            return {}
1236
1237        try:
1238            config = ic.IC()
1239        except ic.error:
1240            return {}
1241        proxies = {}
1242        # HTTP:
1243        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
1244            try:
1245                value = config['HTTPProxyHost']
1246            except ic.error:
1247                pass
1248            else:
1249                proxies['http'] = 'http://%s' % value
1250        # FTP: XXXX To be done.
1251        # Gopher: XXXX To be done.
1252        return proxies
1253
1254    def proxy_bypass(x):
1255        return 0
1256
1257elif os.name == 'nt':
1258    def getproxies_registry():
1259        """Return a dictionary of scheme -> proxy server URL mappings.
1260
1261        Win32 uses the registry to store proxies.
1262
1263        """
1264        proxies = {}
1265        try:
1266            import _winreg
1267        except ImportError:
1268            # Std module, so should be around - but you never know!
1269            return proxies
1270        try:
1271            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1272                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1273            proxyEnable = _winreg.QueryValueEx(internetSettings,
1274                                               'ProxyEnable')[0]
1275            if proxyEnable:
1276                # Returned as Unicode, but causes problems if not converted to ASCII
1277                proxyServer = str(_winreg.QueryValueEx(internetSettings,
1278                                                       'ProxyServer')[0])
1279                if '=' in proxyServer:
1280                    # Per-protocol settings
1281                    for p in proxyServer.split(';'):
1282                        protocol, address = p.split('=', 1)
1283                        proxies[protocol] = '%s://%s' % (protocol, address)
1284                else:
1285                    # Use one setting for all protocols
1286                    if proxyServer[:5] == 'http:':
1287                        proxies['http'] = proxyServer
1288                    else:
1289                        proxies['http'] = 'http://%s' % proxyServer
1290                        proxies['ftp'] = 'ftp://%s' % proxyServer
1291            internetSettings.Close()
1292        except (WindowsError, ValueError, TypeError):
1293            # Either registry key not found etc, or the value in an
1294            # unexpected format.
1295            # proxies already set up to be empty so nothing to do
1296            pass
1297        return proxies
1298
1299    def getproxies():
1300        """Return a dictionary of scheme -> proxy server URL mappings.
1301
1302        Returns settings gathered from the environment, if specified,
1303        or the registry.
1304
1305        """
1306        return getproxies_environment() or getproxies_registry()
1307
1308    def proxy_bypass(host):
1309        try:
1310            import _winreg
1311            import re
1312            import socket
1313        except ImportError:
1314            # Std modules, so should be around - but you never know!
1315            return 0
1316        try:
1317            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1318                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1319            proxyEnable = _winreg.QueryValueEx(internetSettings,
1320                                               'ProxyEnable')[0]
1321            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1322                                                     'ProxyOverride')[0])
1323            # ^^^^ Returned as Unicode, but causes problems if not converted to ASCII
1324        except WindowsError:
1325            return 0
1326        if not proxyEnable or not proxyOverride:
1327            return 0
1328        # try to make a host list from name and IP address.
1329        host = [host]
1330        try:
1331            addr = socket.gethostbyname(host[0])
1332            if addr != host[0]:
1333                host.append(addr)
1334        except socket.error:
1335            pass
1336        # make a check value list from the registry entry: replace the
1337        # '<local>' string by the localhost entry and the corresponding
1338        # canonical entry.
1339        proxyOverride = proxyOverride.split(';')
1340        i = 0
1341        while i < len(proxyOverride):
1342            if proxyOverride[i] == '<local>':
1343                proxyOverride[i:i+1] = ['localhost',
1344                                        '127.0.0.1',
1345                                        socket.gethostname(),
1346                                        socket.gethostbyname(
1347                                            socket.gethostname())]
1348            i += 1
1349        # print proxyOverride
1350        # now check if we match one of the registry values.
1351        for test in proxyOverride:
1352            test = test.replace(".", r"\.")     # mask dots
1353            test = test.replace("*", r".*")     # change glob sequence
1354            test = test.replace("?", r".")      # change glob char
1355            for val in host:
1356                # print "%s <--> %s" %( test, val )
1357                if re.match(test, val, re.I):
1358                    return 1
1359        return 0
1360
1361else:
1362    # By default use environment variables
1363    getproxies = getproxies_environment
1364
1365    def proxy_bypass(host):
1366        return 0
1367
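# Note (illustrative): automatic proxy detection can always be overridden by
# passing an explicit mapping to the opener; the proxy URL below is a
# placeholder.
#
#     opener = FancyURLopener(proxies={'http': 'http://proxy.example.com:3128/'})
#     direct = FancyURLopener(proxies={})     # bypass proxies entirely
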
1368# Test and time quote() and unquote()
1369def test1():
1370    import time
1371    s = ''
1372    for i in range(256): s = s + chr(i)
1373    s = s*4
1374    t0 = time.time()
1375    qs = quote(s)
1376    uqs = unquote(qs)
1377    t1 = time.time()
1378    if uqs != s:
1379        print 'Wrong!'
1380    print `s`
1381    print `qs`
1382    print `uqs`
1383    print round(t1 - t0, 3), 'sec'
1384
1385
1386def reporthook(blocknum, blocksize, totalsize):
1387    # Report during remote transfers
1388    print "Block number: %d, Block size: %d, Total size: %d" % (
1389        blocknum, blocksize, totalsize)
1390
1391# Test program
1392def test(args=[]):
1393    if not args:
1394        args = [
1395            '/etc/passwd',
1396            'file:/etc/passwd',
1397            'file://localhost/etc/passwd',
1398            'ftp://ftp.python.org/etc/passwd',
1399##          'gopher://gopher.micro.umn.edu/1/',
1400            'http://www.python.org/index.html',
1401            ]
1402        if hasattr(URLopener, "open_https"):
1403            args.append('https://synergy.as.cmu.edu/~geek/')
1404    try:
1405        for url in args:
1406            print '-'*10, url, '-'*10
1407            fn, h = urlretrieve(url, None, reporthook)
1408            print fn
1409            if h:
1410                print '======'
1411                for k in h.keys(): print k + ':', h[k]
1412                print '======'
1413            fp = open(fn, 'rb')
1414            data = fp.read()
1415            del fp
1416            if '\r' in data:
1417                table = string.maketrans("", "")
1418                data = data.translate(table, "\r")
1419            print data
1420            fn, h = None, None
1421        print '-'*40
1422    finally:
1423        urlcleanup()
1424
1425def main():
1426    import getopt, sys
1427    try:
1428        opts, args = getopt.getopt(sys.argv[1:], "th")
1429    except getopt.error, msg:
1430        print msg
1431        print "Use -h for help"
1432        return
1433    t = 0
1434    for o, a in opts:
1435        if o == '-t':
1436            t = t + 1
1437        if o == '-h':
1438            print "Usage: python urllib.py [-t] [url ...]"
1439            print "-t runs self-test;",
1440            print "otherwise, contents of urls are printed"
1441            return
1442    if t:
1443        if t > 1:
1444            test1()
1445        test(args)
1446    else:
1447        if not args:
1448            print "Use -h for help"
1449        for url in args:
1450            print urlopen(url).read(),
1451
1452# Run test program when run as a script
1453if __name__ == '__main__':
1454    main()
1455