# urllib.py revision 54f0222547b1e92cd018ef132307a6f793dc9505
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
24
25import string
26import socket
27import os
28import time
29import sys
30import types
31
32__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
34           "urlencode", "url2pathname", "pathname2url", "splittag",
35           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37           "splitnport", "splitquery", "splitattr", "splitvalue",
38           "splitgophertype", "getproxies"]
39
40__version__ = '1.15'    # XXX This version is not always updated :-(
41
42MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
43
# Select the url2pathname()/pathname2url() pair for the host platform.
# Three platforms ship a dedicated conversion module.
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    # Every other platform: the conversion is plain (un)quoting.
    def url2pathname(pathname):
        """Return pathname with URL %-escapes decoded via unquote()."""
        decoded = unquote(pathname)
        return decoded

    def pathname2url(pathname):
        """Return pathname with unsafe characters escaped via quote()."""
        encoded = quote(pathname)
        return encoded
56
57# This really consists of two pieces:
58# (1) a class which handles opening of all sorts of URLs
59#     (plus assorted utilities etc.)
60# (2) a set of functions for parsing URLs
61# XXX Should these be separated out into different modules?
62
63
64# Shortcut for basic usage
# Opener shared by urlopen(), urlretrieve() and urlcleanup(); created
# lazily on first use.
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    Open url with a FancyURLopener and return the result of its open()
    method.

    url     -- the URL to open.
    data    -- when not None, passed to open() as its second argument.
    proxies -- when not None, a mapping handed to a fresh
               FancyURLopener that is used for this call only; the
               shared module-level opener is neither used nor replaced.
    """
    global _urlopener
    if proxies is not None:
        # Pass the mapping positionally so the call works whether or
        # not the opener's constructor accepts keyword arguments.
        opener = FancyURLopener(proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Call retrieve() on the shared module-level opener.

    The shared FancyURLopener is created on first use.  All four
    arguments are handed to its retrieve() method unchanged and that
    method's result is returned."""
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared module-level opener, if one exists."""
    if not _urlopener:
        # Nothing has been opened through the shortcuts yet.
        return
    _urlopener.cleanup()
88
89
90ftpcache = {}
91class URLopener:
92    """Class to open URLs.
93    This is a class rather than just a subroutine because we may need
94    more than one set of global protocol-specific options.
95    Note -- this is a base class for those who don't want the
96    automatic handling of errors type 302 (relocated) and 401
97    (authorization needed)."""
98
99    __tempfiles = None
100
101    version = "Python-urllib/%s" % __version__
102
103    # Constructor
104    def __init__(self, proxies=None, **x509):
105        if proxies is None:
106            proxies = getproxies()
107        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
108        self.proxies = proxies
109        self.key_file = x509.get('key_file')
110        self.cert_file = x509.get('cert_file')
111        self.addheaders = [('User-agent', self.version)]
112        self.__tempfiles = []
113        self.__unlink = os.unlink # See cleanup()
114        self.tempcache = None
115        # Undocumented feature: if you assign {} to tempcache,
116        # it is used to cache files retrieved with
117        # self.retrieve().  This is not enabled by default
118        # since it does not work for changing documents (and I
119        # haven't got the logic to check expiration headers
120        # yet).
121        self.ftpcache = ftpcache
122        # Undocumented feature: you can use a different
123        # ftp cache by assigning to the .ftpcache member;
124        # in case you want logically independent URL openers
125        # XXX This is not threadsafe.  Bah.
126
127    def __del__(self):
128        self.close()
129
130    def close(self):
131        self.cleanup()
132
133    def cleanup(self):
134        # This code sometimes runs when the rest of this module
135        # has already been deleted, so it can't use any globals
136        # or import anything.
137        if self.__tempfiles:
138            for file in self.__tempfiles:
139                try:
140                    self.__unlink(file)
141                except OSError:
142                    pass
143            del self.__tempfiles[:]
144        if self.tempcache:
145            self.tempcache.clear()
146
147    def addheader(self, *args):
148        """Add a header to be used by the HTTP interface only
149        e.g. u.addheader('Accept', 'sound/basic')"""
150        self.addheaders.append(args)
151
152    # External interface
153    def open(self, fullurl, data=None):
154        """Use URLopener().open(file) instead of open(file, 'r')."""
155        fullurl = unwrap(toBytes(fullurl))
156        if self.tempcache and fullurl in self.tempcache:
157            filename, headers = self.tempcache[fullurl]
158            fp = open(filename, 'rb')
159            return addinfourl(fp, headers, fullurl)
160        urltype, url = splittype(fullurl)
161        if not urltype:
162            urltype = 'file'
163        if urltype in self.proxies:
164            proxy = self.proxies[urltype]
165            urltype, proxyhost = splittype(proxy)
166            host, selector = splithost(proxyhost)
167            url = (host, fullurl) # Signal special case to open_*()
168        else:
169            proxy = None
170        name = 'open_' + urltype
171        self.type = urltype
172        if '-' in name:
173            # replace - with _
174            name = '_'.join(name.split('-'))
175        if not hasattr(self, name):
176            if proxy:
177                return self.open_unknown_proxy(proxy, fullurl, data)
178            else:
179                return self.open_unknown(fullurl, data)
180        try:
181            if data is None:
182                return getattr(self, name)(url)
183            else:
184                return getattr(self, name)(url, data)
185        except socket.error, msg:
186            raise IOError, ('socket error', msg), sys.exc_info()[2]
187
188    def open_unknown(self, fullurl, data=None):
189        """Overridable interface to open unknown URL type."""
190        type, url = splittype(fullurl)
191        raise IOError, ('url error', 'unknown url type', type)
192
193    def open_unknown_proxy(self, proxy, fullurl, data=None):
194        """Overridable interface to open unknown URL type."""
195        type, url = splittype(fullurl)
196        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
197
198    # External interface
199    def retrieve(self, url, filename=None, reporthook=None, data=None):
200        """retrieve(url) returns (filename, None) for a local object
201        or (tempfilename, headers) for a remote object."""
202        url = unwrap(toBytes(url))
203        if self.tempcache and url in self.tempcache:
204            return self.tempcache[url]
205        type, url1 = splittype(url)
206        if not filename and (not type or type == 'file'):
207            try:
208                fp = self.open_local_file(url1)
209                hdrs = fp.info()
210                del fp
211                return url2pathname(splithost(url1)[1]), hdrs
212            except IOError, msg:
213                pass
214        fp = self.open(url, data)
215        headers = fp.info()
216        if not filename:
217            import tempfile
218            garbage, path = splittype(url)
219            garbage, path = splithost(path or "")
220            path, garbage = splitquery(path or "")
221            path, garbage = splitattr(path or "")
222            suffix = os.path.splitext(path)[1]
223            filename = tempfile.mktemp(suffix)
224            self.__tempfiles.append(filename)
225        result = filename, headers
226        if self.tempcache is not None:
227            self.tempcache[url] = result
228        tfp = open(filename, 'wb')
229        bs = 1024*8
230        size = -1
231        blocknum = 1
232        if reporthook:
233            if "content-length" in headers:
234                size = int(headers["Content-Length"])
235            reporthook(0, bs, size)
236        block = fp.read(bs)
237        if reporthook:
238            reporthook(1, bs, size)
239        while block:
240            tfp.write(block)
241            block = fp.read(bs)
242            blocknum = blocknum + 1
243            if reporthook:
244                reporthook(blocknum, bs, size)
245        fp.close()
246        tfp.close()
247        del fp
248        del tfp
249        return result
250
251    # Each method named open_<type> knows how to open that type of URL
252
253    def open_http(self, url, data=None):
254        """Use HTTP protocol."""
255        import httplib
256        user_passwd = None
257        if type(url) is types.StringType:
258            host, selector = splithost(url)
259            if host:
260                user_passwd, host = splituser(host)
261                host = unquote(host)
262            realhost = host
263        else:
264            host, selector = url
265            urltype, rest = splittype(selector)
266            url = rest
267            user_passwd = None
268            if urltype.lower() != 'http':
269                realhost = None
270            else:
271                realhost, rest = splithost(rest)
272                if realhost:
273                    user_passwd, realhost = splituser(realhost)
274                if user_passwd:
275                    selector = "%s://%s%s" % (urltype, realhost, rest)
276                if proxy_bypass(realhost):
277                    host = realhost
278
279            #print "proxy via http:", host, selector
280        if not host: raise IOError, ('http error', 'no host given')
281        if user_passwd:
282            import base64
283            auth = base64.encodestring(user_passwd).strip()
284        else:
285            auth = None
286        h = httplib.HTTP(host)
287        if data is not None:
288            h.putrequest('POST', selector)
289            h.putheader('Content-type', 'application/x-www-form-urlencoded')
290            h.putheader('Content-length', '%d' % len(data))
291        else:
292            h.putrequest('GET', selector)
293        if auth: h.putheader('Authorization', 'Basic %s' % auth)
294        if realhost: h.putheader('Host', realhost)
295        for args in self.addheaders: apply(h.putheader, args)
296        h.endheaders()
297        if data is not None:
298            h.send(data)
299        errcode, errmsg, headers = h.getreply()
300        fp = h.getfile()
301        if errcode == 200:
302            return addinfourl(fp, headers, "http:" + url)
303        else:
304            if data is None:
305                return self.http_error(url, fp, errcode, errmsg, headers)
306            else:
307                return self.http_error(url, fp, errcode, errmsg, headers, data)
308
309    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
310        """Handle http errors.
311        Derived class can override this, or provide specific handlers
312        named http_error_DDD where DDD is the 3-digit error code."""
313        # First check if there's a specific handler for this error
314        name = 'http_error_%d' % errcode
315        if hasattr(self, name):
316            method = getattr(self, name)
317            if data is None:
318                result = method(url, fp, errcode, errmsg, headers)
319            else:
320                result = method(url, fp, errcode, errmsg, headers, data)
321            if result: return result
322        return self.http_error_default(url, fp, errcode, errmsg, headers)
323
324    def http_error_default(self, url, fp, errcode, errmsg, headers):
325        """Default error handler: close the connection and raise IOError."""
326        void = fp.read()
327        fp.close()
328        raise IOError, ('http error', errcode, errmsg, headers)
329
330    if hasattr(socket, "ssl"):
331        def open_https(self, url, data=None):
332            """Use HTTPS protocol."""
333            import httplib
334            user_passwd = None
335            if type(url) is types.StringType:
336                host, selector = splithost(url)
337                if host:
338                    user_passwd, host = splituser(host)
339                    host = unquote(host)
340                realhost = host
341            else:
342                host, selector = url
343                urltype, rest = splittype(selector)
344                url = rest
345                user_passwd = None
346                if urltype.lower() != 'https':
347                    realhost = None
348                else:
349                    realhost, rest = splithost(rest)
350                    if realhost:
351                        user_passwd, realhost = splituser(realhost)
352                    if user_passwd:
353                        selector = "%s://%s%s" % (urltype, realhost, rest)
354                #print "proxy via https:", host, selector
355            if not host: raise IOError, ('https error', 'no host given')
356            if user_passwd:
357                import base64
358                auth = base64.encodestring(user_passwd).strip()
359            else:
360                auth = None
361            h = httplib.HTTPS(host, 0,
362                              key_file=self.key_file,
363                              cert_file=self.cert_file)
364            if data is not None:
365                h.putrequest('POST', selector)
366                h.putheader('Content-type',
367                            'application/x-www-form-urlencoded')
368                h.putheader('Content-length', '%d' % len(data))
369            else:
370                h.putrequest('GET', selector)
371            if auth: h.putheader('Authorization: Basic %s' % auth)
372            if realhost: h.putheader('Host', realhost)
373            for args in self.addheaders: apply(h.putheader, args)
374            h.endheaders()
375            if data is not None:
376                h.send(data)
377            errcode, errmsg, headers = h.getreply()
378            fp = h.getfile()
379            if errcode == 200:
380                return addinfourl(fp, headers, "https:" + url)
381            else:
382                if data is None:
383                    return self.http_error(url, fp, errcode, errmsg, headers)
384                else:
385                    return self.http_error(url, fp, errcode, errmsg, headers,
386                                           data)
387
388    def open_gopher(self, url):
389        """Use Gopher protocol."""
390        import gopherlib
391        host, selector = splithost(url)
392        if not host: raise IOError, ('gopher error', 'no host given')
393        host = unquote(host)
394        type, selector = splitgophertype(selector)
395        selector, query = splitquery(selector)
396        selector = unquote(selector)
397        if query:
398            query = unquote(query)
399            fp = gopherlib.send_query(selector, query, host)
400        else:
401            fp = gopherlib.send_selector(selector, host)
402        return addinfourl(fp, noheaders(), "gopher:" + url)
403
404    def open_file(self, url):
405        """Use local file or FTP depending on form of URL."""
406        if url[:2] == '//' and url[2:3] != '/':
407            return self.open_ftp(url)
408        else:
409            return self.open_local_file(url)
410
411    def open_local_file(self, url):
412        """Use local file."""
413        import mimetypes, mimetools, rfc822, StringIO
414        host, file = splithost(url)
415        localname = url2pathname(file)
416        try:
417            stats = os.stat(localname)
418        except OSError, e:
419            raise IOError(e.errno, e.strerror, e.filename)
420        size = stats.st_size
421        modified = rfc822.formatdate(stats.st_mtime)
422        mtype = mimetypes.guess_type(url)[0]
423        headers = mimetools.Message(StringIO.StringIO(
424            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
425            (mtype or 'text/plain', size, modified)))
426        if not host:
427            urlfile = file
428            if file[:1] == '/':
429                urlfile = 'file://' + file
430            return addinfourl(open(localname, 'rb'),
431                              headers, urlfile)
432        host, port = splitport(host)
433        if not port \
434           and socket.gethostbyname(host) in (localhost(), thishost()):
435            urlfile = file
436            if file[:1] == '/':
437                urlfile = 'file://' + file
438            return addinfourl(open(localname, 'rb'),
439                              headers, urlfile)
440        raise IOError, ('local file error', 'not on local host')
441
442    def open_ftp(self, url):
443        """Use FTP protocol."""
444        import mimetypes, mimetools, StringIO
445        host, path = splithost(url)
446        if not host: raise IOError, ('ftp error', 'no host given')
447        host, port = splitport(host)
448        user, host = splituser(host)
449        if user: user, passwd = splitpasswd(user)
450        else: passwd = None
451        host = unquote(host)
452        user = unquote(user or '')
453        passwd = unquote(passwd or '')
454        host = socket.gethostbyname(host)
455        if not port:
456            import ftplib
457            port = ftplib.FTP_PORT
458        else:
459            port = int(port)
460        path, attrs = splitattr(path)
461        path = unquote(path)
462        dirs = path.split('/')
463        dirs, file = dirs[:-1], dirs[-1]
464        if dirs and not dirs[0]: dirs = dirs[1:]
465        if dirs and not dirs[0]: dirs[0] = '/'
466        key = user, host, port, '/'.join(dirs)
467        # XXX thread unsafe!
468        if len(self.ftpcache) > MAXFTPCACHE:
469            # Prune the cache, rather arbitrarily
470            for k in self.ftpcache.keys():
471                if k != key:
472                    v = self.ftpcache[k]
473                    del self.ftpcache[k]
474                    v.close()
475        try:
476            if not key in self.ftpcache:
477                self.ftpcache[key] = \
478                    ftpwrapper(user, passwd, host, port, dirs)
479            if not file: type = 'D'
480            else: type = 'I'
481            for attr in attrs:
482                attr, value = splitvalue(attr)
483                if attr.lower() == 'type' and \
484                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
485                    type = value.upper()
486            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
487            mtype = mimetypes.guess_type("ftp:" + url)[0]
488            headers = ""
489            if mtype:
490                headers += "Content-Type: %s\n" % mtype
491            if retrlen is not None and retrlen >= 0:
492                headers += "Content-Length: %d\n" % retrlen
493            headers = mimetools.Message(StringIO.StringIO(headers))
494            return addinfourl(fp, headers, "ftp:" + url)
495        except ftperrors(), msg:
496            raise IOError, ('ftp error', msg), sys.exc_info()[2]
497
498    def open_data(self, url, data=None):
499        """Use "data" URL."""
500        # ignore POSTed data
501        #
502        # syntax of data URLs:
503        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
504        # mediatype := [ type "/" subtype ] *( ";" parameter )
505        # data      := *urlchar
506        # parameter := attribute "=" value
507        import StringIO, mimetools
508        try:
509            [type, data] = url.split(',', 1)
510        except ValueError:
511            raise IOError, ('data error', 'bad data URL')
512        if not type:
513            type = 'text/plain;charset=US-ASCII'
514        semi = type.rfind(';')
515        if semi >= 0 and '=' not in type[semi:]:
516            encoding = type[semi+1:]
517            type = type[:semi]
518        else:
519            encoding = ''
520        msg = []
521        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
522                                            time.gmtime(time.time())))
523        msg.append('Content-type: %s' % type)
524        if encoding == 'base64':
525            import base64
526            data = base64.decodestring(data)
527        else:
528            data = unquote(data)
529        msg.append('Content-length: %d' % len(data))
530        msg.append('')
531        msg.append(data)
532        msg = '\n'.join(msg)
533        f = StringIO.StringIO(msg)
534        headers = mimetools.Message(f, 0)
535        f.fileno = None     # needed for addinfourl
536        return addinfourl(f, headers, url)
537
538
539class FancyURLopener(URLopener):
540    """Derived class with handlers for errors we can handle (perhaps)."""
541
542    def __init__(self, *args):
543        apply(URLopener.__init__, (self,) + args)
544        self.auth_cache = {}
545        self.tries = 0
546        self.maxtries = 10
547
548    def http_error_default(self, url, fp, errcode, errmsg, headers):
549        """Default error handling -- don't raise an exception."""
550        return addinfourl(fp, headers, "http:" + url)
551
552    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
553        """Error 302 -- relocated (temporarily)."""
554        self.tries += 1
555        if self.maxtries and self.tries >= self.maxtries:
556            if hasattr(self, "http_error_500"):
557                meth = self.http_error_500
558            else:
559                meth = self.http_error_default
560            self.tries = 0
561            return meth(url, fp, 500,
562                        "Internal Server Error: Redirect Recursion", headers)
563        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
564                                        data)
565        self.tries = 0
566        return result
567
568    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
569        if 'location' in headers:
570            newurl = headers['location']
571        elif 'uri' in headers:
572            newurl = headers['uri']
573        else:
574            return
575        void = fp.read()
576        fp.close()
577        # In case the server sent a relative URL, join with original:
578        newurl = basejoin(self.type + ":" + url, newurl)
579        if data is None:
580            return self.open(newurl)
581        else:
582            return self.open(newurl, data)
583
584    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
585        """Error 301 -- also relocated (permanently)."""
586        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
587
588    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
589        """Error 401 -- authentication required.
590        See this URL for a description of the basic authentication scheme:
591        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
592        if not 'www-authenticate' in headers:
593            URLopener.http_error_default(self, url, fp,
594                                         errcode, errmsg, headers)
595        stuff = headers['www-authenticate']
596        import re
597        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
598        if not match:
599            URLopener.http_error_default(self, url, fp,
600                                         errcode, errmsg, headers)
601        scheme, realm = match.groups()
602        if scheme.lower() != 'basic':
603            URLopener.http_error_default(self, url, fp,
604                                         errcode, errmsg, headers)
605        name = 'retry_' + self.type + '_basic_auth'
606        if data is None:
607            return getattr(self,name)(url, realm)
608        else:
609            return getattr(self,name)(url, realm, data)
610
611    def retry_http_basic_auth(self, url, realm, data=None):
612        host, selector = splithost(url)
613        i = host.find('@') + 1
614        host = host[i:]
615        user, passwd = self.get_user_passwd(host, realm, i)
616        if not (user or passwd): return None
617        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
618        newurl = 'http://' + host + selector
619        if data is None:
620            return self.open(newurl)
621        else:
622            return self.open(newurl, data)
623
624    def retry_https_basic_auth(self, url, realm, data=None):
625        host, selector = splithost(url)
626        i = host.find('@') + 1
627        host = host[i:]
628        user, passwd = self.get_user_passwd(host, realm, i)
629        if not (user or passwd): return None
630        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
631        newurl = '//' + host + selector
632        return self.open_https(newurl, data)
633
634    def get_user_passwd(self, host, realm, clear_cache = 0):
635        key = realm + '@' + host.lower()
636        if key in self.auth_cache:
637            if clear_cache:
638                del self.auth_cache[key]
639            else:
640                return self.auth_cache[key]
641        user, passwd = self.prompt_user_passwd(host, realm)
642        if user or passwd: self.auth_cache[key] = (user, passwd)
643        return user, passwd
644
645    def prompt_user_passwd(self, host, realm):
646        """Override this in a GUI environment!"""
647        import getpass
648        try:
649            user = raw_input("Enter username for %s at %s: " % (realm,
650                                                                host))
651            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
652                (user, realm, host))
653            return user, passwd
654        except KeyboardInterrupt:
655            print
656            return None, None
657
658
659# Utility functions
660
# Result of the first lookup, reused by later calls.
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
668
# Result of the first lookup, reused by later calls.
_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost:
        return _thishost
    own_name = socket.gethostname()
    _thishost = socket.gethostbyname(own_name)
    return _thishost
676
# ftplib.all_errors, fetched on first use so ftplib is imported lazily.
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
685
686_noheaders = None
687def noheaders():
688    """Return an empty mimetools.Message object."""
689    global _noheaders
690    if not _noheaders:
691        import mimetools
692        import StringIO
693        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
694        _noheaders.fp.close()   # Recycle file descriptor
695    return _noheaders
696
697
698# Utility classes
699
700class ftpwrapper:
701    """Class used by open_ftp() for cache of open FTP connections."""
702
703    def __init__(self, user, passwd, host, port, dirs):
704        self.user = user
705        self.passwd = passwd
706        self.host = host
707        self.port = port
708        self.dirs = dirs
709        self.init()
710
711    def init(self):
712        import ftplib
713        self.busy = 0
714        self.ftp = ftplib.FTP()
715        self.ftp.connect(self.host, self.port)
716        self.ftp.login(self.user, self.passwd)
717        for dir in self.dirs:
718            self.ftp.cwd(dir)
719
720    def retrfile(self, file, type):
721        import ftplib
722        self.endtransfer()
723        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
724        else: cmd = 'TYPE ' + type; isdir = 0
725        try:
726            self.ftp.voidcmd(cmd)
727        except ftplib.all_errors:
728            self.init()
729            self.ftp.voidcmd(cmd)
730        conn = None
731        if file and not isdir:
732            # Use nlst to see if the file exists at all
733            try:
734                self.ftp.nlst(file)
735            except ftplib.error_perm, reason:
736                raise IOError, ('ftp error', reason), sys.exc_info()[2]
737            # Restore the transfer mode!
738            self.ftp.voidcmd(cmd)
739            # Try to retrieve as a file
740            try:
741                cmd = 'RETR ' + file
742                conn = self.ftp.ntransfercmd(cmd)
743            except ftplib.error_perm, reason:
744                if str(reason)[:3] != '550':
745                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
746        if not conn:
747            # Set transfer mode to ASCII!
748            self.ftp.voidcmd('TYPE A')
749            # Try a directory listing
750            if file: cmd = 'LIST ' + file
751            else: cmd = 'LIST'
752            conn = self.ftp.ntransfercmd(cmd)
753        self.busy = 1
754        # Pass back both a suitably decorated object and a retrieval length
755        return (addclosehook(conn[0].makefile('rb'),
756                             self.endtransfer), conn[1])
757    def endtransfer(self):
758        if not self.busy:
759            return
760        self.busy = 0
761        try:
762            self.ftp.voidresp()
763        except ftperrors():
764            pass
765
766    def close(self):
767        self.endtransfer()
768        try:
769            self.ftp.close()
770        except ftperrors():
771            pass
772
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its read(), readline() and,
    when the object has them, readlines() and fileno()."""

    def __init__(self, fp):
        """Wrap fp and bind its reading methods onto this instance."""
        self.fp = fp
        self.read = fp.read
        self.readline = fp.readline
        # These two are copied only when the wrapped object has them.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             repr(id(self)), repr(self.fp))

    def close(self):
        """Drop the re-exported methods and close the wrapped object."""
        self.read = self.readline = self.readlines = self.fileno = None
        wrapped = self.fp
        if wrapped:
            wrapped.close()
        self.fp = None
794
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    The hook is called, with the extra arguments given to the
    constructor, the first time close() is called."""

    def __init__(self, fp, closehook, *hookargs):
        """Wrap fp and remember closehook together with its arguments."""
        addbase.__init__(self, fp)
        self.hookargs = hookargs
        self.closehook = closehook

    def close(self):
        """Close the wrapped file, then run the hook once."""
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            # Forget the hook so a second close() does not run it again.
            self.closehook = None
            self.hookargs = None
809
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        # fp is handed to addbase; headers is kept for info().
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
819
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        # fp is handed to addbase; headers and url are kept for the
        # info() and geturl() accessors.
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the url passed to the constructor."""
        return self.url
833
834
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    A url that already carries a type is returned unchanged.  A url that
    carries a host only inherits the type of base.  Otherwise type and
    host come from base, and url's path is resolved against base's path
    (with any tag and query stripped from the base path first).
    """
    scheme, path = splittype(url)
    if scheme:
        # url is complete (i.e., it contains a type): nothing to inherit
        return url
    host, path = splithost(path)
    scheme, basepath = splittype(base) # inherit type from base
    if host:
        # url names its own host, so only the type can be inherited
        if not scheme:
            # no type to inherit, so url must have started with //
            return url
        return scheme + '://' + host + path
    host, basepath = splithost(basepath) # inherit host
    basepath = splittag(basepath)[0] # remove extraneous cruft
    basepath = splitquery(basepath)[0] # idem
    first = path[:1]
    if first != '/':
        # non-absolute path name
        if first == '#' or first == '?':
            # path is just a tag or query, attach to the whole basepath
            cut = len(basepath)
        else:
            # else replace the last component
            cut = basepath.rfind('/')
        if cut >= 0:
            # remove last file component
            basepath = basepath[:cut+1]
        elif host:
            # basepath not absolute but host present: make absolute
            basepath = '/'
        else:
            # basepath not absolute, no host: keep non-absolute
            basepath = ''
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            cut = basepath[:-1].rfind('/')
            if cut < 0:
                basepath = ''
            elif cut == 0:
                basepath = '/'
                break
            else:
                basepath = basepath[:cut+1]

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if scheme:
        if host:
            return scheme + '://' + host + path
        return scheme + ':' + path
    if host:
        return '//' + host + path # don't know what this means
    return path
891
892
893# Utilities to parse URLs (most of these return None for missing parts):
894# unwrap('<URL:type://host/path>') --> 'type://host/path'
895# splittype('type:opaquestring') --> 'type', 'opaquestring'
896# splithost('//host[:port]/path') --> 'host[:port]', '/path'
897# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
898# splitpasswd('user:passwd') -> 'user', 'passwd'
899# splitport('host:port') --> 'host', 'port'
900# splitquery('/path?query') --> '/path', 'query'
901# splittag('/path#tag') --> '/path', 'tag'
902# splitattr('/path;attr1=value1;attr2=value2;...') ->
903#   '/path', ['attr1=value1', 'attr2=value2', ...]
904# splitvalue('attr=value') --> 'attr', 'value'
905# splitgophertype('/Xselector') --> 'X', 'selector'
906# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
908
def _is_unicode(x):
    """Return true if x is a unicode object.

    This fallback is used when the types module has no UnicodeType:
    nothing is a unicode object then, so it always returns 0.
    """
    return 0

if hasattr(types, "UnicodeType"):
    def _is_unicode(x):
        """Return true if x is a unicode object."""
        return isinstance(x, unicode)
915
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Anything that is not a unicode object is returned unchanged.  A
    unicode URL is encoded to ASCII; UnicodeError is raised when it
    contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
927
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, one pair of enclosing angle brackets and a
    leading 'URL:' marker are removed, in that order.
    """
    stripped = url.strip()
    if stripped.startswith('<') and stripped.endswith('>'):
        stripped = stripped[1:-1].strip()
    if stripped.startswith('URL:'):
        stripped = stripped[4:].strip()
    return stripped
935
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url has no type prefix
    the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # compiled on first use and cached in the module global
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # the match starts at 0 and ends just past the ':' separator
    rest = url[found.end():]
    return found.group(1).lower(), rest
949
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#' (or at the end of
    the string), so '//host?query' yields ('host', '/?query') rather
    than treating the query as part of the host name.  A non-empty
    remainder that does not start with '/' gets one prepended.
    When url does not start with '//' the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # compiled on first use and cached in the module global
        import re
        _hostprog = re.compile('^//([^/?#]*)(.*)$')

    match = _hostprog.match(url)
    if not match:
        return None, url
    host, path = match.group(1, 2)
    if path and path[:1] != '/':
        # remainder began with '?' or '#': make it a rooted selector
        path = '/' + path
    return host, path
961
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the first '@'.  On a match both parts are
    passed through unquote() and returned as a two-element list;
    without an '@' the result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # compiled on first use and cached in the module global
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
973
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # compiled on first use and cached in the module global
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
985
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When host does not end
    in ':' followed by at least one digit the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # compiled on first use and cached in the module global
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
998
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # compiled on first use and cached in the module global
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port = found.group(1, 2)
    nport = None
    if port:
        # an empty port stays None, just like an unparsable one
        try:
            nport = int(port)
        except ValueError:
            pass
    return host, nport
1020
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # compiled on first use and cached in the module global
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is not None:
        return found.groups()
    return url, None
1032
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # compiled on first use and cached in the module global
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is not None:
        return found.groups()
    return url, None
1044
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    pieces = url.split(';')
    # split() always yields at least one piece: the path itself
    path = pieces.pop(0)
    return path, pieces
1050
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # compiled on first use and cached in the module global
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
1062
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    When selector does not start with '/' followed by at least one more
    character, the result is (None, selector).
    """
    if selector.startswith('/') and len(selector) > 1:
        return selector[1], selector[2:]
    return None, selector
1068
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%' followed by exactly two hexadecimal digits is replaced by
    the character with that code.  Any other '%' sequence (too short, or
    not two hex digits, e.g. '%zz' or '%+1') is left untouched.
    """
    hexdigits = '0123456789ABCDEFabcdef'
    parts = s.split('%')
    # Everything before the first '%' is literal text.
    res = [parts[0]]
    for item in parts[1:]:
        code = item[:2]
        # int() alone is too lenient: it accepts a sign or blank
        # ('+f', ' 1'), so the two digits are checked explicitly.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            try:
                item = chr(int(code, 16)) + item[2:]
            except ValueError:
                # e.g. a non-ASCII byte that cannot be joined to unicode
                # text: keep the escape as it was
                item = '%' + item
        else:
            item = '%' + item
        res.append(item)
    return "".join(res)
1087
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is turned into a space first."""
    return unquote(s.replace('+', ' '))
1094
# Characters that never need quoting, whatever the URL component.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
               'abcdefghijklmnopqrstuvwxyz' +
               '0123456789' + '_.-')

# The safe set quote() ends up with for its default safe='/' argument ...
_fast_safe_test = always_safe + '/'
# ... and the lookup table for it, built on the first _fast_quote() call.
_fast_safe = None

def _fast_quote(s):
    """Quote s, leaving only the characters of _fast_safe_test unquoted."""
    global _fast_safe
    if _fast_safe is None:
        table = {}
        for ch in _fast_safe_test:
            table[ch] = ch
        _fast_safe = table
    quoted = []
    for ch in s:
        if ch in _fast_safe:
            quoted.append(ch)
        else:
            quoted.append('%%%02X' % ord(ch))
    return ''.join(quoted)
1114
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters in safe are left alone in addition to the always-safe
    letters, digits and '_.-'; everything else becomes '%XX'.
    """
    allowed = always_safe + safe
    if allowed == _fast_safe_test:
        # the default safe set has a precomputed lookup table
        return _fast_quote(s)
    pieces = []
    for ch in s:
        if ch in allowed:
            pieces.append(ch)
        else:
            pieces.append('%%%02X' % ord(ch))
    return ''.join(pieces)
1145
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Each space-separated piece of s is passed through quote() with the
    given safe characters, and the pieces are rejoined with '+'."""
    if ' ' not in s:
        return quote(s, safe)
    return '+'.join([quote(word, safe) for word in s.split(' ')])
1155
1156def urlencode(query,doseq=0):
1157    """Encode a sequence of two-element tuples or dictionary into a URL query string.
1158
1159    If any values in the query arg are sequences and doseq is true, each
1160    sequence element is converted to a separate parameter.
1161
1162    If the query arg is a sequence of two-element tuples, the order of the
1163    parameters in the output will match the order of parameters in the
1164    input.
1165    """
1166
1167    if hasattr(query,"items"):
1168        # mapping objects
1169        query = query.items()
1170    else:
1171        # it's a bother at times that strings and string-like objects are
1172        # sequences...
1173        try:
1174            # non-sequence items should not work with len()
1175            # non-empty strings will fail this
1176            if len(query) and type(query[0]) != types.TupleType:
1177                raise TypeError
1178            # zero-length sequences of all types will get here and succeed,
1179            # but that's a minor nit - since the original implementation
1180            # allowed empty dicts that type of behavior probably should be
1181            # preserved for consistency
1182        except TypeError:
1183            ty,va,tb = sys.exc_info()
1184            raise TypeError, "not a valid non-string sequence or mapping object", tb
1185
1186    l = []
1187    if not doseq:
1188        # preserve old behavior
1189        for k, v in query:
1190            k = quote_plus(str(k))
1191            v = quote_plus(str(v))
1192            l.append(k + '=' + v)
1193    else:
1194        for k, v in query:
1195            k = quote_plus(str(k))
1196            if type(v) == types.StringType:
1197                v = quote_plus(v)
1198                l.append(k + '=' + v)
1199            elif _is_unicode(v):
1200                # is there a reasonable way to convert to ASCII?
1201                # encode generates a string, but "replace" or "ignore"
1202                # lose information and "strict" can raise UnicodeError
1203                v = quote_plus(v.encode("ASCII","replace"))
1204                l.append(k + '=' + v)
1205            else:
1206                try:
1207                    # is this a sufficient test for sequence-ness?
1208                    x = len(v)
1209                except TypeError:
1210                    # not a sequence
1211                    v = quote_plus(str(v))
1212                    l.append(k + '=' + v)
1213                else:
1214                    # loop over the sequence
1215                    for elt in v:
1216                        l.append(k + '=' + quote_plus(str(elt)))
1217    return '&'.join(l)
1218
1219# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy (in any
    letter case); this seems to be the standard convention.  Variables
    with an empty value are skipped.  If you need a different way, you
    can pass a proxies dictionary to the [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        if not value:
            continue
        name = name.lower()
        if name.endswith('_proxy'):
            # strip the 6-character '_proxy' suffix to get the scheme
            proxies[name[:-6]] = value
    return proxies
1235
1236if os.name == 'mac':
1237    def getproxies():
1238        """Return a dictionary of scheme -> proxy server URL mappings.
1239
1240        By convention the mac uses Internet Config to store
1241        proxies.  An HTTP proxy, for instance, is stored under
1242        the HttpProxy key.
1243
1244        """
1245        try:
1246            import ic
1247        except ImportError:
1248            return {}
1249
1250        try:
1251            config = ic.IC()
1252        except ic.error:
1253            return {}
1254        proxies = {}
1255        # HTTP:
1256        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
1257            try:
1258                value = config['HTTPProxyHost']
1259            except ic.error:
1260                pass
1261            else:
1262                proxies['http'] = 'http://%s' % value
1263        # FTP: XXXX To be done.
1264        # Gopher: XXXX To be done.
1265        return proxies
1266
    def proxy_bypass(x):
        """Return 0 for every argument: no host bypasses the proxy here."""
        return 0
1269
1270elif os.name == 'nt':
1271    def getproxies_registry():
1272        """Return a dictionary of scheme -> proxy server URL mappings.
1273
1274        Win32 uses the registry to store proxies.
1275
1276        """
1277        proxies = {}
1278        try:
1279            import _winreg
1280        except ImportError:
1281            # Std module, so should be around - but you never know!
1282            return proxies
1283        try:
1284            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1285                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1286            proxyEnable = _winreg.QueryValueEx(internetSettings,
1287                                               'ProxyEnable')[0]
1288            if proxyEnable:
1289                # Returned as Unicode but problems if not converted to ASCII
1290                proxyServer = str(_winreg.QueryValueEx(internetSettings,
1291                                                       'ProxyServer')[0])
1292                if '=' in proxyServer:
1293                    # Per-protocol settings
1294                    for p in proxyServer.split(';'):
1295                        protocol, address = p.split('=', 1)
1296                        # See if address has a type:// prefix
1297                        import re
1298                        if not re.match('^([^/:]+)://', address):
1299                            address = '%s://%s' % (protocol, address)
1300                        proxies[protocol] = address
1301                else:
1302                    # Use one setting for all protocols
1303                    if proxyServer[:5] == 'http:':
1304                        proxies['http'] = proxyServer
1305                    else:
1306                        proxies['http'] = 'http://%s' % proxyServer
1307                        proxies['ftp'] = 'ftp://%s' % proxyServer
1308            internetSettings.Close()
1309        except (WindowsError, ValueError, TypeError):
1310            # Either registry key not found etc, or the value in an
1311            # unexpected format.
1312            # proxies already set up to be empty so nothing to do
1313            pass
1314        return proxies
1315
1316    def getproxies():
1317        """Return a dictionary of scheme -> proxy server URL mappings.
1318
1319        Returns settings gathered from the environment, if specified,
1320        or the registry.
1321
1322        """
1323        return getproxies_environment() or getproxies_registry()
1324
1325    def proxy_bypass(host):
1326        try:
1327            import _winreg
1328            import re
1329        except ImportError:
1330            # Std modules, so should be around - but you never know!
1331            return 0
1332        try:
1333            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1334                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1335            proxyEnable = _winreg.QueryValueEx(internetSettings,
1336                                               'ProxyEnable')[0]
1337            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1338                                                     'ProxyOverride')[0])
1339            # ^^^^ Returned as Unicode but problems if not converted to ASCII
1340        except WindowsError:
1341            return 0
1342        if not proxyEnable or not proxyOverride:
1343            return 0
1344        # try to make a host list from name and IP address.
1345        host = [host]
1346        try:
1347            addr = socket.gethostbyname(host[0])
1348            if addr != host:
1349                host.append(addr)
1350        except socket.error:
1351            pass
1352        # make a check value list from the registry entry: replace the
1353        # '<local>' string by the localhost entry and the corresponding
1354        # canonical entry.
1355        proxyOverride = proxyOverride.split(';')
1356        i = 0
1357        while i < len(proxyOverride):
1358            if proxyOverride[i] == '<local>':
1359                proxyOverride[i:i+1] = ['localhost',
1360                                        '127.0.0.1',
1361                                        socket.gethostname(),
1362                                        socket.gethostbyname(
1363                                            socket.gethostname())]
1364            i += 1
1365        # print proxyOverride
1366        # now check if we match one of the registry values.
1367        for test in proxyOverride:
1368            test = test.replace(".", r"\.")     # mask dots
1369            test = test.replace("*", r".*")     # change glob sequence
1370            test = test.replace("?", r".")      # change glob char
1371            for val in host:
1372                # print "%s <--> %s" %( test, val )
1373                if re.match(test, val, re.I):
1374                    return 1
1375        return 0
1376
1377else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        """Return 0 for every host: no bypass list is consulted here."""
        return 0
1383
1384# Test and time quote() and unquote()
def test1():
    """Quote and unquote every character code 0..255 (four times over).

    Prints 'Wrong!' when the round trip does not give back the input,
    then the repr of the input, of the quoted and of the unquoted
    string, and the elapsed time in seconds.
    """
    s = ''.join([chr(i) for i in range(256)]) * 4
    started = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    elapsed = time.time() - started
    if uqs != s:
        print('Wrong!')
    print(repr(s))
    print(repr(qs))
    print(repr(uqs))
    print('%s sec' % round(elapsed, 3))
1399
1400
def reporthook(blocknum, blocksize, totalsize):
    """Report during remote transfers: print the three numbers given."""
    report = "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
    print(report)
1405
1406# Test program
1407def test(args=[]):
1408    if not args:
1409        args = [
1410            '/etc/passwd',
1411            'file:/etc/passwd',
1412            'file://localhost/etc/passwd',
1413            'ftp://ftp.python.org/pub/python/README',
1414##          'gopher://gopher.micro.umn.edu/1/',
1415            'http://www.python.org/index.html',
1416            ]
1417        if hasattr(URLopener, "open_https"):
1418            args.append('https://synergy.as.cmu.edu/~geek/')
1419    try:
1420        for url in args:
1421            print '-'*10, url, '-'*10
1422            fn, h = urlretrieve(url, None, reporthook)
1423            print fn
1424            if h:
1425                print '======'
1426                for k in h.keys(): print k + ':', h[k]
1427                print '======'
1428            fp = open(fn, 'rb')
1429            data = fp.read()
1430            del fp
1431            if '\r' in data:
1432                table = string.maketrans("", "")
1433                data = data.translate(table, "\r")
1434            print data
1435            fn, h = None, None
1436        print '-'*40
1437    finally:
1438        urlcleanup()
1439
1440def main():
1441    import getopt, sys
1442    try:
1443        opts, args = getopt.getopt(sys.argv[1:], "th")
1444    except getopt.error, msg:
1445        print msg
1446        print "Use -h for help"
1447        return
1448    t = 0
1449    for o, a in opts:
1450        if o == '-t':
1451            t = t + 1
1452        if o == '-h':
1453            print "Usage: python urllib.py [-t] [url ...]"
1454            print "-t runs self-test;",
1455            print "otherwise, contents of urls are printed"
1456            return
1457    if t:
1458        if t > 1:
1459            test1()
1460        test(args)
1461    else:
1462        if not args:
1463            print "Use -h for help"
1464        for url in args:
1465            print urlopen(url).read(),
1466
# Run the command-line driver main() when this file is run as a script
# (it is not run on import)
if __name__ == '__main__':
    main()
1470