urllib.py revision c680ae8002e955ef616741ae59338f0cde0f2ff8
1"""Open an arbitrary URL. 2 3See the following document for more info on URLs: 4"Names and Addresses, URIs, URLs, URNs, URCs", at 5http://www.w3.org/pub/WWW/Addressing/Overview.html 6 7See also the HTTP spec (from which the error codes are derived): 8"HTTP - Hypertext Transfer Protocol", at 9http://www.w3.org/pub/WWW/Protocols/ 10 11Related standards and specs: 12- RFC1808: the "relative URL" spec. (authoritative status) 13- RFC1738 - the "URL standard". (authoritative status) 14- RFC1630 - the "URI spec". (informational status) 15 16The object returned by URLopener().open(file) will differ per 17protocol. All you know is that is has methods read(), readline(), 18readlines(), fileno(), close() and info(). The read*(), fileno() 19and close() methods work like those of open files. 20The info() method returns a mimetools.Message object which can be 21used to query various info about the object, if available. 22(mimetools.Message objects are queried with the getheader() method.) 23""" 24 25import string 26import socket 27import os 28import stat 29import time 30import sys 31import types 32 33__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve", 34 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus", 35 "urlencode", "url2pathname", "pathname2url", "splittag", 36 "localhost", "thishost", "ftperrors", "basejoin", "unwrap", 37 "splittype", "splithost", "splituser", "splitpasswd", "splitport", 38 "splitnport", "splitquery", "splitattr", "splitvalue", 39 "splitgophertype", "getproxies"] 40 41__version__ = '1.15' # XXX This version is not always updated :-( 42 43MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 44 45# Helper for non-unix systems 46if os.name == 'mac': 47 from macurl2path import url2pathname, pathname2url 48elif os.name == 'nt': 49 from nturl2path import url2pathname, pathname2url 50elif os.name == 'riscos': 51 from rourl2path import url2pathname, pathname2url 52else: 53 def url2pathname(pathname): 54 return unquote(pathname) 55 
def pathname2url(pathname): 56 return quote(pathname) 57 58# This really consists of two pieces: 59# (1) a class which handles opening of all sorts of URLs 60# (plus assorted utilities etc.) 61# (2) a set of functions for parsing URLs 62# XXX Should these be separated out into different modules? 63 64 65# Shortcut for basic usage 66_urlopener = None 67def urlopen(url, data=None): 68 """urlopen(url [, data]) -> open file-like object""" 69 global _urlopener 70 if not _urlopener: 71 _urlopener = FancyURLopener() 72 if data is None: 73 return _urlopener.open(url) 74 else: 75 return _urlopener.open(url, data) 76def urlretrieve(url, filename=None, reporthook=None, data=None): 77 global _urlopener 78 if not _urlopener: 79 _urlopener = FancyURLopener() 80 return _urlopener.retrieve(url, filename, reporthook, data) 81def urlcleanup(): 82 if _urlopener: 83 _urlopener.cleanup() 84 85 86ftpcache = {} 87class URLopener: 88 """Class to open URLs. 89 This is a class rather than just a subroutine because we may need 90 more than one set of global protocol-specific options. 91 Note -- this is a base class for those who don't want the 92 automatic handling of errors type 302 (relocated) and 401 93 (authorization needed).""" 94 95 __tempfiles = None 96 97 version = "Python-urllib/%s" % __version__ 98 99 # Constructor 100 def __init__(self, proxies=None, **x509): 101 if proxies is None: 102 proxies = getproxies() 103 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 104 self.proxies = proxies 105 self.key_file = x509.get('key_file') 106 self.cert_file = x509.get('cert_file') 107 self.addheaders = [('User-agent', self.version)] 108 self.__tempfiles = [] 109 self.__unlink = os.unlink # See cleanup() 110 self.tempcache = None 111 # Undocumented feature: if you assign {} to tempcache, 112 # it is used to cache files retrieved with 113 # self.retrieve(). 
This is not enabled by default 114 # since it does not work for changing documents (and I 115 # haven't got the logic to check expiration headers 116 # yet). 117 self.ftpcache = ftpcache 118 # Undocumented feature: you can use a different 119 # ftp cache by assigning to the .ftpcache member; 120 # in case you want logically independent URL openers 121 # XXX This is not threadsafe. Bah. 122 123 def __del__(self): 124 self.close() 125 126 def close(self): 127 self.cleanup() 128 129 def cleanup(self): 130 # This code sometimes runs when the rest of this module 131 # has already been deleted, so it can't use any globals 132 # or import anything. 133 if self.__tempfiles: 134 for file in self.__tempfiles: 135 try: 136 self.__unlink(file) 137 except OSError: 138 pass 139 del self.__tempfiles[:] 140 if self.tempcache: 141 self.tempcache.clear() 142 143 def addheader(self, *args): 144 """Add a header to be used by the HTTP interface only 145 e.g. u.addheader('Accept', 'sound/basic')""" 146 self.addheaders.append(args) 147 148 # External interface 149 def open(self, fullurl, data=None): 150 """Use URLopener().open(file) instead of open(file, 'r').""" 151 fullurl = unwrap(toBytes(fullurl)) 152 if self.tempcache and self.tempcache.has_key(fullurl): 153 filename, headers = self.tempcache[fullurl] 154 fp = open(filename, 'rb') 155 return addinfourl(fp, headers, fullurl) 156 urltype, url = splittype(fullurl) 157 if not urltype: 158 urltype = 'file' 159 if self.proxies.has_key(urltype): 160 proxy = self.proxies[urltype] 161 urltype, proxyhost = splittype(proxy) 162 host, selector = splithost(proxyhost) 163 url = (host, fullurl) # Signal special case to open_*() 164 else: 165 proxy = None 166 name = 'open_' + urltype 167 self.type = urltype 168 if '-' in name: 169 # replace - with _ 170 name = '_'.join(name.split('-')) 171 if not hasattr(self, name): 172 if proxy: 173 return self.open_unknown_proxy(proxy, fullurl, data) 174 else: 175 return self.open_unknown(fullurl, data) 176 try: 
177 if data is None: 178 return getattr(self, name)(url) 179 else: 180 return getattr(self, name)(url, data) 181 except socket.error, msg: 182 raise IOError, ('socket error', msg), sys.exc_info()[2] 183 184 def open_unknown(self, fullurl, data=None): 185 """Overridable interface to open unknown URL type.""" 186 type, url = splittype(fullurl) 187 raise IOError, ('url error', 'unknown url type', type) 188 189 def open_unknown_proxy(self, proxy, fullurl, data=None): 190 """Overridable interface to open unknown URL type.""" 191 type, url = splittype(fullurl) 192 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy) 193 194 # External interface 195 def retrieve(self, url, filename=None, reporthook=None, data=None): 196 """retrieve(url) returns (filename, None) for a local object 197 or (tempfilename, headers) for a remote object.""" 198 url = unwrap(toBytes(url)) 199 if self.tempcache and self.tempcache.has_key(url): 200 return self.tempcache[url] 201 type, url1 = splittype(url) 202 if not filename and (not type or type == 'file'): 203 try: 204 fp = self.open_local_file(url1) 205 hdrs = fp.info() 206 del fp 207 return url2pathname(splithost(url1)[1]), hdrs 208 except IOError, msg: 209 pass 210 fp = self.open(url, data) 211 headers = fp.info() 212 if not filename: 213 import tempfile 214 garbage, path = splittype(url) 215 garbage, path = splithost(path or "") 216 path, garbage = splitquery(path or "") 217 path, garbage = splitattr(path or "") 218 suffix = os.path.splitext(path)[1] 219 filename = tempfile.mktemp(suffix) 220 self.__tempfiles.append(filename) 221 result = filename, headers 222 if self.tempcache is not None: 223 self.tempcache[url] = result 224 tfp = open(filename, 'wb') 225 bs = 1024*8 226 size = -1 227 blocknum = 1 228 if reporthook: 229 if headers.has_key("content-length"): 230 size = int(headers["Content-Length"]) 231 reporthook(0, bs, size) 232 block = fp.read(bs) 233 if reporthook: 234 reporthook(1, bs, size) 235 while block: 236 
tfp.write(block) 237 block = fp.read(bs) 238 blocknum = blocknum + 1 239 if reporthook: 240 reporthook(blocknum, bs, size) 241 fp.close() 242 tfp.close() 243 del fp 244 del tfp 245 return result 246 247 # Each method named open_<type> knows how to open that type of URL 248 249 def open_http(self, url, data=None): 250 """Use HTTP protocol.""" 251 import httplib 252 user_passwd = None 253 if type(url) is types.StringType: 254 host, selector = splithost(url) 255 if host: 256 user_passwd, host = splituser(host) 257 host = unquote(host) 258 realhost = host 259 else: 260 host, selector = url 261 urltype, rest = splittype(selector) 262 url = rest 263 user_passwd = None 264 if urltype.lower() != 'http': 265 realhost = None 266 else: 267 realhost, rest = splithost(rest) 268 if realhost: 269 user_passwd, realhost = splituser(realhost) 270 if user_passwd: 271 selector = "%s://%s%s" % (urltype, realhost, rest) 272 if proxy_bypass(realhost): 273 host = realhost 274 275 #print "proxy via http:", host, selector 276 if not host: raise IOError, ('http error', 'no host given') 277 if user_passwd: 278 import base64 279 auth = base64.encodestring(user_passwd).strip() 280 else: 281 auth = None 282 h = httplib.HTTP(host) 283 if data is not None: 284 h.putrequest('POST', selector) 285 h.putheader('Content-type', 'application/x-www-form-urlencoded') 286 h.putheader('Content-length', '%d' % len(data)) 287 else: 288 h.putrequest('GET', selector) 289 if auth: h.putheader('Authorization', 'Basic %s' % auth) 290 if realhost: h.putheader('Host', realhost) 291 for args in self.addheaders: apply(h.putheader, args) 292 h.endheaders() 293 if data is not None: 294 h.send(data) 295 errcode, errmsg, headers = h.getreply() 296 fp = h.getfile() 297 if errcode == 200: 298 return addinfourl(fp, headers, "http:" + url) 299 else: 300 if data is None: 301 return self.http_error(url, fp, errcode, errmsg, headers) 302 else: 303 return self.http_error(url, fp, errcode, errmsg, headers, data) 304 305 def 
http_error(self, url, fp, errcode, errmsg, headers, data=None): 306 """Handle http errors. 307 Derived class can override this, or provide specific handlers 308 named http_error_DDD where DDD is the 3-digit error code.""" 309 # First check if there's a specific handler for this error 310 name = 'http_error_%d' % errcode 311 if hasattr(self, name): 312 method = getattr(self, name) 313 if data is None: 314 result = method(url, fp, errcode, errmsg, headers) 315 else: 316 result = method(url, fp, errcode, errmsg, headers, data) 317 if result: return result 318 return self.http_error_default(url, fp, errcode, errmsg, headers) 319 320 def http_error_default(self, url, fp, errcode, errmsg, headers): 321 """Default error handler: close the connection and raise IOError.""" 322 void = fp.read() 323 fp.close() 324 raise IOError, ('http error', errcode, errmsg, headers) 325 326 if hasattr(socket, "ssl"): 327 def open_https(self, url, data=None): 328 """Use HTTPS protocol.""" 329 import httplib 330 user_passwd = None 331 if type(url) is types.StringType: 332 host, selector = splithost(url) 333 if host: 334 user_passwd, host = splituser(host) 335 host = unquote(host) 336 realhost = host 337 else: 338 host, selector = url 339 urltype, rest = splittype(selector) 340 url = rest 341 user_passwd = None 342 if urltype.lower() != 'https': 343 realhost = None 344 else: 345 realhost, rest = splithost(rest) 346 if realhost: 347 user_passwd, realhost = splituser(realhost) 348 if user_passwd: 349 selector = "%s://%s%s" % (urltype, realhost, rest) 350 #print "proxy via https:", host, selector 351 if not host: raise IOError, ('https error', 'no host given') 352 if user_passwd: 353 import base64 354 auth = base64.encodestring(user_passwd).strip() 355 else: 356 auth = None 357 h = httplib.HTTPS(host, 0, 358 key_file=self.key_file, 359 cert_file=self.cert_file) 360 if data is not None: 361 h.putrequest('POST', selector) 362 h.putheader('Content-type', 363 'application/x-www-form-urlencoded') 364 
h.putheader('Content-length', '%d' % len(data)) 365 else: 366 h.putrequest('GET', selector) 367 if auth: h.putheader('Authorization: Basic %s' % auth) 368 if realhost: h.putheader('Host', realhost) 369 for args in self.addheaders: apply(h.putheader, args) 370 h.endheaders() 371 if data is not None: 372 h.send(data) 373 errcode, errmsg, headers = h.getreply() 374 fp = h.getfile() 375 if errcode == 200: 376 return addinfourl(fp, headers, url) 377 else: 378 if data is None: 379 return self.http_error(url, fp, errcode, errmsg, headers) 380 else: 381 return self.http_error(url, fp, errcode, errmsg, headers, 382 data) 383 384 def open_gopher(self, url): 385 """Use Gopher protocol.""" 386 import gopherlib 387 host, selector = splithost(url) 388 if not host: raise IOError, ('gopher error', 'no host given') 389 host = unquote(host) 390 type, selector = splitgophertype(selector) 391 selector, query = splitquery(selector) 392 selector = unquote(selector) 393 if query: 394 query = unquote(query) 395 fp = gopherlib.send_query(selector, query, host) 396 else: 397 fp = gopherlib.send_selector(selector, host) 398 return addinfourl(fp, noheaders(), "gopher:" + url) 399 400 def open_file(self, url): 401 """Use local file or FTP depending on form of URL.""" 402 if url[:2] == '//' and url[2:3] != '/': 403 return self.open_ftp(url) 404 else: 405 return self.open_local_file(url) 406 407 def open_local_file(self, url): 408 """Use local file.""" 409 import mimetypes, mimetools, rfc822, StringIO 410 host, file = splithost(url) 411 localname = url2pathname(file) 412 stats = os.stat(localname) 413 size = stats[stat.ST_SIZE] 414 modified = rfc822.formatdate(stats[stat.ST_MTIME]) 415 mtype = mimetypes.guess_type(url)[0] 416 headers = mimetools.Message(StringIO.StringIO( 417 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 418 (mtype or 'text/plain', size, modified))) 419 if not host: 420 urlfile = file 421 if file[:1] == '/': 422 urlfile = 'file://' + file 423 return 
addinfourl(open(localname, 'rb'), 424 headers, urlfile) 425 host, port = splitport(host) 426 if not port \ 427 and socket.gethostbyname(host) in (localhost(), thishost()): 428 urlfile = file 429 if file[:1] == '/': 430 urlfile = 'file://' + file 431 return addinfourl(open(localname, 'rb'), 432 headers, urlfile) 433 raise IOError, ('local file error', 'not on local host') 434 435 def open_ftp(self, url): 436 """Use FTP protocol.""" 437 import mimetypes, mimetools, StringIO 438 host, path = splithost(url) 439 if not host: raise IOError, ('ftp error', 'no host given') 440 host, port = splitport(host) 441 user, host = splituser(host) 442 if user: user, passwd = splitpasswd(user) 443 else: passwd = None 444 host = unquote(host) 445 user = unquote(user or '') 446 passwd = unquote(passwd or '') 447 host = socket.gethostbyname(host) 448 if not port: 449 import ftplib 450 port = ftplib.FTP_PORT 451 else: 452 port = int(port) 453 path, attrs = splitattr(path) 454 path = unquote(path) 455 dirs = path.split('/') 456 dirs, file = dirs[:-1], dirs[-1] 457 if dirs and not dirs[0]: dirs = dirs[1:] 458 if dirs and not dirs[0]: dirs[0] = '/' 459 key = user, host, port, '/'.join(dirs) 460 # XXX thread unsafe! 
461 if len(self.ftpcache) > MAXFTPCACHE: 462 # Prune the cache, rather arbitrarily 463 for k in self.ftpcache.keys(): 464 if k != key: 465 v = self.ftpcache[k] 466 del self.ftpcache[k] 467 v.close() 468 try: 469 if not self.ftpcache.has_key(key): 470 self.ftpcache[key] = \ 471 ftpwrapper(user, passwd, host, port, dirs) 472 if not file: type = 'D' 473 else: type = 'I' 474 for attr in attrs: 475 attr, value = splitvalue(attr) 476 if attr.lower() == 'type' and \ 477 value in ('a', 'A', 'i', 'I', 'd', 'D'): 478 type = value.upper() 479 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 480 mtype = mimetypes.guess_type("ftp:" + url)[0] 481 headers = "" 482 if mtype: 483 headers += "Content-Type: %s\n" % mtype 484 if retrlen is not None and retrlen >= 0: 485 headers += "Content-Length: %d\n" % retrlen 486 headers = mimetools.Message(StringIO.StringIO(headers)) 487 return addinfourl(fp, headers, "ftp:" + url) 488 except ftperrors(), msg: 489 raise IOError, ('ftp error', msg), sys.exc_info()[2] 490 491 def open_data(self, url, data=None): 492 """Use "data" URL.""" 493 # ignore POSTed data 494 # 495 # syntax of data URLs: 496 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 497 # mediatype := [ type "/" subtype ] *( ";" parameter ) 498 # data := *urlchar 499 # parameter := attribute "=" value 500 import StringIO, mimetools, time 501 try: 502 [type, data] = url.split(',', 1) 503 except ValueError: 504 raise IOError, ('data error', 'bad data URL') 505 if not type: 506 type = 'text/plain;charset=US-ASCII' 507 semi = type.rfind(';') 508 if semi >= 0 and '=' not in type[semi:]: 509 encoding = type[semi+1:] 510 type = type[:semi] 511 else: 512 encoding = '' 513 msg = [] 514 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', 515 time.gmtime(time.time()))) 516 msg.append('Content-type: %s' % type) 517 if encoding == 'base64': 518 import base64 519 data = base64.decodestring(data) 520 else: 521 data = unquote(data) 522 msg.append('Content-length: %d' % 
len(data)) 523 msg.append('') 524 msg.append(data) 525 msg = '\n'.join(msg) 526 f = StringIO.StringIO(msg) 527 headers = mimetools.Message(f, 0) 528 f.fileno = None # needed for addinfourl 529 return addinfourl(f, headers, url) 530 531 532class FancyURLopener(URLopener): 533 """Derived class with handlers for errors we can handle (perhaps).""" 534 535 def __init__(self, *args): 536 apply(URLopener.__init__, (self,) + args) 537 self.auth_cache = {} 538 self.tries = 0 539 self.maxtries = 10 540 541 def http_error_default(self, url, fp, errcode, errmsg, headers): 542 """Default error handling -- don't raise an exception.""" 543 return addinfourl(fp, headers, "http:" + url) 544 545 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 546 """Error 302 -- relocated (temporarily).""" 547 self.tries += 1 548 if self.maxtries and self.tries >= self.maxtries: 549 if hasattr(self, "http_error_500"): 550 meth = self.http_error_500 551 else: 552 meth = self.http_error_default 553 self.tries = 0 554 return meth(url, fp, 500, 555 "Internal Server Error: Redirect Recursion", headers) 556 result = self.redirect_internal(url, fp, errcode, errmsg, headers, 557 data) 558 self.tries = 0 559 return result 560 561 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 562 if headers.has_key('location'): 563 newurl = headers['location'] 564 elif headers.has_key('uri'): 565 newurl = headers['uri'] 566 else: 567 return 568 void = fp.read() 569 fp.close() 570 # In case the server sent a relative URL, join with original: 571 newurl = basejoin(self.type + ":" + url, newurl) 572 if data is None: 573 return self.open(newurl) 574 else: 575 return self.open(newurl, data) 576 577 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 578 """Error 301 -- also relocated (permanently).""" 579 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 580 581 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): 582 """Error 
401 -- authentication required. 583 See this URL for a description of the basic authentication scheme: 584 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt""" 585 if not headers.has_key('www-authenticate'): 586 URLopener.http_error_default(self, url, fp, 587 errcode, errmsg, headers) 588 stuff = headers['www-authenticate'] 589 import re 590 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 591 if not match: 592 URLopener.http_error_default(self, url, fp, 593 errcode, errmsg, headers) 594 scheme, realm = match.groups() 595 if scheme.lower() != 'basic': 596 URLopener.http_error_default(self, url, fp, 597 errcode, errmsg, headers) 598 name = 'retry_' + self.type + '_basic_auth' 599 if data is None: 600 return getattr(self,name)(url, realm) 601 else: 602 return getattr(self,name)(url, realm, data) 603 604 def retry_http_basic_auth(self, url, realm, data=None): 605 host, selector = splithost(url) 606 i = host.find('@') + 1 607 host = host[i:] 608 user, passwd = self.get_user_passwd(host, realm, i) 609 if not (user or passwd): return None 610 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 611 newurl = 'http://' + host + selector 612 if data is None: 613 return self.open(newurl) 614 else: 615 return self.open(newurl, data) 616 617 def retry_https_basic_auth(self, url, realm, data=None): 618 host, selector = splithost(url) 619 i = host.find('@') + 1 620 host = host[i:] 621 user, passwd = self.get_user_passwd(host, realm, i) 622 if not (user or passwd): return None 623 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 624 newurl = '//' + host + selector 625 return self.open_https(newurl, data) 626 627 def get_user_passwd(self, host, realm, clear_cache = 0): 628 key = realm + '@' + host.lower() 629 if self.auth_cache.has_key(key): 630 if clear_cache: 631 del self.auth_cache[key] 632 else: 633 return self.auth_cache[key] 634 user, passwd = self.prompt_user_passwd(host, realm) 635 if user or 
passwd: self.auth_cache[key] = (user, passwd) 636 return user, passwd 637 638 def prompt_user_passwd(self, host, realm): 639 """Override this in a GUI environment!""" 640 import getpass 641 try: 642 user = raw_input("Enter username for %s at %s: " % (realm, 643 host)) 644 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 645 (user, realm, host)) 646 return user, passwd 647 except KeyboardInterrupt: 648 print 649 return None, None 650 651 652# Utility functions 653 654_localhost = None 655def localhost(): 656 """Return the IP address of the magic hostname 'localhost'.""" 657 global _localhost 658 if not _localhost: 659 _localhost = socket.gethostbyname('localhost') 660 return _localhost 661 662_thishost = None 663def thishost(): 664 """Return the IP address of the current host.""" 665 global _thishost 666 if not _thishost: 667 _thishost = socket.gethostbyname(socket.gethostname()) 668 return _thishost 669 670_ftperrors = None 671def ftperrors(): 672 """Return the set of errors raised by the FTP class.""" 673 global _ftperrors 674 if not _ftperrors: 675 import ftplib 676 _ftperrors = ftplib.all_errors 677 return _ftperrors 678 679_noheaders = None 680def noheaders(): 681 """Return an empty mimetools.Message object.""" 682 global _noheaders 683 if not _noheaders: 684 import mimetools 685 import StringIO 686 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 687 _noheaders.fp.close() # Recycle file descriptor 688 return _noheaders 689 690 691# Utility classes 692 693class ftpwrapper: 694 """Class used by open_ftp() for cache of open FTP connections.""" 695 696 def __init__(self, user, passwd, host, port, dirs): 697 self.user = user 698 self.passwd = passwd 699 self.host = host 700 self.port = port 701 self.dirs = dirs 702 self.init() 703 704 def init(self): 705 import ftplib 706 self.busy = 0 707 self.ftp = ftplib.FTP() 708 self.ftp.connect(self.host, self.port) 709 self.ftp.login(self.user, self.passwd) 710 for dir in self.dirs: 711 
self.ftp.cwd(dir) 712 713 def retrfile(self, file, type): 714 import ftplib 715 self.endtransfer() 716 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 717 else: cmd = 'TYPE ' + type; isdir = 0 718 try: 719 self.ftp.voidcmd(cmd) 720 except ftplib.all_errors: 721 self.init() 722 self.ftp.voidcmd(cmd) 723 conn = None 724 if file and not isdir: 725 # Use nlst to see if the file exists at all 726 try: 727 self.ftp.nlst(file) 728 except ftplib.error_perm, reason: 729 raise IOError, ('ftp error', reason), sys.exc_info()[2] 730 # Restore the transfer mode! 731 self.ftp.voidcmd(cmd) 732 # Try to retrieve as a file 733 try: 734 cmd = 'RETR ' + file 735 conn = self.ftp.ntransfercmd(cmd) 736 except ftplib.error_perm, reason: 737 if str(reason)[:3] != '550': 738 raise IOError, ('ftp error', reason), sys.exc_info()[2] 739 if not conn: 740 # Set transfer mode to ASCII! 741 self.ftp.voidcmd('TYPE A') 742 # Try a directory listing 743 if file: cmd = 'LIST ' + file 744 else: cmd = 'LIST' 745 conn = self.ftp.ntransfercmd(cmd) 746 self.busy = 1 747 # Pass back both a suitably decorated object and a retrieval length 748 return (addclosehook(conn[0].makefile('rb'), 749 self.endtransfer), conn[1]) 750 def endtransfer(self): 751 if not self.busy: 752 return 753 self.busy = 0 754 try: 755 self.ftp.voidresp() 756 except ftperrors(): 757 pass 758 759 def close(self): 760 self.endtransfer() 761 try: 762 self.ftp.close() 763 except ftperrors(): 764 pass 765 766class addbase: 767 """Base class for addinfo and addclosehook.""" 768 769 def __init__(self, fp): 770 self.fp = fp 771 self.read = self.fp.read 772 self.readline = self.fp.readline 773 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines 774 if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno 775 776 def __repr__(self): 777 return '<%s at %s whose fp = %s>' % (self.__class__.__name__, 778 `id(self)`, `self.fp`) 779 780 def close(self): 781 self.read = None 782 self.readline = None 783 self.readlines = None 784 
self.fileno = None 785 if self.fp: self.fp.close() 786 self.fp = None 787 788class addclosehook(addbase): 789 """Class to add a close hook to an open file.""" 790 791 def __init__(self, fp, closehook, *hookargs): 792 addbase.__init__(self, fp) 793 self.closehook = closehook 794 self.hookargs = hookargs 795 796 def close(self): 797 addbase.close(self) 798 if self.closehook: 799 apply(self.closehook, self.hookargs) 800 self.closehook = None 801 self.hookargs = None 802 803class addinfo(addbase): 804 """class to add an info() method to an open file.""" 805 806 def __init__(self, fp, headers): 807 addbase.__init__(self, fp) 808 self.headers = headers 809 810 def info(self): 811 return self.headers 812 813class addinfourl(addbase): 814 """class to add info() and geturl() methods to an open file.""" 815 816 def __init__(self, fp, headers, url): 817 addbase.__init__(self, fp) 818 self.headers = headers 819 self.url = url 820 821 def info(self): 822 return self.headers 823 824 def geturl(self): 825 return self.url 826 827 828def basejoin(base, url): 829 """Utility to combine a URL with a base URL to form a new URL.""" 830 type, path = splittype(url) 831 if type: 832 # if url is complete (i.e., it contains a type), return it 833 return url 834 host, path = splithost(path) 835 type, basepath = splittype(base) # inherit type from base 836 if host: 837 # if url contains host, just inherit type 838 if type: return type + '://' + host + path 839 else: 840 # no type inherited, so url must have started with // 841 # just return it 842 return url 843 host, basepath = splithost(basepath) # inherit host 844 basepath, basetag = splittag(basepath) # remove extraneous cruft 845 basepath, basequery = splitquery(basepath) # idem 846 if path[:1] != '/': 847 # non-absolute path name 848 if path[:1] in ('#', '?'): 849 # path is just a tag or query, attach to basepath 850 i = len(basepath) 851 else: 852 # else replace last component 853 i = basepath.rfind('/') 854 if i < 0: 855 # basepath not 
absolute 856 if host: 857 # host present, make absolute 858 basepath = '/' 859 else: 860 # else keep non-absolute 861 basepath = '' 862 else: 863 # remove last file component 864 basepath = basepath[:i+1] 865 # Interpret ../ (important because of symlinks) 866 while basepath and path[:3] == '../': 867 path = path[3:] 868 i = basepath[:-1].rfind('/') 869 if i > 0: 870 basepath = basepath[:i+1] 871 elif i == 0: 872 basepath = '/' 873 break 874 else: 875 basepath = '' 876 877 path = basepath + path 878 if host and path and path[0] != '/': 879 path = '/' + path 880 if type and host: return type + '://' + host + path 881 elif type: return type + ':' + path 882 elif host: return '//' + host + path # don't know what this means 883 else: return path 884 885 886# Utilities to parse URLs (most of these return None for missing parts): 887# unwrap('<URL:type://host/path>') --> 'type://host/path' 888# splittype('type:opaquestring') --> 'type', 'opaquestring' 889# splithost('//host[:port]/path') --> 'host[:port]', '/path' 890# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' 891# splitpasswd('user:passwd') -> 'user', 'passwd' 892# splitport('host:port') --> 'host', 'port' 893# splitquery('/path?query') --> '/path', 'query' 894# splittag('/path#tag') --> '/path', 'tag' 895# splitattr('/path;attr1=value1;attr2=value2;...') -> 896# '/path', ['attr1=value1', 'attr2=value2', ...] 897# splitvalue('attr=value') --> 'attr', 'value' 898# splitgophertype('/Xselector') --> 'X', 'selector' 899# unquote('abc%20def') -> 'abc def' 900# quote('abc def') -> 'abc%20def') 901 902def toBytes(url): 903 """toBytes(u"URL") --> 'URL'.""" 904 # Most URL schemes require ASCII. 
If that changes, the conversion
    # can be relaxed
    if type(url) is types.UnicodeType:
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

# Each split*() helper below compiles its regular expression lazily on
# first use and caches it in a module-level global, so repeated calls
# avoid recompilation.

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        # Schemes are case-insensitive, so normalize to lower case.
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    match = _userprog.match(host)
    # Both halves are %-unquoted; note map() returns a list, not a tuple.
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        # Only matches when the part after ':' is all digits; otherwise
        # the host is returned unchanged (see splitnport for the lenient
        # variant).
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number are found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            # An empty port string after ':' is treated the same as a
            # non-numeric one: port becomes None.
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        # Greedy '(.*)' keeps everything up to the LAST '?' in the path.
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        # Greedy '(.*)' splits on the LAST '#'.
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    # The gopher type is the single character right after the leading '/'.
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    # Bind builtins and the append method to locals for speed in the loop.
    mychr = chr
    myatoi = int
    list = s.split('%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    # Every remaining item originally followed a '%'; its first two
    # characters should be a hex byte value.
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                         + item[2:])
            except ValueError:
                # Not valid hex: leave the '%' escape untouched.
                myappend('%' + item)
        else:
            myappend('%' + item)
    return "".join(res)

def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    # Fast path for quote() with the default safe string ('/'): uses a
    # precomputed dict for O(1) membership tests instead of a substring
    # scan per character.
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
               "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL. 
Thus, it will not encode '/'. This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        # Default safe set: take the precomputed-dict fast path.
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        # Quote the space-separated pieces, then rejoin them with '+'.
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)

def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # Re-raise with a clearer message but the original traceback.
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention. If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        # e.g. http_proxy=... yields proxies['http'] = ...; empty values
        # are ignored.
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies. An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        # Proxy bypass is not implemented on the mac; never bypass.
        return 0

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        # Return 1 if `host` matches the registry's ProxyOverride list
        # (i.e. the proxy should be bypassed for it), else 0.
        try:
            import _winreg
            import re
            import socket
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        host = [host]
        try:
            addr = socket.gethostbyname(host[0])
            # NOTE(review): host is a list here, so "addr != host" is always
            # true and addr is appended unconditionally; "addr != host[0]"
            # was probably intended. Harmless apart from a possible
            # duplicate entry.
            if addr != host:
                host.append(addr)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # Translate the shell-style wildcard entry to a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        # Environment-variable platforms: never bypass the proxy.
        return 0

# Test and time quote() and unquote()
def test1():
    import time
    # Build a string containing every byte value, four times over.
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    # unquote(quote(s)) must round-trip exactly.
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    # NOTE(review): mutable default is safe here -- when the default is
    # used, args is rebound to a fresh list before any mutation.
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##            'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
    if hasattr(URLopener, "open_https"):
        args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                # Strip carriage returns before printing.
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        # Always remove any temporary files urlretrieve created.
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        # -t given twice also runs the quote/unquote timing test.
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()