urllib.py revision ec3dfdee6a09e1ad8d4e319876d0cf7691397245
1"""Open an arbitrary URL. 2 3See the following document for more info on URLs: 4"Names and Addresses, URIs, URLs, URNs, URCs", at 5http://www.w3.org/pub/WWW/Addressing/Overview.html 6 7See also the HTTP spec (from which the error codes are derived): 8"HTTP - Hypertext Transfer Protocol", at 9http://www.w3.org/pub/WWW/Protocols/ 10 11Related standards and specs: 12- RFC1808: the "relative URL" spec. (authoritative status) 13- RFC1738 - the "URL standard". (authoritative status) 14- RFC1630 - the "URI spec". (informational status) 15 16The object returned by URLopener().open(file) will differ per 17protocol. All you know is that is has methods read(), readline(), 18readlines(), fileno(), close() and info(). The read*(), fileno() 19and close() methods work like those of open files. 20The info() method returns a mimetools.Message object which can be 21used to query various info about the object, if available. 22(mimetools.Message objects are queried with the getheader() method.) 23""" 24 25import string 26import socket 27import os 28import sys 29import types 30 31__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve", 32 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus", 33 "urlencode", "url2pathname", "pathname2url", "splittag", 34 "localhost", "thishost", "ftperrors", "basejoin", "unwrap", 35 "splittype", "splithost", "splituser", "splitpasswd", "splitport", 36 "splitnport", "splitquery", "splitattr", "splitvalue", 37 "splitgophertype", "getproxies"] 38 39__version__ = '1.15' # XXX This version is not always updated :-( 40 41MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 42 43# Helper for non-unix systems 44if os.name == 'mac': 45 from macurl2path import url2pathname, pathname2url 46elif os.name == 'nt': 47 from nturl2path import url2pathname, pathname2url 48elif os.name == 'riscos': 49 from rourl2path import url2pathname, pathname2url 50else: 51 def url2pathname(pathname): 52 return unquote(pathname) 53 def pathname2url(pathname): 54 return quote(pathname) 55 56# This really consists of two pieces: 57# (1) a class which handles opening of all sorts of URLs 58# (plus assorted utilities etc.) 59# (2) a set of functions for parsing URLs 60# XXX Should these be separated out into different modules? 61 62 63# Shortcut for basic usage 64_urlopener = None 65def urlopen(url, data=None): 66 """urlopen(url [, data]) -> open file-like object""" 67 global _urlopener 68 if not _urlopener: 69 _urlopener = FancyURLopener() 70 if data is None: 71 return _urlopener.open(url) 72 else: 73 return _urlopener.open(url, data) 74def urlretrieve(url, filename=None, reporthook=None, data=None): 75 global _urlopener 76 if not _urlopener: 77 _urlopener = FancyURLopener() 78 return _urlopener.retrieve(url, filename, reporthook, data) 79def urlcleanup(): 80 if _urlopener: 81 _urlopener.cleanup() 82 83 84ftpcache = {} 85class URLopener: 86 """Class to open URLs. 87 This is a class rather than just a subroutine because we may need 88 more than one set of global protocol-specific options. 89 Note -- this is a base class for those who don't want the 90 automatic handling of errors type 302 (relocated) and 401 91 (authorization needed).""" 92 93 __tempfiles = None 94 95 version = "Python-urllib/%s" % __version__ 96 97 # Constructor 98 def __init__(self, proxies=None, **x509): 99 if proxies is None: 100 proxies = getproxies() 101 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 102 self.proxies = proxies 103 self.key_file = x509.get('key_file') 104 self.cert_file = x509.get('cert_file') 105 self.addheaders = [('User-agent', self.version)] 106 self.__tempfiles = [] 107 self.__unlink = os.unlink # See cleanup() 108 self.tempcache = None 109 # Undocumented feature: if you assign {} to tempcache, 110 # it is used to cache files retrieved with 111 # self.retrieve(). This is not enabled by default 112 # since it does not work for changing documents (and I 113 # haven't got the logic to check expiration headers 114 # yet). 115 self.ftpcache = ftpcache 116 # Undocumented feature: you can use a different 117 # ftp cache by assigning to the .ftpcache member; 118 # in case you want logically independent URL openers 119 # XXX This is not threadsafe. Bah. 120 121 def __del__(self): 122 self.close() 123 124 def close(self): 125 self.cleanup() 126 127 def cleanup(self): 128 # This code sometimes runs when the rest of this module 129 # has already been deleted, so it can't use any globals 130 # or import anything. 131 if self.__tempfiles: 132 for file in self.__tempfiles: 133 try: 134 self.__unlink(file) 135 except: 136 pass 137 del self.__tempfiles[:] 138 if self.tempcache: 139 self.tempcache.clear() 140 141 def addheader(self, *args): 142 """Add a header to be used by the HTTP interface only 143 e.g. u.addheader('Accept', 'sound/basic')""" 144 self.addheaders.append(args) 145 146 # External interface 147 def open(self, fullurl, data=None): 148 """Use URLopener().open(file) instead of open(file, 'r').""" 149 fullurl = unwrap(toBytes(fullurl)) 150 if self.tempcache and self.tempcache.has_key(fullurl): 151 filename, headers = self.tempcache[fullurl] 152 fp = open(filename, 'rb') 153 return addinfourl(fp, headers, fullurl) 154 urltype, url = splittype(fullurl) 155 if not urltype: 156 urltype = 'file' 157 if self.proxies.has_key(urltype): 158 proxy = self.proxies[urltype] 159 urltype, proxyhost = splittype(proxy) 160 host, selector = splithost(proxyhost) 161 url = (host, fullurl) # Signal special case to open_*() 162 else: 163 proxy = None 164 name = 'open_' + urltype 165 self.type = urltype 166 if '-' in name: 167 # replace - with _ 168 name = '_'.join(name.split('-')) 169 if not hasattr(self, name): 170 if proxy: 171 return self.open_unknown_proxy(proxy, fullurl, data) 172 else: 173 return self.open_unknown(fullurl, data) 174 try: 175 if data is None: 176 return getattr(self, name)(url) 177 else: 178 return getattr(self, name)(url, data) 179 except socket.error, msg: 180 raise IOError, ('socket error', msg), sys.exc_info()[2] 181 182 def open_unknown(self, fullurl, data=None): 183 """Overridable interface to open unknown URL type.""" 184 type, url = splittype(fullurl) 185 raise IOError, ('url error', 'unknown url type', type) 186 187 def open_unknown_proxy(self, proxy, fullurl, data=None): 188 """Overridable interface to open unknown URL type.""" 189 type, url = splittype(fullurl) 190 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy) 191 192 # External interface 193 def retrieve(self, url, filename=None, reporthook=None, data=None): 194 """retrieve(url) returns (filename, None) for a local object 195 or (tempfilename, headers) for a remote object.""" 196 url = unwrap(toBytes(url)) 197 if self.tempcache and self.tempcache.has_key(url): 198 return self.tempcache[url] 199 type, url1 = splittype(url) 200 if not filename and (not type or type == 'file'): 201 try: 202 fp = self.open_local_file(url1) 203 hdrs = fp.info() 204 del fp 205 return url2pathname(splithost(url1)[1]), hdrs 206 except IOError, msg: 207 pass 208 fp = self.open(url, data) 209 headers = fp.info() 210 if not filename: 211 import tempfile 212 garbage, path = splittype(url) 213 garbage, path = splithost(path or "") 214 path, garbage = splitquery(path or "") 215 path, garbage = splitattr(path or "") 216 suffix = os.path.splitext(path)[1] 217 filename = tempfile.mktemp(suffix) 218 self.__tempfiles.append(filename) 219 result = filename, headers 220 if self.tempcache is not None: 221 self.tempcache[url] = result 222 tfp = open(filename, 'wb') 223 bs = 1024*8 224 size = -1 225 blocknum = 1 226 if reporthook: 227 if headers.has_key("content-length"): 228 size = int(headers["Content-Length"]) 229 reporthook(0, bs, size) 230 block = fp.read(bs) 231 if reporthook: 232 reporthook(1, bs, size) 233 while block: 234 tfp.write(block) 235 block = fp.read(bs) 236 blocknum = blocknum + 1 237 if reporthook: 238 reporthook(blocknum, bs, size) 239 fp.close() 240 tfp.close() 241 del fp 242 del tfp 243 return result 244 245 # Each method named open_<type> knows how to open that type of URL 246 247 def open_http(self, url, data=None): 248 """Use HTTP protocol.""" 249 import httplib 250 user_passwd = None 251 if type(url) is types.StringType: 252 host, selector = splithost(url) 253 if host: 254 user_passwd, host = splituser(host) 255 host = unquote(host) 256 realhost = host 257 else: 258 host, selector = url 259 urltype, rest = splittype(selector) 260 url = rest 261 user_passwd = None 262 if urltype.lower() != 'http': 263 realhost = None 264 else: 265 realhost, rest = splithost(rest) 266 if realhost: 267 user_passwd, realhost = splituser(realhost) 268 if user_passwd: 269 selector = "%s://%s%s" % (urltype, realhost, rest) 270 #print "proxy via http:", host, selector 271 if not host: raise IOError, ('http error', 'no host given') 272 if user_passwd: 273 import base64 274 auth = base64.encodestring(user_passwd).strip() 275 else: 276 auth = None 277 h = httplib.HTTP(host) 278 if data is not None: 279 h.putrequest('POST', selector) 280 h.putheader('Content-type', 'application/x-www-form-urlencoded') 281 h.putheader('Content-length', '%d' % len(data)) 282 else: 283 h.putrequest('GET', selector) 284 if auth: h.putheader('Authorization', 'Basic %s' % auth) 285 if realhost: h.putheader('Host', realhost) 286 for args in self.addheaders: apply(h.putheader, args) 287 h.endheaders() 288 if data is not None: 289 h.send(data) 290 errcode, errmsg, headers = h.getreply() 291 fp = h.getfile() 292 if errcode == 200: 293 return addinfourl(fp, headers, "http:" + url) 294 else: 295 if data is None: 296 return self.http_error(url, fp, errcode, errmsg, headers) 297 else: 298 return self.http_error(url, fp, errcode, errmsg, headers, data) 299 300 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 301 """Handle http errors. 302 Derived class can override this, or provide specific handlers 303 named http_error_DDD where DDD is the 3-digit error code.""" 304 # First check if there's a specific handler for this error 305 name = 'http_error_%d' % errcode 306 if hasattr(self, name): 307 method = getattr(self, name) 308 if data is None: 309 result = method(url, fp, errcode, errmsg, headers) 310 else: 311 result = method(url, fp, errcode, errmsg, headers, data) 312 if result: return result 313 return self.http_error_default(url, fp, errcode, errmsg, headers) 314 315 def http_error_default(self, url, fp, errcode, errmsg, headers): 316 """Default error handler: close the connection and raise IOError.""" 317 void = fp.read() 318 fp.close() 319 raise IOError, ('http error', errcode, errmsg, headers) 320 321 if hasattr(socket, "ssl"): 322 def open_https(self, url, data=None): 323 """Use HTTPS protocol.""" 324 import httplib 325 user_passwd = None 326 if type(url) is types.StringType: 327 host, selector = splithost(url) 328 if host: 329 user_passwd, host = splituser(host) 330 host = unquote(host) 331 realhost = host 332 else: 333 host, selector = url 334 urltype, rest = splittype(selector) 335 url = rest 336 user_passwd = None 337 if urltype.lower() != 'https': 338 realhost = None 339 else: 340 realhost, rest = splithost(rest) 341 if realhost: 342 user_passwd, realhost = splituser(realhost) 343 if user_passwd: 344 selector = "%s://%s%s" % (urltype, realhost, rest) 345 #print "proxy via https:", host, selector 346 if not host: raise IOError, ('https error', 'no host given') 347 if user_passwd: 348 import base64 349 auth = base64.encodestring(user_passwd).strip() 350 else: 351 auth = None 352 h = httplib.HTTPS(host, 0, 353 key_file=self.key_file, 354 cert_file=self.cert_file) 355 if data is not None: 356 h.putrequest('POST', selector) 357 h.putheader('Content-type', 358 'application/x-www-form-urlencoded') 359 h.putheader('Content-length', '%d' % len(data)) 360 else: 361 h.putrequest('GET', selector) 362 if auth: h.putheader('Authorization: Basic %s' % auth) 363 if realhost: h.putheader('Host', realhost) 364 for args in self.addheaders: apply(h.putheader, args) 365 h.endheaders() 366 if data is not None: 367 h.send(data) 368 errcode, errmsg, headers = h.getreply() 369 fp = h.getfile() 370 if errcode == 200: 371 return addinfourl(fp, headers, url) 372 else: 373 if data is None: 374 return self.http_error(url, fp, errcode, errmsg, headers) 375 else: 376 return self.http_error(url, fp, errcode, errmsg, headers, 377 data) 378 379 def open_gopher(self, url): 380 """Use Gopher protocol.""" 381 import gopherlib 382 host, selector = splithost(url) 383 if not host: raise IOError, ('gopher error', 'no host given') 384 host = unquote(host) 385 type, selector = splitgophertype(selector) 386 selector, query = splitquery(selector) 387 selector = unquote(selector) 388 if query: 389 query = unquote(query) 390 fp = gopherlib.send_query(selector, query, host) 391 else: 392 fp = gopherlib.send_selector(selector, host) 393 return addinfourl(fp, noheaders(), "gopher:" + url) 394 395 def open_file(self, url): 396 """Use local file or FTP depending on form of URL.""" 397 if url[:2] == '//' and url[2:3] != '/': 398 return self.open_ftp(url) 399 else: 400 return self.open_local_file(url) 401 402 def open_local_file(self, url): 403 """Use local file.""" 404 import mimetypes, mimetools, StringIO 405 mtype = mimetypes.guess_type(url)[0] 406 headers = mimetools.Message(StringIO.StringIO( 407 'Content-Type: %s\n' % (mtype or 'text/plain'))) 408 host, file = splithost(url) 409 if not host: 410 urlfile = file 411 if file[:1] == '/': 412 urlfile = 'file://' + file 413 return addinfourl(open(url2pathname(file), 'rb'), 414 headers, urlfile) 415 host, port = splitport(host) 416 if not port \ 417 and socket.gethostbyname(host) in (localhost(), thishost()): 418 urlfile = file 419 if file[:1] == '/': 420 urlfile = 'file://' + file 421 return addinfourl(open(url2pathname(file), 'rb'), 422 headers, urlfile) 423 raise IOError, ('local file error', 'not on local host') 424 425 def open_ftp(self, url): 426 """Use FTP protocol.""" 427 host, path = splithost(url) 428 if not host: raise IOError, ('ftp error', 'no host given') 429 host, port = splitport(host) 430 user, host = splituser(host) 431 if user: user, passwd = splitpasswd(user) 432 else: passwd = None 433 host = unquote(host) 434 user = unquote(user or '') 435 passwd = unquote(passwd or '') 436 host = socket.gethostbyname(host) 437 if not port: 438 import ftplib 439 port = ftplib.FTP_PORT 440 else: 441 port = int(port) 442 path, attrs = splitattr(path) 443 path = unquote(path) 444 dirs = path.split('/') 445 dirs, file = dirs[:-1], dirs[-1] 446 if dirs and not dirs[0]: dirs = dirs[1:] 447 if dirs and not dirs[0]: dirs[0] = '/' 448 key = user, host, port, '/'.join(dirs) 449 # XXX thread unsafe! 450 if len(self.ftpcache) > MAXFTPCACHE: 451 # Prune the cache, rather arbitrarily 452 for k in self.ftpcache.keys(): 453 if k != key: 454 v = self.ftpcache[k] 455 del self.ftpcache[k] 456 v.close() 457 try: 458 if not self.ftpcache.has_key(key): 459 self.ftpcache[key] = \ 460 ftpwrapper(user, passwd, host, port, dirs) 461 if not file: type = 'D' 462 else: type = 'I' 463 for attr in attrs: 464 attr, value = splitvalue(attr) 465 if attr.lower() == 'type' and \ 466 value in ('a', 'A', 'i', 'I', 'd', 'D'): 467 type = value.upper() 468 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 469 if retrlen is not None and retrlen >= 0: 470 import mimetools, StringIO 471 headers = mimetools.Message(StringIO.StringIO( 472 'Content-Length: %d\n' % retrlen)) 473 else: 474 headers = noheaders() 475 return addinfourl(fp, headers, "ftp:" + url) 476 except ftperrors(), msg: 477 raise IOError, ('ftp error', msg), sys.exc_info()[2] 478 479 def open_data(self, url, data=None): 480 """Use "data" URL.""" 481 # ignore POSTed data 482 # 483 # syntax of data URLs: 484 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 485 # mediatype := [ type "/" subtype ] *( ";" parameter ) 486 # data := *urlchar 487 # parameter := attribute "=" value 488 import StringIO, mimetools, time 489 try: 490 [type, data] = url.split(',', 1) 491 except ValueError: 492 raise IOError, ('data error', 'bad data URL') 493 if not type: 494 type = 'text/plain;charset=US-ASCII' 495 semi = type.rfind(';') 496 if semi >= 0 and '=' not in type[semi:]: 497 encoding = type[semi+1:] 498 type = type[:semi] 499 else: 500 encoding = '' 501 msg = [] 502 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', 503 time.gmtime(time.time()))) 504 msg.append('Content-type: %s' % type) 505 if encoding == 'base64': 506 import base64 507 data = base64.decodestring(data) 508 else: 509 data = unquote(data) 510 msg.append('Content-length: %d' % len(data)) 511 msg.append('') 512 msg.append(data) 513 msg = '\n'.join(msg) 514 f = StringIO.StringIO(msg) 515 headers = mimetools.Message(f, 0) 516 f.fileno = None # needed for addinfourl 517 return addinfourl(f, headers, url) 518 519 520class FancyURLopener(URLopener): 521 """Derived class with handlers for errors we can handle (perhaps).""" 522 523 def __init__(self, *args): 524 apply(URLopener.__init__, (self,) + args) 525 self.auth_cache = {} 526 self.tries = 0 527 self.maxtries = 10 528 529 def http_error_default(self, url, fp, errcode, errmsg, headers): 530 """Default error handling -- don't raise an exception.""" 531 return addinfourl(fp, headers, "http:" + url) 532 533 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 534 """Error 302 -- relocated (temporarily).""" 535 self.tries += 1 536 if self.maxtries and self.tries >= self.maxtries: 537 if hasattr(self, "http_error_500"): 538 meth = self.http_error_500 539 else: 540 meth = self.http_error_default 541 self.tries = 0 542 return meth(url, fp, 500, 543 "Internal Server Error: Redirect Recursion", headers) 544 result = self.redirect_internal(url, fp, errcode, errmsg, headers, 545 data) 546 self.tries = 0 547 return result 548 549 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 550 if headers.has_key('location'): 551 newurl = headers['location'] 552 elif headers.has_key('uri'): 553 newurl = headers['uri'] 554 else: 555 return 556 void = fp.read() 557 fp.close() 558 # In case the server sent a relative URL, join with original: 559 newurl = basejoin(self.type + ":" + url, newurl) 560 if data is None: 561 return self.open(newurl) 562 else: 563 return self.open(newurl, data) 564 565 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 566 """Error 301 -- also relocated (permanently).""" 567 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 568 569 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): 570 """Error 401 -- authentication required. 571 See this URL for a description of the basic authentication scheme: 572 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt""" 573 if not headers.has_key('www-authenticate'): 574 URLopener.http_error_default(self, url, fp, 575 errmsg, headers) 576 stuff = headers['www-authenticate'] 577 import re 578 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 579 if not match: 580 URLopener.http_error_default(self, url, fp, 581 errcode, errmsg, headers) 582 scheme, realm = match.groups() 583 if scheme.lower() != 'basic': 584 URLopener.http_error_default(self, url, fp, 585 errcode, errmsg, headers) 586 name = 'retry_' + self.type + '_basic_auth' 587 if data is None: 588 return getattr(self,name)(url, realm) 589 else: 590 return getattr(self,name)(url, realm, data) 591 592 def retry_http_basic_auth(self, url, realm, data=None): 593 host, selector = splithost(url) 594 i = host.find('@') + 1 595 host = host[i:] 596 user, passwd = self.get_user_passwd(host, realm, i) 597 if not (user or passwd): return None 598 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 599 newurl = 'http://' + host + selector 600 if data is None: 601 return self.open(newurl) 602 else: 603 return self.open(newurl, data) 604 605 def retry_https_basic_auth(self, url, realm, data=None): 606 host, selector = splithost(url) 607 i = host.find('@') + 1 608 host = host[i:] 609 user, passwd = self.get_user_passwd(host, realm, i) 610 if not (user or passwd): return None 611 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 612 newurl = '//' + host + selector 613 return self.open_https(newurl, data) 614 615 def get_user_passwd(self, host, realm, clear_cache = 0): 616 key = realm + '@' + host.lower() 617 if self.auth_cache.has_key(key): 618 if clear_cache: 619 del self.auth_cache[key] 620 else: 621 return self.auth_cache[key] 622 user, passwd = self.prompt_user_passwd(host, realm) 623 if user or passwd: self.auth_cache[key] = (user, passwd) 624 return user, passwd 625 626 def prompt_user_passwd(self, host, realm): 627 """Override this in a GUI environment!""" 628 import getpass 629 try: 630 user = raw_input("Enter username for %s at %s: " % (realm, 631 host)) 632 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 633 (user, realm, host)) 634 return user, passwd 635 except KeyboardInterrupt: 636 print 637 return None, None 638 639 640# Utility functions 641 642_localhost = None 643def localhost(): 644 """Return the IP address of the magic hostname 'localhost'.""" 645 global _localhost 646 if not _localhost: 647 _localhost = socket.gethostbyname('localhost') 648 return _localhost 649 650_thishost = None 651def thishost(): 652 """Return the IP address of the current host.""" 653 global _thishost 654 if not _thishost: 655 _thishost = socket.gethostbyname(socket.gethostname()) 656 return _thishost 657 658_ftperrors = None 659def ftperrors(): 660 """Return the set of errors raised by the FTP class.""" 661 global _ftperrors 662 if not _ftperrors: 663 import ftplib 664 _ftperrors = ftplib.all_errors 665 return _ftperrors 666 667_noheaders = None 668def noheaders(): 669 """Return an empty mimetools.Message object.""" 670 global _noheaders 671 if not _noheaders: 672 import mimetools 673 import StringIO 674 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 675 _noheaders.fp.close() # Recycle file descriptor 676 return _noheaders 677 678 679# Utility classes 680 681class ftpwrapper: 682 """Class used by open_ftp() for cache of open FTP connections.""" 683 684 def __init__(self, user, passwd, host, port, dirs): 685 self.user = user 686 self.passwd = passwd 687 self.host = host 688 self.port = port 689 self.dirs = dirs 690 self.init() 691 692 def init(self): 693 import ftplib 694 self.busy = 0 695 self.ftp = ftplib.FTP() 696 self.ftp.connect(self.host, self.port) 697 self.ftp.login(self.user, self.passwd) 698 for dir in self.dirs: 699 self.ftp.cwd(dir) 700 701 def retrfile(self, file, type): 702 import ftplib 703 self.endtransfer() 704 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 705 else: cmd = 'TYPE ' + type; isdir = 0 706 try: 707 self.ftp.voidcmd(cmd) 708 except ftplib.all_errors: 709 self.init() 710 self.ftp.voidcmd(cmd) 711 conn = None 712 if file and not isdir: 713 # Use nlst to see if the file exists at all 714 try: 715 self.ftp.nlst(file) 716 except ftplib.error_perm, reason: 717 raise IOError, ('ftp error', reason), sys.exc_info()[2] 718 # Restore the transfer mode! 719 self.ftp.voidcmd(cmd) 720 # Try to retrieve as a file 721 try: 722 cmd = 'RETR ' + file 723 conn = self.ftp.ntransfercmd(cmd) 724 except ftplib.error_perm, reason: 725 if str(reason)[:3] != '550': 726 raise IOError, ('ftp error', reason), sys.exc_info()[2] 727 if not conn: 728 # Set transfer mode to ASCII! 729 self.ftp.voidcmd('TYPE A') 730 # Try a directory listing 731 if file: cmd = 'LIST ' + file 732 else: cmd = 'LIST' 733 conn = self.ftp.ntransfercmd(cmd) 734 self.busy = 1 735 # Pass back both a suitably decorated object and a retrieval length 736 return (addclosehook(conn[0].makefile('rb'), 737 self.endtransfer), conn[1]) 738 def endtransfer(self): 739 if not self.busy: 740 return 741 self.busy = 0 742 try: 743 self.ftp.voidresp() 744 except ftperrors(): 745 pass 746 747 def close(self): 748 self.endtransfer() 749 try: 750 self.ftp.close() 751 except ftperrors(): 752 pass 753 754class addbase: 755 """Base class for addinfo and addclosehook.""" 756 757 def __init__(self, fp): 758 self.fp = fp 759 self.read = self.fp.read 760 self.readline = self.fp.readline 761 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines 762 if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno 763 764 def __repr__(self): 765 return '<%s at %s whose fp = %s>' % (self.__class__.__name__, 766 `id(self)`, `self.fp`) 767 768 def close(self): 769 self.read = None 770 self.readline = None 771 self.readlines = None 772 self.fileno = None 773 if self.fp: self.fp.close() 774 self.fp = None 775 776class addclosehook(addbase): 777 """Class to add a close hook to an open file.""" 778 779 def __init__(self, fp, closehook, *hookargs): 780 addbase.__init__(self, fp) 781 self.closehook = closehook 782 self.hookargs = hookargs 783 784 def close(self): 785 addbase.close(self) 786 if self.closehook: 787 apply(self.closehook, self.hookargs) 788 self.closehook = None 789 self.hookargs = None 790 791class addinfo(addbase): 792 """class to add an info() method to an open file.""" 793 794 def __init__(self, fp, headers): 795 addbase.__init__(self, fp) 796 self.headers = headers 797 798 def info(self): 799 return self.headers 800 801class addinfourl(addbase): 802 """class to add info() and geturl() methods to an open file.""" 803 804 def __init__(self, fp, headers, url): 805 addbase.__init__(self, fp) 806 self.headers = headers 807 self.url = url 808 809 def info(self): 810 return self.headers 811 812 def geturl(self): 813 return self.url 814 815 816def basejoin(base, url): 817 """Utility to combine a URL with a base URL to form a new URL.""" 818 type, path = splittype(url) 819 if type: 820 # if url is complete (i.e., it contains a type), return it 821 return url 822 host, path = splithost(path) 823 type, basepath = splittype(base) # inherit type from base 824 if host: 825 # if url contains host, just inherit type 826 if type: return type + '://' + host + path 827 else: 828 # no type inherited, so url must have started with // 829 # just return it 830 return url 831 host, basepath = splithost(basepath) # inherit host 832 basepath, basetag = splittag(basepath) # remove extraneous cruft 833 basepath, basequery = splitquery(basepath) # idem 834 if path[:1] != '/': 835 # non-absolute path name 836 if path[:1] in ('#', '?'): 837 # path is just a tag or query, attach to basepath 838 i = len(basepath) 839 else: 840 # else replace last component 841 i = basepath.rfind('/') 842 if i < 0: 843 # basepath not absolute 844 if host: 845 # host present, make absolute 846 basepath = '/' 847 else: 848 # else keep non-absolute 849 basepath = '' 850 else: 851 # remove last file component 852 basepath = basepath[:i+1] 853 # Interpret ../ (important because of symlinks) 854 while basepath and path[:3] == '../': 855 path = path[3:] 856 i = basepath[:-1].rfind('/') 857 if i > 0: 858 basepath = basepath[:i+1] 859 elif i == 0: 860 basepath = '/' 861 break 862 else: 863 basepath = '' 864 865 path = basepath + path 866 if host and path and path[0] != '/': 867 path = '/' + path 868 if type and host: return type + '://' + host + path 869 elif type: return type + ':' + path 870 elif host: return '//' + host + path # don't know what this means 871 else: return path 872 873 874# Utilities to parse URLs (most of these return None for missing parts): 875# unwrap('<URL:type://host/path>') --> 'type://host/path' 876# splittype('type:opaquestring') --> 'type', 'opaquestring' 877# splithost('//host[:port]/path') --> 'host[:port]', '/path' 878# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' 879# splitpasswd('user:passwd') -> 'user', 'passwd' 880# splitport('host:port') --> 'host', 'port' 881# splitquery('/path?query') --> '/path', 'query' 882# splittag('/path#tag') --> '/path', 'tag' 883# splitattr('/path;attr1=value1;attr2=value2;...') -> 884# '/path', ['attr1=value1', 'attr2=value2', ...] 885# splitvalue('attr=value') --> 'attr', 'value' 886# splitgophertype('/Xselector') --> 'X', 'selector' 887# unquote('abc%20def') -> 'abc def' 888# quote('abc def') -> 'abc%20def') 889 890def toBytes(url): 891 """toBytes(u"URL") --> 'URL'.""" 892 # Most URL schemes require ASCII. If that changes, the conversion 893 # can be relaxed 894 if type(url) is types.UnicodeType: 895 try: 896 url = url.encode("ASCII") 897 except UnicodeError: 898 raise UnicodeError("URL " + repr(url) + 899 " contains non-ASCII characters") 900 return url 901 902def unwrap(url): 903 """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" 904 url = url.strip() 905 if url[:1] == '<' and url[-1:] == '>': 906 url = url[1:-1].strip() 907 if url[:4] == 'URL:': url = url[4:].strip() 908 return url 909 910_typeprog = None 911def splittype(url): 912 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 913 global _typeprog 914 if _typeprog is None: 915 import re 916 _typeprog = re.compile('^([^/:]+):') 917 918 match = _typeprog.match(url) 919 if match: 920 scheme = match.group(1) 921 return scheme.lower(), url[len(scheme) + 1:] 922 return None, url 923 924_hostprog = None 925def splithost(url): 926 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 927 global _hostprog 928 if _hostprog is None: 929 import re 930 _hostprog = re.compile('^//([^/]*)(.*)$') 931 932 match = _hostprog.match(url) 933 if match: return match.group(1, 2) 934 return None, url 935 936_userprog = None 937def splituser(host): 938 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 939 global _userprog 940 if _userprog is None: 941 import re 942 _userprog = re.compile('^([^@]*)@(.*)$') 943 944 match = _userprog.match(host) 945 if match: return map(unquote, match.group(1, 2)) 946 return None, host 947 948_passwdprog = None 949def splitpasswd(user): 950 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 951 global _passwdprog 952 if _passwdprog is None: 953 import re 954 _passwdprog = re.compile('^([^:]*):(.*)$') 955 956 match = _passwdprog.match(user) 957 if match: return match.group(1, 2) 958 return user, None 959 960# splittag('/path#tag') --> '/path', 'tag' 961_portprog = None 962def splitport(host): 963 """splitport('host:port') --> 'host', 'port'.""" 964 global _portprog 965 if _portprog is None: 966 import re 967 _portprog = re.compile('^(.*):([0-9]+)$') 968 969 match = _portprog.match(host) 970 if match: return match.group(1, 2) 971 return host, None 972 973_nportprog = None 974def splitnport(host, defport=-1): 975 """Split host and port, returning numeric port. 976 Return given default port if no ':' found; defaults to -1. 977 Return numerical port if a valid number are found after ':'. 978 Return None if ':' but not a valid number.""" 979 global _nportprog 980 if _nportprog is None: 981 import re 982 _nportprog = re.compile('^(.*):(.*)$') 983 984 match = _nportprog.match(host) 985 if match: 986 host, port = match.group(1, 2) 987 try: 988 if not port: raise ValueError, "no digits" 989 nport = int(port) 990 except ValueError: 991 nport = None 992 return host, nport 993 return host, defport 994 995_queryprog = None 996def splitquery(url): 997 """splitquery('/path?query') --> '/path', 'query'.""" 998 global _queryprog 999 if _queryprog is None: 1000 import re 1001 _queryprog = re.compile('^(.*)\?([^?]*)$') 1002 1003 match = _queryprog.match(url) 1004 if match: return match.group(1, 2) 1005 return url, None 1006 1007_tagprog = None 1008def splittag(url): 1009 """splittag('/path#tag') --> '/path', 'tag'.""" 1010 global _tagprog 1011 if _tagprog is None: 1012 import re 1013 _tagprog = re.compile('^(.*)#([^#]*)$') 1014 1015 match = _tagprog.match(url) 1016 if match: return match.group(1, 2) 1017 return url, None 1018 1019def splitattr(url): 1020 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1021 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1022 words = url.split(';') 1023 return words[0], words[1:] 1024 1025_valueprog = None 1026def splitvalue(attr): 1027 """splitvalue('attr=value') --> 'attr', 'value'.""" 1028 global _valueprog 1029 if _valueprog is None: 1030 import re 1031 _valueprog = re.compile('^([^=]*)=(.*)$') 1032 1033 match = _valueprog.match(attr) 1034 if match: return match.group(1, 2) 1035 return attr, None 1036 1037def splitgophertype(selector): 1038 """splitgophertype('/Xselector') --> 'X', 'selector'.""" 1039 if selector[:1] == '/' and selector[1:2]: 1040 return selector[1], selector[2:] 1041 return None, selector 1042 1043def unquote(s): 1044 """unquote('abc%20def') -> 'abc def'.""" 1045 mychr = chr 1046 myatoi = int 1047 list = s.split('%') 1048 res = [list[0]] 1049 myappend = res.append 1050 del list[0] 1051 for item in list: 1052 if item[1:2]: 1053 try: 1054 myappend(mychr(myatoi(item[:2], 16)) 1055 + item[2:]) 1056 except: 1057 myappend('%' + item) 1058 else: 1059 myappend('%' + item) 1060 return "".join(res) 1061 1062def unquote_plus(s): 1063 """unquote('%7e/abc+def') -> '~/abc def'""" 1064 if '+' in s: 1065 # replace '+' with ' ' 1066 s = ' '.join(s.split('+')) 1067 return unquote(s) 1068 1069always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 1070 'abcdefghijklmnopqrstuvwxyz' 1071 '0123456789' '_.-') 1072 1073_fast_safe_test = always_safe + '/' 1074_fast_safe = None 1075 1076def _fast_quote(s): 1077 global _fast_safe 1078 if _fast_safe is None: 1079 _fast_safe = {} 1080 for c in _fast_safe_test: 1081 _fast_safe[c] = c 1082 res = list(s) 1083 for i in range(len(res)): 1084 c = res[i] 1085 if not _fast_safe.has_key(c): 1086 res[i] = '%%%02X' % ord(c) 1087 return ''.join(res) 1088 1089def quote(s, safe = '/'): 1090 """quote('abc def') -> 'abc%20def' 1091 1092 Each part of a URL, e.g. the path info, the query, etc., has a 1093 different set of reserved characters that must be quoted. 1094 1095 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists 1096 the following reserved characters. 1097 1098 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 1099 "$" | "," 1100 1101 Each of these characters is reserved in some component of a URL, 1102 but not necessarily in all of them. 1103 1104 By default, the quote function is intended for quoting the path 1105 section of a URL. Thus, it will not encode '/'. This character 1106 is reserved, but in typical usage the quote function is being 1107 called on a path where the existing slash characters are used as 1108 reserved characters. 1109 """ 1110 safe = always_safe + safe 1111 if _fast_safe_test == safe: 1112 return _fast_quote(s) 1113 res = list(s) 1114 for i in range(len(res)): 1115 c = res[i] 1116 if c not in safe: 1117 res[i] = '%%%02X' % ord(c) 1118 return ''.join(res) 1119 1120def quote_plus(s, safe = ''): 1121 """Quote the query fragment of a URL; replacing ' ' with '+'""" 1122 if ' ' in s: 1123 l = s.split(' ') 1124 for i in range(len(l)): 1125 l[i] = quote(l[i], safe) 1126 return '+'.join(l) 1127 else: 1128 return quote(s, safe) 1129 1130def urlencode(query,doseq=0): 1131 """Encode a sequence of two-element tuples or dictionary into a URL query string. 1132 1133 If any values in the query arg are sequences and doseq is true, each 1134 sequence element is converted to a separate parameter. 1135 1136 If the query arg is a sequence of two-element tuples, the order of the 1137 parameters in the output will match the order of parameters in the 1138 input. 1139 """ 1140 1141 if hasattr(query,"items"): 1142 # mapping objects 1143 query = query.items() 1144 else: 1145 # it's a bother at times that strings and string-like objects are 1146 # sequences... 1147 try: 1148 # non-sequence items should not work with len() 1149 x = len(query) 1150 # non-empty strings will fail this 1151 if len(query) and type(query[0]) != types.TupleType: 1152 raise TypeError 1153 # zero-length sequences of all types will get here and succeed, 1154 # but that's a minor nit - since the original implementation 1155 # allowed empty dicts that type of behavior probably should be 1156 # preserved for consistency 1157 except TypeError: 1158 ty,va,tb = sys.exc_info() 1159 raise TypeError, "not a valid non-string sequence or mapping object", tb 1160 1161 l = [] 1162 if not doseq: 1163 # preserve old behavior 1164 for k, v in query: 1165 k = quote_plus(str(k)) 1166 v = quote_plus(str(v)) 1167 l.append(k + '=' + v) 1168 else: 1169 for k, v in query: 1170 k = quote_plus(str(k)) 1171 if type(v) == types.StringType: 1172 v = quote_plus(v) 1173 l.append(k + '=' + v) 1174 elif type(v) == types.UnicodeType: 1175 # is there a reasonable way to convert to ASCII? 1176 # encode generates a string, but "replace" or "ignore" 1177 # lose information and "strict" can raise UnicodeError 1178 v = quote_plus(v.encode("ASCII","replace")) 1179 l.append(k + '=' + v) 1180 else: 1181 try: 1182 # is this a sufficient test for sequence-ness? 1183 x = len(v) 1184 except TypeError: 1185 # not a sequence 1186 v = quote_plus(str(v)) 1187 l.append(k + '=' + v) 1188 else: 1189 # loop over the sequence 1190 for elt in v: 1191 l.append(k + '=' + quote_plus(str(elt))) 1192 return '&'.join(l) 1193 1194# Proxy handling 1195def getproxies_environment(): 1196 """Return a dictionary of scheme -> proxy server URL mappings. 1197 1198 Scan the environment for variables named <scheme>_proxy; 1199 this seems to be the standard convention. If you need a 1200 different way, you can pass a proxies dictionary to the 1201 [Fancy]URLopener constructor. 1202 1203 """ 1204 proxies = {} 1205 for name, value in os.environ.items(): 1206 name = name.lower() 1207 if value and name[-6:] == '_proxy': 1208 proxies[name[:-6]] = value 1209 return proxies 1210 1211if os.name == 'mac': 1212 def getproxies(): 1213 """Return a dictionary of scheme -> proxy server URL mappings. 1214 1215 By convention the mac uses Internet Config to store 1216 proxies. An HTTP proxy, for instance, is stored under 1217 the HttpProxy key. 1218 1219 """ 1220 try: 1221 import ic 1222 except ImportError: 1223 return {} 1224 1225 try: 1226 config = ic.IC() 1227 except ic.error: 1228 return {} 1229 proxies = {} 1230 # HTTP: 1231 if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']: 1232 try: 1233 value = config['HTTPProxyHost'] 1234 except ic.error: 1235 pass 1236 else: 1237 proxies['http'] = 'http://%s' % value 1238 # FTP: XXXX To be done. 1239 # Gopher: XXXX To be done. 1240 return proxies 1241 1242elif os.name == 'nt': 1243 def getproxies_registry(): 1244 """Return a dictionary of scheme -> proxy server URL mappings. 1245 1246 Win32 uses the registry to store proxies. 1247 1248 """ 1249 proxies = {} 1250 try: 1251 import _winreg 1252 except ImportError: 1253 # Std module, so should be around - but you never know! 1254 return proxies 1255 try: 1256 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1257 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1258 proxyEnable = _winreg.QueryValueEx(internetSettings, 1259 'ProxyEnable')[0] 1260 if proxyEnable: 1261 # Returned as Unicode but problems if not converted to ASCII 1262 proxyServer = str(_winreg.QueryValueEx(internetSettings, 1263 'ProxyServer')[0]) 1264 if '=' in proxyServer: 1265 # Per-protocol settings 1266 for p in proxyServer.split(';'): 1267 protocol, address = p.split('=', 1) 1268 proxies[protocol] = '%s://%s' % (protocol, address) 1269 else: 1270 # Use one setting for all protocols 1271 if proxyServer[:5] == 'http:': 1272 proxies['http'] = proxyServer 1273 else: 1274 proxies['http'] = 'http://%s' % proxyServer 1275 proxies['ftp'] = 'ftp://%s' % proxyServer 1276 internetSettings.Close() 1277 except (WindowsError, ValueError, TypeError): 1278 # Either registry key not found etc, or the value in an 1279 # unexpected format. 1280 # proxies already set up to be empty so nothing to do 1281 pass 1282 return proxies 1283 1284 def getproxies(): 1285 """Return a dictionary of scheme -> proxy server URL mappings. 1286 1287 Returns settings gathered from the environment, if specified, 1288 or the registry. 1289 1290 """ 1291 return getproxies_environment() or getproxies_registry() 1292else: 1293 # By default use environment variables 1294 getproxies = getproxies_environment 1295 1296 1297# Test and time quote() and unquote() 1298def test1(): 1299 import time 1300 s = '' 1301 for i in range(256): s = s + chr(i) 1302 s = s*4 1303 t0 = time.time() 1304 qs = quote(s) 1305 uqs = unquote(qs) 1306 t1 = time.time() 1307 if uqs != s: 1308 print 'Wrong!' 1309 print `s` 1310 print `qs` 1311 print `uqs` 1312 print round(t1 - t0, 3), 'sec' 1313 1314 1315def reporthook(blocknum, blocksize, totalsize): 1316 # Report during remote transfers 1317 print "Block number: %d, Block size: %d, Total size: %d" % ( 1318 blocknum, blocksize, totalsize) 1319 1320# Test program 1321def test(args=[]): 1322 if not args: 1323 args = [ 1324 '/etc/passwd', 1325 'file:/etc/passwd', 1326 'file://localhost/etc/passwd', 1327 'ftp://ftp.python.org/etc/passwd', 1328## 'gopher://gopher.micro.umn.edu/1/', 1329 'http://www.python.org/index.html', 1330 ] 1331 if hasattr(URLopener, "open_https"): 1332 args.append('https://synergy.as.cmu.edu/~geek/') 1333 try: 1334 for url in args: 1335 print '-'*10, url, '-'*10 1336 fn, h = urlretrieve(url, None, reporthook) 1337 print fn 1338 if h: 1339 print '======' 1340 for k in h.keys(): print k + ':', h[k] 1341 print '======' 1342 fp = open(fn, 'rb') 1343 data = fp.read() 1344 del fp 1345 if '\r' in data: 1346 table = string.maketrans("", "") 1347 data = data.translate(table, "\r") 1348 print data 1349 fn, h = None, None 1350 print '-'*40 1351 finally: 1352 urlcleanup() 1353 1354def main(): 1355 import getopt, sys 1356 try: 1357 opts, args = getopt.getopt(sys.argv[1:], "th") 1358 except getopt.error, msg: 1359 print msg 1360 print "Use -h for help" 1361 return 1362 t = 0 1363 for o, a in opts: 1364 if o == '-t': 1365 t = t + 1 1366 if o == '-h': 1367 print "Usage: python urllib.py [-t] [url ...]" 1368 print "-t runs self-test;", 1369 print "otherwise, contents of urls are printed" 1370 return 1371 if t: 1372 if t > 1: 1373 test1() 1374 test(args) 1375 else: 1376 if not args: 1377 print "Use -h for help" 1378 for url in args: 1379 print urlopen(url).read(), 1380 1381# Run test program when run as a script 1382if __name__ == '__main__': 1383 main() 1384