# urllib.py revision 49985638fa33230fdf1ef95613d918fe5e385f5e
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that is has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""

import string
import socket
import os
import sys
import types

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url"]

__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems: pick a platform-specific translation
# between URL paths and OS paths.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    # On plain posix the path component maps 1:1; only percent
    # (un)escaping is needed.
    def url2pathname(pathname):
        """Convert a URL path component to an OS path (unquote it)."""
        return unquote(pathname)
    def pathname2url(pathname):
        """Convert an OS path to a URL path component (quote it)."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
55# (2) a set of functions for parsing URLs 56# XXX Should these be separated out into different modules? 57 58 59# Shortcut for basic usage 60_urlopener = None 61def urlopen(url, data=None): 62 """urlopen(url [, data]) -> open file-like object""" 63 global _urlopener 64 if not _urlopener: 65 _urlopener = FancyURLopener() 66 if data is None: 67 return _urlopener.open(url) 68 else: 69 return _urlopener.open(url, data) 70def urlretrieve(url, filename=None, reporthook=None, data=None): 71 global _urlopener 72 if not _urlopener: 73 _urlopener = FancyURLopener() 74 return _urlopener.retrieve(url, filename, reporthook, data) 75def urlcleanup(): 76 if _urlopener: 77 _urlopener.cleanup() 78 79 80ftpcache = {} 81class URLopener: 82 """Class to open URLs. 83 This is a class rather than just a subroutine because we may need 84 more than one set of global protocol-specific options. 85 Note -- this is a base class for those who don't want the 86 automatic handling of errors type 302 (relocated) and 401 87 (authorization needed).""" 88 89 __tempfiles = None 90 91 version = "Python-urllib/%s" % __version__ 92 93 # Constructor 94 def __init__(self, proxies=None, **x509): 95 if proxies is None: 96 proxies = getproxies() 97 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 98 self.proxies = proxies 99 self.key_file = x509.get('key_file') 100 self.cert_file = x509.get('cert_file') 101 self.addheaders = [('User-agent', self.version)] 102 self.__tempfiles = [] 103 self.__unlink = os.unlink # See cleanup() 104 self.tempcache = None 105 # Undocumented feature: if you assign {} to tempcache, 106 # it is used to cache files retrieved with 107 # self.retrieve(). This is not enabled by default 108 # since it does not work for changing documents (and I 109 # haven't got the logic to check expiration headers 110 # yet). 
111 self.ftpcache = ftpcache 112 # Undocumented feature: you can use a different 113 # ftp cache by assigning to the .ftpcache member; 114 # in case you want logically independent URL openers 115 # XXX This is not threadsafe. Bah. 116 117 def __del__(self): 118 self.close() 119 120 def close(self): 121 self.cleanup() 122 123 def cleanup(self): 124 # This code sometimes runs when the rest of this module 125 # has already been deleted, so it can't use any globals 126 # or import anything. 127 if self.__tempfiles: 128 for file in self.__tempfiles: 129 try: 130 self.__unlink(file) 131 except: 132 pass 133 del self.__tempfiles[:] 134 if self.tempcache: 135 self.tempcache.clear() 136 137 def addheader(self, *args): 138 """Add a header to be used by the HTTP interface only 139 e.g. u.addheader('Accept', 'sound/basic')""" 140 self.addheaders.append(args) 141 142 # External interface 143 def open(self, fullurl, data=None): 144 """Use URLopener().open(file) instead of open(file, 'r').""" 145 fullurl = unwrap(toBytes(fullurl)) 146 if self.tempcache and self.tempcache.has_key(fullurl): 147 filename, headers = self.tempcache[fullurl] 148 fp = open(filename, 'rb') 149 return addinfourl(fp, headers, fullurl) 150 urltype, url = splittype(fullurl) 151 if not urltype: 152 urltype = 'file' 153 if self.proxies.has_key(urltype): 154 proxy = self.proxies[urltype] 155 urltype, proxyhost = splittype(proxy) 156 host, selector = splithost(proxyhost) 157 url = (host, fullurl) # Signal special case to open_*() 158 else: 159 proxy = None 160 name = 'open_' + urltype 161 self.type = urltype 162 if '-' in name: 163 # replace - with _ 164 name = '_'.join(name.split('-')) 165 if not hasattr(self, name): 166 if proxy: 167 return self.open_unknown_proxy(proxy, fullurl, data) 168 else: 169 return self.open_unknown(fullurl, data) 170 try: 171 if data is None: 172 return getattr(self, name)(url) 173 else: 174 return getattr(self, name)(url, data) 175 except socket.error, msg: 176 raise IOError, 
('socket error', msg), sys.exc_info()[2] 177 178 def open_unknown(self, fullurl, data=None): 179 """Overridable interface to open unknown URL type.""" 180 type, url = splittype(fullurl) 181 raise IOError, ('url error', 'unknown url type', type) 182 183 def open_unknown_proxy(self, proxy, fullurl, data=None): 184 """Overridable interface to open unknown URL type.""" 185 type, url = splittype(fullurl) 186 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy) 187 188 # External interface 189 def retrieve(self, url, filename=None, reporthook=None, data=None): 190 """retrieve(url) returns (filename, None) for a local object 191 or (tempfilename, headers) for a remote object.""" 192 url = unwrap(toBytes(url)) 193 if self.tempcache and self.tempcache.has_key(url): 194 return self.tempcache[url] 195 type, url1 = splittype(url) 196 if not filename and (not type or type == 'file'): 197 try: 198 fp = self.open_local_file(url1) 199 hdrs = fp.info() 200 del fp 201 return url2pathname(splithost(url1)[1]), hdrs 202 except IOError, msg: 203 pass 204 fp = self.open(url, data) 205 headers = fp.info() 206 if not filename: 207 import tempfile 208 garbage, path = splittype(url) 209 garbage, path = splithost(path or "") 210 path, garbage = splitquery(path or "") 211 path, garbage = splitattr(path or "") 212 suffix = os.path.splitext(path)[1] 213 filename = tempfile.mktemp(suffix) 214 self.__tempfiles.append(filename) 215 result = filename, headers 216 if self.tempcache is not None: 217 self.tempcache[url] = result 218 tfp = open(filename, 'wb') 219 bs = 1024*8 220 size = -1 221 blocknum = 1 222 if reporthook: 223 if headers.has_key("content-length"): 224 size = int(headers["Content-Length"]) 225 reporthook(0, bs, size) 226 block = fp.read(bs) 227 if reporthook: 228 reporthook(1, bs, size) 229 while block: 230 tfp.write(block) 231 block = fp.read(bs) 232 blocknum = blocknum + 1 233 if reporthook: 234 reporthook(blocknum, bs, size) 235 fp.close() 236 tfp.close() 237 del fp 
238 del tfp 239 return result 240 241 # Each method named open_<type> knows how to open that type of URL 242 243 def open_http(self, url, data=None): 244 """Use HTTP protocol.""" 245 import httplib 246 user_passwd = None 247 if type(url) is types.StringType: 248 host, selector = splithost(url) 249 if host: 250 user_passwd, host = splituser(host) 251 host = unquote(host) 252 realhost = host 253 else: 254 host, selector = url 255 urltype, rest = splittype(selector) 256 url = rest 257 user_passwd = None 258 if urltype.lower() != 'http': 259 realhost = None 260 else: 261 realhost, rest = splithost(rest) 262 if realhost: 263 user_passwd, realhost = splituser(realhost) 264 if user_passwd: 265 selector = "%s://%s%s" % (urltype, realhost, rest) 266 #print "proxy via http:", host, selector 267 if not host: raise IOError, ('http error', 'no host given') 268 if user_passwd: 269 import base64 270 auth = base64.encodestring(user_passwd).strip() 271 else: 272 auth = None 273 h = httplib.HTTP(host) 274 if data is not None: 275 h.putrequest('POST', selector) 276 h.putheader('Content-type', 'application/x-www-form-urlencoded') 277 h.putheader('Content-length', '%d' % len(data)) 278 else: 279 h.putrequest('GET', selector) 280 if auth: h.putheader('Authorization', 'Basic %s' % auth) 281 if realhost: h.putheader('Host', realhost) 282 for args in self.addheaders: apply(h.putheader, args) 283 h.endheaders() 284 if data is not None: 285 h.send(data + '\r\n') 286 errcode, errmsg, headers = h.getreply() 287 fp = h.getfile() 288 if errcode == 200: 289 return addinfourl(fp, headers, "http:" + url) 290 else: 291 if data is None: 292 return self.http_error(url, fp, errcode, errmsg, headers) 293 else: 294 return self.http_error(url, fp, errcode, errmsg, headers, data) 295 296 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 297 """Handle http errors. 
298 Derived class can override this, or provide specific handlers 299 named http_error_DDD where DDD is the 3-digit error code.""" 300 # First check if there's a specific handler for this error 301 name = 'http_error_%d' % errcode 302 if hasattr(self, name): 303 method = getattr(self, name) 304 if data is None: 305 result = method(url, fp, errcode, errmsg, headers) 306 else: 307 result = method(url, fp, errcode, errmsg, headers, data) 308 if result: return result 309 return self.http_error_default(url, fp, errcode, errmsg, headers) 310 311 def http_error_default(self, url, fp, errcode, errmsg, headers): 312 """Default error handler: close the connection and raise IOError.""" 313 void = fp.read() 314 fp.close() 315 raise IOError, ('http error', errcode, errmsg, headers) 316 317 if hasattr(socket, "ssl"): 318 def open_https(self, url, data=None): 319 """Use HTTPS protocol.""" 320 import httplib 321 user_passwd = None 322 if type(url) is types.StringType: 323 host, selector = splithost(url) 324 if host: 325 user_passwd, host = splituser(host) 326 host = unquote(host) 327 realhost = host 328 else: 329 host, selector = url 330 urltype, rest = splittype(selector) 331 url = rest 332 user_passwd = None 333 if urltype.lower() != 'https': 334 realhost = None 335 else: 336 realhost, rest = splithost(rest) 337 if realhost: 338 user_passwd, realhost = splituser(realhost) 339 if user_passwd: 340 selector = "%s://%s%s" % (urltype, realhost, rest) 341 #print "proxy via https:", host, selector 342 if not host: raise IOError, ('https error', 'no host given') 343 if user_passwd: 344 import base64 345 auth = base64.encodestring(user_passwd).strip() 346 else: 347 auth = None 348 h = httplib.HTTPS(host, 0, 349 key_file=self.key_file, 350 cert_file=self.cert_file) 351 if data is not None: 352 h.putrequest('POST', selector) 353 h.putheader('Content-type', 354 'application/x-www-form-urlencoded') 355 h.putheader('Content-length', '%d' % len(data)) 356 else: 357 h.putrequest('GET', 
selector) 358 if auth: h.putheader('Authorization: Basic %s' % auth) 359 if realhost: h.putheader('Host', realhost) 360 for args in self.addheaders: apply(h.putheader, args) 361 h.endheaders() 362 if data is not None: 363 h.send(data + '\r\n') 364 errcode, errmsg, headers = h.getreply() 365 fp = h.getfile() 366 if errcode == 200: 367 return addinfourl(fp, headers, url) 368 else: 369 if data is None: 370 return self.http_error(url, fp, errcode, errmsg, headers) 371 else: 372 return self.http_error(url, fp, errcode, errmsg, headers, 373 data) 374 375 def open_gopher(self, url): 376 """Use Gopher protocol.""" 377 import gopherlib 378 host, selector = splithost(url) 379 if not host: raise IOError, ('gopher error', 'no host given') 380 host = unquote(host) 381 type, selector = splitgophertype(selector) 382 selector, query = splitquery(selector) 383 selector = unquote(selector) 384 if query: 385 query = unquote(query) 386 fp = gopherlib.send_query(selector, query, host) 387 else: 388 fp = gopherlib.send_selector(selector, host) 389 return addinfourl(fp, noheaders(), "gopher:" + url) 390 391 def open_file(self, url): 392 """Use local file or FTP depending on form of URL.""" 393 if url[:2] == '//' and url[2:3] != '/': 394 return self.open_ftp(url) 395 else: 396 return self.open_local_file(url) 397 398 def open_local_file(self, url): 399 """Use local file.""" 400 import mimetypes, mimetools, StringIO 401 mtype = mimetypes.guess_type(url)[0] 402 headers = mimetools.Message(StringIO.StringIO( 403 'Content-Type: %s\n' % (mtype or 'text/plain'))) 404 host, file = splithost(url) 405 if not host: 406 urlfile = file 407 if file[:1] == '/': 408 urlfile = 'file://' + file 409 return addinfourl(open(url2pathname(file), 'rb'), 410 headers, urlfile) 411 host, port = splitport(host) 412 if not port \ 413 and socket.gethostbyname(host) in (localhost(), thishost()): 414 urlfile = file 415 if file[:1] == '/': 416 urlfile = 'file://' + file 417 return addinfourl(open(url2pathname(file), 
'rb'), 418 headers, urlfile) 419 raise IOError, ('local file error', 'not on local host') 420 421 def open_ftp(self, url): 422 """Use FTP protocol.""" 423 host, path = splithost(url) 424 if not host: raise IOError, ('ftp error', 'no host given') 425 host, port = splitport(host) 426 user, host = splituser(host) 427 if user: user, passwd = splitpasswd(user) 428 else: passwd = None 429 host = unquote(host) 430 user = unquote(user or '') 431 passwd = unquote(passwd or '') 432 host = socket.gethostbyname(host) 433 if not port: 434 import ftplib 435 port = ftplib.FTP_PORT 436 else: 437 port = int(port) 438 path, attrs = splitattr(path) 439 path = unquote(path) 440 dirs = path.split('/') 441 dirs, file = dirs[:-1], dirs[-1] 442 if dirs and not dirs[0]: dirs = dirs[1:] 443 if dirs and not dirs[0]: dirs[0] = '/' 444 key = user, host, port, '/'.join(dirs) 445 # XXX thread unsafe! 446 if len(self.ftpcache) > MAXFTPCACHE: 447 # Prune the cache, rather arbitrarily 448 for k in self.ftpcache.keys(): 449 if k != key: 450 v = self.ftpcache[k] 451 del self.ftpcache[k] 452 v.close() 453 try: 454 if not self.ftpcache.has_key(key): 455 self.ftpcache[key] = \ 456 ftpwrapper(user, passwd, host, port, dirs) 457 if not file: type = 'D' 458 else: type = 'I' 459 for attr in attrs: 460 attr, value = splitvalue(attr) 461 if attr.lower() == 'type' and \ 462 value in ('a', 'A', 'i', 'I', 'd', 'D'): 463 type = value.upper() 464 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 465 if retrlen is not None and retrlen >= 0: 466 import mimetools, StringIO 467 headers = mimetools.Message(StringIO.StringIO( 468 'Content-Length: %d\n' % retrlen)) 469 else: 470 headers = noheaders() 471 return addinfourl(fp, headers, "ftp:" + url) 472 except ftperrors(), msg: 473 raise IOError, ('ftp error', msg), sys.exc_info()[2] 474 475 def open_data(self, url, data=None): 476 """Use "data" URL.""" 477 # ignore POSTed data 478 # 479 # syntax of data URLs: 480 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," 
data 481 # mediatype := [ type "/" subtype ] *( ";" parameter ) 482 # data := *urlchar 483 # parameter := attribute "=" value 484 import StringIO, mimetools, time 485 try: 486 [type, data] = url.split(',', 1) 487 except ValueError: 488 raise IOError, ('data error', 'bad data URL') 489 if not type: 490 type = 'text/plain;charset=US-ASCII' 491 semi = type.rfind(';') 492 if semi >= 0 and '=' not in type[semi:]: 493 encoding = type[semi+1:] 494 type = type[:semi] 495 else: 496 encoding = '' 497 msg = [] 498 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', 499 time.gmtime(time.time()))) 500 msg.append('Content-type: %s' % type) 501 if encoding == 'base64': 502 import base64 503 data = base64.decodestring(data) 504 else: 505 data = unquote(data) 506 msg.append('Content-length: %d' % len(data)) 507 msg.append('') 508 msg.append(data) 509 msg = '\n'.join(msg) 510 f = StringIO.StringIO(msg) 511 headers = mimetools.Message(f, 0) 512 f.fileno = None # needed for addinfourl 513 return addinfourl(f, headers, url) 514 515 516class FancyURLopener(URLopener): 517 """Derived class with handlers for errors we can handle (perhaps).""" 518 519 def __init__(self, *args): 520 apply(URLopener.__init__, (self,) + args) 521 self.auth_cache = {} 522 self.tries = 0 523 self.maxtries = 10 524 525 def http_error_default(self, url, fp, errcode, errmsg, headers): 526 """Default error handling -- don't raise an exception.""" 527 return addinfourl(fp, headers, "http:" + url) 528 529 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 530 """Error 302 -- relocated (temporarily).""" 531 self.tries += 1 532 if self.maxtries and self.tries >= self.maxtries: 533 if hasattr(self, "http_error_500"): 534 meth = self.http_error_500 535 else: 536 meth = self.http_error_default 537 self.tries = 0 538 return meth(url, fp, 500, 539 "Internal Server Error: Redirect Recursion", headers) 540 result = self.redirect_internal(url, fp, errcode, errmsg, headers, 541 data) 542 self.tries = 
0 543 return result 544 545 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 546 if headers.has_key('location'): 547 newurl = headers['location'] 548 elif headers.has_key('uri'): 549 newurl = headers['uri'] 550 else: 551 return 552 void = fp.read() 553 fp.close() 554 # In case the server sent a relative URL, join with original: 555 newurl = basejoin("http:" + url, newurl) 556 if data is None: 557 return self.open(newurl) 558 else: 559 return self.open(newurl, data) 560 561 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 562 """Error 301 -- also relocated (permanently).""" 563 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 564 565 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): 566 """Error 401 -- authentication required. 567 See this URL for a description of the basic authentication scheme: 568 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt""" 569 if not headers.has_key('www-authenticate'): 570 URLopener.http_error_default(self, url, fp, 571 errmsg, headers) 572 stuff = headers['www-authenticate'] 573 import re 574 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 575 if not match: 576 URLopener.http_error_default(self, url, fp, 577 errcode, errmsg, headers) 578 scheme, realm = match.groups() 579 if scheme.lower() != 'basic': 580 URLopener.http_error_default(self, url, fp, 581 errcode, errmsg, headers) 582 name = 'retry_' + self.type + '_basic_auth' 583 if data is None: 584 return getattr(self,name)(url, realm) 585 else: 586 return getattr(self,name)(url, realm, data) 587 588 def retry_http_basic_auth(self, url, realm, data=None): 589 host, selector = splithost(url) 590 i = host.find('@') + 1 591 host = host[i:] 592 user, passwd = self.get_user_passwd(host, realm, i) 593 if not (user or passwd): return None 594 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 595 newurl = 'http://' + host + selector 596 if data is 
None: 597 return self.open(newurl) 598 else: 599 return self.open(newurl, data) 600 601 def retry_https_basic_auth(self, url, realm, data=None): 602 host, selector = splithost(url) 603 i = host.find('@') + 1 604 host = host[i:] 605 user, passwd = self.get_user_passwd(host, realm, i) 606 if not (user or passwd): return None 607 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 608 newurl = '//' + host + selector 609 return self.open_https(newurl, data) 610 611 def get_user_passwd(self, host, realm, clear_cache = 0): 612 key = realm + '@' + host.lower() 613 if self.auth_cache.has_key(key): 614 if clear_cache: 615 del self.auth_cache[key] 616 else: 617 return self.auth_cache[key] 618 user, passwd = self.prompt_user_passwd(host, realm) 619 if user or passwd: self.auth_cache[key] = (user, passwd) 620 return user, passwd 621 622 def prompt_user_passwd(self, host, realm): 623 """Override this in a GUI environment!""" 624 import getpass 625 try: 626 user = raw_input("Enter username for %s at %s: " % (realm, 627 host)) 628 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 629 (user, realm, host)) 630 return user, passwd 631 except KeyboardInterrupt: 632 print 633 return None, None 634 635 636# Utility functions 637 638_localhost = None 639def localhost(): 640 """Return the IP address of the magic hostname 'localhost'.""" 641 global _localhost 642 if not _localhost: 643 _localhost = socket.gethostbyname('localhost') 644 return _localhost 645 646_thishost = None 647def thishost(): 648 """Return the IP address of the current host.""" 649 global _thishost 650 if not _thishost: 651 _thishost = socket.gethostbyname(socket.gethostname()) 652 return _thishost 653 654_ftperrors = None 655def ftperrors(): 656 """Return the set of errors raised by the FTP class.""" 657 global _ftperrors 658 if not _ftperrors: 659 import ftplib 660 _ftperrors = ftplib.all_errors 661 return _ftperrors 662 663_noheaders = None 664def noheaders(): 665 """Return an 
empty mimetools.Message object.""" 666 global _noheaders 667 if not _noheaders: 668 import mimetools 669 import StringIO 670 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 671 _noheaders.fp.close() # Recycle file descriptor 672 return _noheaders 673 674 675# Utility classes 676 677class ftpwrapper: 678 """Class used by open_ftp() for cache of open FTP connections.""" 679 680 def __init__(self, user, passwd, host, port, dirs): 681 self.user = user 682 self.passwd = passwd 683 self.host = host 684 self.port = port 685 self.dirs = dirs 686 self.init() 687 688 def init(self): 689 import ftplib 690 self.busy = 0 691 self.ftp = ftplib.FTP() 692 self.ftp.connect(self.host, self.port) 693 self.ftp.login(self.user, self.passwd) 694 for dir in self.dirs: 695 self.ftp.cwd(dir) 696 697 def retrfile(self, file, type): 698 import ftplib 699 self.endtransfer() 700 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 701 else: cmd = 'TYPE ' + type; isdir = 0 702 try: 703 self.ftp.voidcmd(cmd) 704 except ftplib.all_errors: 705 self.init() 706 self.ftp.voidcmd(cmd) 707 conn = None 708 if file and not isdir: 709 # Use nlst to see if the file exists at all 710 try: 711 self.ftp.nlst(file) 712 except ftplib.error_perm, reason: 713 raise IOError, ('ftp error', reason), sys.exc_info()[2] 714 # Restore the transfer mode! 715 self.ftp.voidcmd(cmd) 716 # Try to retrieve as a file 717 try: 718 cmd = 'RETR ' + file 719 conn = self.ftp.ntransfercmd(cmd) 720 except ftplib.error_perm, reason: 721 if str(reason)[:3] != '550': 722 raise IOError, ('ftp error', reason), sys.exc_info()[2] 723 if not conn: 724 # Set transfer mode to ASCII! 
725 self.ftp.voidcmd('TYPE A') 726 # Try a directory listing 727 if file: cmd = 'LIST ' + file 728 else: cmd = 'LIST' 729 conn = self.ftp.ntransfercmd(cmd) 730 self.busy = 1 731 # Pass back both a suitably decorated object and a retrieval length 732 return (addclosehook(conn[0].makefile('rb'), 733 self.endtransfer), conn[1]) 734 def endtransfer(self): 735 if not self.busy: 736 return 737 self.busy = 0 738 try: 739 self.ftp.voidresp() 740 except ftperrors(): 741 pass 742 743 def close(self): 744 self.endtransfer() 745 try: 746 self.ftp.close() 747 except ftperrors(): 748 pass 749 750class addbase: 751 """Base class for addinfo and addclosehook.""" 752 753 def __init__(self, fp): 754 self.fp = fp 755 self.read = self.fp.read 756 self.readline = self.fp.readline 757 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines 758 if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno 759 760 def __repr__(self): 761 return '<%s at %s whose fp = %s>' % (self.__class__.__name__, 762 `id(self)`, `self.fp`) 763 764 def close(self): 765 self.read = None 766 self.readline = None 767 self.readlines = None 768 self.fileno = None 769 if self.fp: self.fp.close() 770 self.fp = None 771 772class addclosehook(addbase): 773 """Class to add a close hook to an open file.""" 774 775 def __init__(self, fp, closehook, *hookargs): 776 addbase.__init__(self, fp) 777 self.closehook = closehook 778 self.hookargs = hookargs 779 780 def close(self): 781 addbase.close(self) 782 if self.closehook: 783 apply(self.closehook, self.hookargs) 784 self.closehook = None 785 self.hookargs = None 786 787class addinfo(addbase): 788 """class to add an info() method to an open file.""" 789 790 def __init__(self, fp, headers): 791 addbase.__init__(self, fp) 792 self.headers = headers 793 794 def info(self): 795 return self.headers 796 797class addinfourl(addbase): 798 """class to add info() and geturl() methods to an open file.""" 799 800 def __init__(self, fp, headers, url): 801 
addbase.__init__(self, fp) 802 self.headers = headers 803 self.url = url 804 805 def info(self): 806 return self.headers 807 808 def geturl(self): 809 return self.url 810 811 812def basejoin(base, url): 813 """Utility to combine a URL with a base URL to form a new URL.""" 814 type, path = splittype(url) 815 if type: 816 # if url is complete (i.e., it contains a type), return it 817 return url 818 host, path = splithost(path) 819 type, basepath = splittype(base) # inherit type from base 820 if host: 821 # if url contains host, just inherit type 822 if type: return type + '://' + host + path 823 else: 824 # no type inherited, so url must have started with // 825 # just return it 826 return url 827 host, basepath = splithost(basepath) # inherit host 828 basepath, basetag = splittag(basepath) # remove extraneous cruft 829 basepath, basequery = splitquery(basepath) # idem 830 if path[:1] != '/': 831 # non-absolute path name 832 if path[:1] in ('#', '?'): 833 # path is just a tag or query, attach to basepath 834 i = len(basepath) 835 else: 836 # else replace last component 837 i = basepath.rfind('/') 838 if i < 0: 839 # basepath not absolute 840 if host: 841 # host present, make absolute 842 basepath = '/' 843 else: 844 # else keep non-absolute 845 basepath = '' 846 else: 847 # remove last file component 848 basepath = basepath[:i+1] 849 # Interpret ../ (important because of symlinks) 850 while basepath and path[:3] == '../': 851 path = path[3:] 852 i = basepath[:-1].rfind('/') 853 if i > 0: 854 basepath = basepath[:i+1] 855 elif i == 0: 856 basepath = '/' 857 break 858 else: 859 basepath = '' 860 861 path = basepath + path 862 if type and host: return type + '://' + host + path 863 elif type: return type + ':' + path 864 elif host: return '//' + host + path # don't know what this means 865 else: return path 866 867 868# Utilities to parse URLs (most of these return None for missing parts): 869# unwrap('<URL:type://host/path>') --> 'type://host/path' 870# 
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def')

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    if type(url) is types.UnicodeType:
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    # Peel off optional <...> wrapper, then an optional URL: prefix.
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    # Scheme names are case-insensitive; normalize to lower case.
    return scheme.lower(), url[len(scheme) + 1:]

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found:
        return found.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found:
        # Both halves come back percent-decoded.
        return [unquote(part) for part in found.group(1, 2)]
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found:
        return found.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found:
        return found.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number are found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    hostname, port = found.group(1, 2)
    try:
        if not port:
            raise ValueError("no digits")
        nport = int(port)
    except ValueError:
        nport = None
    return hostname, nport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found:
        return found.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found:
        return found.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    pieces = url.split(';')
    return pieces[0], pieces[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found:
        return found.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    pieces = s.split('%')
    out = [pieces[0]]
    for piece in pieces[1:]:
        # Each piece started with a '%'; the first two characters
        # should be a hex byte.  Anything malformed is kept verbatim.
        if piece[1:2]:
            try:
                out.append(chr(int(piece[:2], 16)) + piece[2:])
            except:
                out.append('%' + piece)
        else:
            out.append('%' + piece)
    return "".join(out)

def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    # Quote against the default safe set using a precomputed lookup table.
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for ch in _fast_safe_test:
            _fast_safe[ch] = ch
    chars = list(s)
    for i in range(len(chars)):
        ch = chars[i]
        if ch not in _fast_safe:
            chars[i] = '%%%02X' % ord(ch)
    return ''.join(chars)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    # The common case (default safe set) takes the table-driven fast path.
    if _fast_safe_test == safe:
        return _fast_quote(s)
    chars = list(s)
    for i in range(len(chars)):
        ch = chars[i]
        if ch not in safe:
            chars[i] = '%%%02X' % ord(ch)
    return ''.join(chars)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        words = s.split(' ')
        return '+'.join([quote(w, safe) for w in words])
    return quote(s, safe)
1141 try: 1142 # non-sequence items should not work with len() 1143 x = len(query) 1144 # non-empty strings will fail this 1145 if len(query) and type(query[0]) != types.TupleType: 1146 raise TypeError 1147 # zero-length sequences of all types will get here and succeed, 1148 # but that's a minor nit - since the original implementation 1149 # allowed empty dicts that type of behavior probably should be 1150 # preserved for consistency 1151 except TypeError: 1152 ty,va,tb = sys.exc_info() 1153 raise TypeError, "not a valid non-string sequence or mapping object", tb 1154 1155 l = [] 1156 if not doseq: 1157 # preserve old behavior 1158 for k, v in query: 1159 k = quote_plus(str(k)) 1160 v = quote_plus(str(v)) 1161 l.append(k + '=' + v) 1162 else: 1163 for k, v in query: 1164 k = quote_plus(str(k)) 1165 if type(v) == types.StringType: 1166 v = quote_plus(v) 1167 l.append(k + '=' + v) 1168 elif type(v) == types.UnicodeType: 1169 # is there a reasonable way to convert to ASCII? 1170 # encode generates a string, but "replace" or "ignore" 1171 # lose information and "strict" can raise UnicodeError 1172 v = quote_plus(v.encode("ASCII","replace")) 1173 l.append(k + '=' + v) 1174 else: 1175 try: 1176 # is this a sufficient test for sequence-ness? 1177 x = len(v) 1178 except TypeError: 1179 # not a sequence 1180 v = quote_plus(str(v)) 1181 l.append(k + '=' + v) 1182 else: 1183 # loop over the sequence 1184 for elt in v: 1185 l.append(k + '=' + quote_plus(str(elt))) 1186 return '&'.join(l) 1187 1188# Proxy handling 1189def getproxies_environment(): 1190 """Return a dictionary of scheme -> proxy server URL mappings. 1191 1192 Scan the environment for variables named <scheme>_proxy; 1193 this seems to be the standard convention. If you need a 1194 different way, you can pass a proxies dictionary to the 1195 [Fancy]URLopener constructor. 
1196 1197 """ 1198 proxies = {} 1199 for name, value in os.environ.items(): 1200 name = name.lower() 1201 if value and name[-6:] == '_proxy': 1202 proxies[name[:-6]] = value 1203 return proxies 1204 1205if os.name == 'mac': 1206 def getproxies(): 1207 """Return a dictionary of scheme -> proxy server URL mappings. 1208 1209 By convention the mac uses Internet Config to store 1210 proxies. An HTTP proxy, for instance, is stored under 1211 the HttpProxy key. 1212 1213 """ 1214 try: 1215 import ic 1216 except ImportError: 1217 return {} 1218 1219 try: 1220 config = ic.IC() 1221 except ic.error: 1222 return {} 1223 proxies = {} 1224 # HTTP: 1225 if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']: 1226 try: 1227 value = config['HTTPProxyHost'] 1228 except ic.error: 1229 pass 1230 else: 1231 proxies['http'] = 'http://%s' % value 1232 # FTP: XXXX To be done. 1233 # Gopher: XXXX To be done. 1234 return proxies 1235 1236elif os.name == 'nt': 1237 def getproxies_registry(): 1238 """Return a dictionary of scheme -> proxy server URL mappings. 1239 1240 Win32 uses the registry to store proxies. 1241 1242 """ 1243 proxies = {} 1244 try: 1245 import _winreg 1246 except ImportError: 1247 # Std module, so should be around - but you never know! 
1248 return proxies 1249 try: 1250 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1251 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1252 proxyEnable = _winreg.QueryValueEx(internetSettings, 1253 'ProxyEnable')[0] 1254 if proxyEnable: 1255 # Returned as Unicode but problems if not converted to ASCII 1256 proxyServer = str(_winreg.QueryValueEx(internetSettings, 1257 'ProxyServer')[0]) 1258 if '=' in proxyServer: 1259 # Per-protocol settings 1260 for p in proxyServer.split(';'): 1261 protocol, address = p.split('=', 1) 1262 proxies[protocol] = '%s://%s' % (protocol, address) 1263 else: 1264 # Use one setting for all protocols 1265 if proxyServer[:5] == 'http:': 1266 proxies['http'] = proxyServer 1267 else: 1268 proxies['http'] = 'http://%s' % proxyServer 1269 proxies['ftp'] = 'ftp://%s' % proxyServer 1270 internetSettings.Close() 1271 except (WindowsError, ValueError, TypeError): 1272 # Either registry key not found etc, or the value in an 1273 # unexpected format. 1274 # proxies already set up to be empty so nothing to do 1275 pass 1276 return proxies 1277 1278 def getproxies(): 1279 """Return a dictionary of scheme -> proxy server URL mappings. 1280 1281 Returns settings gathered from the environment, if specified, 1282 or the registry. 1283 1284 """ 1285 return getproxies_environment() or getproxies_registry() 1286else: 1287 # By default use environment variables 1288 getproxies = getproxies_environment 1289 1290 1291# Test and time quote() and unquote() 1292def test1(): 1293 import time 1294 s = '' 1295 for i in range(256): s = s + chr(i) 1296 s = s*4 1297 t0 = time.time() 1298 qs = quote(s) 1299 uqs = unquote(qs) 1300 t1 = time.time() 1301 if uqs != s: 1302 print 'Wrong!' 
1303 print `s` 1304 print `qs` 1305 print `uqs` 1306 print round(t1 - t0, 3), 'sec' 1307 1308 1309def reporthook(blocknum, blocksize, totalsize): 1310 # Report during remote transfers 1311 print "Block number: %d, Block size: %d, Total size: %d" % ( 1312 blocknum, blocksize, totalsize) 1313 1314# Test program 1315def test(args=[]): 1316 if not args: 1317 args = [ 1318 '/etc/passwd', 1319 'file:/etc/passwd', 1320 'file://localhost/etc/passwd', 1321 'ftp://ftp.python.org/etc/passwd', 1322## 'gopher://gopher.micro.umn.edu/1/', 1323 'http://www.python.org/index.html', 1324 ] 1325 if hasattr(URLopener, "open_https"): 1326 args.append('https://synergy.as.cmu.edu/~geek/') 1327 try: 1328 for url in args: 1329 print '-'*10, url, '-'*10 1330 fn, h = urlretrieve(url, None, reporthook) 1331 print fn 1332 if h: 1333 print '======' 1334 for k in h.keys(): print k + ':', h[k] 1335 print '======' 1336 fp = open(fn, 'rb') 1337 data = fp.read() 1338 del fp 1339 if '\r' in data: 1340 table = string.maketrans("", "") 1341 data = data.translate(table, "\r") 1342 print data 1343 fn, h = None, None 1344 print '-'*40 1345 finally: 1346 urlcleanup() 1347 1348def main(): 1349 import getopt, sys 1350 try: 1351 opts, args = getopt.getopt(sys.argv[1:], "th") 1352 except getopt.error, msg: 1353 print msg 1354 print "Use -h for help" 1355 return 1356 t = 0 1357 for o, a in opts: 1358 if o == '-t': 1359 t = t + 1 1360 if o == '-h': 1361 print "Usage: python urllib.py [-t] [url ...]" 1362 print "-t runs self-test;", 1363 print "otherwise, contents of urls are printed" 1364 return 1365 if t: 1366 if t > 1: 1367 test1() 1368 test(args) 1369 else: 1370 if not args: 1371 print "Use -h for help" 1372 for url in args: 1373 print urlopen(url).read(), 1374 1375# Run test program when run as a script 1376if __name__ == '__main__': 1377 main() 1378