urllib.py revision 54f0222547b1e92cd018ef132307a6f793dc9505
1"""Open an arbitrary URL. 2 3See the following document for more info on URLs: 4"Names and Addresses, URIs, URLs, URNs, URCs", at 5http://www.w3.org/pub/WWW/Addressing/Overview.html 6 7See also the HTTP spec (from which the error codes are derived): 8"HTTP - Hypertext Transfer Protocol", at 9http://www.w3.org/pub/WWW/Protocols/ 10 11Related standards and specs: 12- RFC1808: the "relative URL" spec. (authoritative status) 13- RFC1738 - the "URL standard". (authoritative status) 14- RFC1630 - the "URI spec". (informational status) 15 16The object returned by URLopener().open(file) will differ per 17protocol. All you know is that is has methods read(), readline(), 18readlines(), fileno(), close() and info(). The read*(), fileno() 19and close() methods work like those of open files. 20The info() method returns a mimetools.Message object which can be 21used to query various info about the object, if available. 22(mimetools.Message objects are queried with the getheader() method.) 23""" 24 25import string 26import socket 27import os 28import time 29import sys 30import types 31 32__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve", 33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus", 34 "urlencode", "url2pathname", "pathname2url", "splittag", 35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap", 36 "splittype", "splithost", "splituser", "splitpasswd", "splitport", 37 "splitnport", "splitquery", "splitattr", "splitvalue", 38 "splitgophertype", "getproxies"] 39 40__version__ = '1.15' # XXX This version is not always updated :-( 41 42MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 43 44# Helper for non-unix systems 45if os.name == 'mac': 46 from macurl2path import url2pathname, pathname2url 47elif os.name == 'nt': 48 from nturl2path import url2pathname, pathname2url 49elif os.name == 'riscos': 50 from rourl2path import url2pathname, pathname2url 51else: 52 def url2pathname(pathname): 53 return unquote(pathname) 54 def pathname2url(pathname): 55 return quote(pathname) 56 57# This really consists of two pieces: 58# (1) a class which handles opening of all sorts of URLs 59# (plus assorted utilities etc.) 60# (2) a set of functions for parsing URLs 61# XXX Should these be separated out into different modules? 62 63 64# Shortcut for basic usage 65_urlopener = None 66def urlopen(url, data=None, proxies=None): 67 """urlopen(url [, data]) -> open file-like object""" 68 global _urlopener 69 if proxies is not None: 70 opener = FancyURLopener(proxies=proxies) 71 elif not _urlopener: 72 opener = FancyURLopener() 73 _urlopener = opener 74 else: 75 opener = _urlopener 76 if data is None: 77 return opener.open(url) 78 else: 79 return opener.open(url, data) 80def urlretrieve(url, filename=None, reporthook=None, data=None): 81 global _urlopener 82 if not _urlopener: 83 _urlopener = FancyURLopener() 84 return _urlopener.retrieve(url, filename, reporthook, data) 85def urlcleanup(): 86 if _urlopener: 87 _urlopener.cleanup() 88 89 90ftpcache = {} 91class URLopener: 92 """Class to open URLs. 93 This is a class rather than just a subroutine because we may need 94 more than one set of global protocol-specific options. 95 Note -- this is a base class for those who don't want the 96 automatic handling of errors type 302 (relocated) and 401 97 (authorization needed).""" 98 99 __tempfiles = None 100 101 version = "Python-urllib/%s" % __version__ 102 103 # Constructor 104 def __init__(self, proxies=None, **x509): 105 if proxies is None: 106 proxies = getproxies() 107 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 108 self.proxies = proxies 109 self.key_file = x509.get('key_file') 110 self.cert_file = x509.get('cert_file') 111 self.addheaders = [('User-agent', self.version)] 112 self.__tempfiles = [] 113 self.__unlink = os.unlink # See cleanup() 114 self.tempcache = None 115 # Undocumented feature: if you assign {} to tempcache, 116 # it is used to cache files retrieved with 117 # self.retrieve(). This is not enabled by default 118 # since it does not work for changing documents (and I 119 # haven't got the logic to check expiration headers 120 # yet). 121 self.ftpcache = ftpcache 122 # Undocumented feature: you can use a different 123 # ftp cache by assigning to the .ftpcache member; 124 # in case you want logically independent URL openers 125 # XXX This is not threadsafe. Bah. 126 127 def __del__(self): 128 self.close() 129 130 def close(self): 131 self.cleanup() 132 133 def cleanup(self): 134 # This code sometimes runs when the rest of this module 135 # has already been deleted, so it can't use any globals 136 # or import anything. 137 if self.__tempfiles: 138 for file in self.__tempfiles: 139 try: 140 self.__unlink(file) 141 except OSError: 142 pass 143 del self.__tempfiles[:] 144 if self.tempcache: 145 self.tempcache.clear() 146 147 def addheader(self, *args): 148 """Add a header to be used by the HTTP interface only 149 e.g. u.addheader('Accept', 'sound/basic')""" 150 self.addheaders.append(args) 151 152 # External interface 153 def open(self, fullurl, data=None): 154 """Use URLopener().open(file) instead of open(file, 'r').""" 155 fullurl = unwrap(toBytes(fullurl)) 156 if self.tempcache and fullurl in self.tempcache: 157 filename, headers = self.tempcache[fullurl] 158 fp = open(filename, 'rb') 159 return addinfourl(fp, headers, fullurl) 160 urltype, url = splittype(fullurl) 161 if not urltype: 162 urltype = 'file' 163 if urltype in self.proxies: 164 proxy = self.proxies[urltype] 165 urltype, proxyhost = splittype(proxy) 166 host, selector = splithost(proxyhost) 167 url = (host, fullurl) # Signal special case to open_*() 168 else: 169 proxy = None 170 name = 'open_' + urltype 171 self.type = urltype 172 if '-' in name: 173 # replace - with _ 174 name = '_'.join(name.split('-')) 175 if not hasattr(self, name): 176 if proxy: 177 return self.open_unknown_proxy(proxy, fullurl, data) 178 else: 179 return self.open_unknown(fullurl, data) 180 try: 181 if data is None: 182 return getattr(self, name)(url) 183 else: 184 return getattr(self, name)(url, data) 185 except socket.error, msg: 186 raise IOError, ('socket error', msg), sys.exc_info()[2] 187 188 def open_unknown(self, fullurl, data=None): 189 """Overridable interface to open unknown URL type.""" 190 type, url = splittype(fullurl) 191 raise IOError, ('url error', 'unknown url type', type) 192 193 def open_unknown_proxy(self, proxy, fullurl, data=None): 194 """Overridable interface to open unknown URL type.""" 195 type, url = splittype(fullurl) 196 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy) 197 198 # External interface 199 def retrieve(self, url, filename=None, reporthook=None, data=None): 200 """retrieve(url) returns (filename, None) for a local object 201 or (tempfilename, headers) for a remote object.""" 202 url = unwrap(toBytes(url)) 203 if self.tempcache and url in self.tempcache: 204 return self.tempcache[url] 205 type, url1 = splittype(url) 206 if not filename and (not type or type == 'file'): 207 try: 208 fp = self.open_local_file(url1) 209 hdrs = fp.info() 210 del fp 211 return url2pathname(splithost(url1)[1]), hdrs 212 except IOError, msg: 213 pass 214 fp = self.open(url, data) 215 headers = fp.info() 216 if not filename: 217 import tempfile 218 garbage, path = splittype(url) 219 garbage, path = splithost(path or "") 220 path, garbage = splitquery(path or "") 221 path, garbage = splitattr(path or "") 222 suffix = os.path.splitext(path)[1] 223 filename = tempfile.mktemp(suffix) 224 self.__tempfiles.append(filename) 225 result = filename, headers 226 if self.tempcache is not None: 227 self.tempcache[url] = result 228 tfp = open(filename, 'wb') 229 bs = 1024*8 230 size = -1 231 blocknum = 1 232 if reporthook: 233 if "content-length" in headers: 234 size = int(headers["Content-Length"]) 235 reporthook(0, bs, size) 236 block = fp.read(bs) 237 if reporthook: 238 reporthook(1, bs, size) 239 while block: 240 tfp.write(block) 241 block = fp.read(bs) 242 blocknum = blocknum + 1 243 if reporthook: 244 reporthook(blocknum, bs, size) 245 fp.close() 246 tfp.close() 247 del fp 248 del tfp 249 return result 250 251 # Each method named open_<type> knows how to open that type of URL 252 253 def open_http(self, url, data=None): 254 """Use HTTP protocol.""" 255 import httplib 256 user_passwd = None 257 if type(url) is types.StringType: 258 host, selector = splithost(url) 259 if host: 260 user_passwd, host = splituser(host) 261 host = unquote(host) 262 realhost = host 263 else: 264 host, selector = url 265 urltype, rest = splittype(selector) 266 url = rest 267 user_passwd = None 268 if urltype.lower() != 'http': 269 realhost = None 270 else: 271 realhost, rest = splithost(rest) 272 if realhost: 273 user_passwd, realhost = splituser(realhost) 274 if user_passwd: 275 selector = "%s://%s%s" % (urltype, realhost, rest) 276 if proxy_bypass(realhost): 277 host = realhost 278 279 #print "proxy via http:", host, selector 280 if not host: raise IOError, ('http error', 'no host given') 281 if user_passwd: 282 import base64 283 auth = base64.encodestring(user_passwd).strip() 284 else: 285 auth = None 286 h = httplib.HTTP(host) 287 if data is not None: 288 h.putrequest('POST', selector) 289 h.putheader('Content-type', 'application/x-www-form-urlencoded') 290 h.putheader('Content-length', '%d' % len(data)) 291 else: 292 h.putrequest('GET', selector) 293 if auth: h.putheader('Authorization', 'Basic %s' % auth) 294 if realhost: h.putheader('Host', realhost) 295 for args in self.addheaders: apply(h.putheader, args) 296 h.endheaders() 297 if data is not None: 298 h.send(data) 299 errcode, errmsg, headers = h.getreply() 300 fp = h.getfile() 301 if errcode == 200: 302 return addinfourl(fp, headers, "http:" + url) 303 else: 304 if data is None: 305 return self.http_error(url, fp, errcode, errmsg, headers) 306 else: 307 return self.http_error(url, fp, errcode, errmsg, headers, data) 308 309 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 310 """Handle http errors. 311 Derived class can override this, or provide specific handlers 312 named http_error_DDD where DDD is the 3-digit error code.""" 313 # First check if there's a specific handler for this error 314 name = 'http_error_%d' % errcode 315 if hasattr(self, name): 316 method = getattr(self, name) 317 if data is None: 318 result = method(url, fp, errcode, errmsg, headers) 319 else: 320 result = method(url, fp, errcode, errmsg, headers, data) 321 if result: return result 322 return self.http_error_default(url, fp, errcode, errmsg, headers) 323 324 def http_error_default(self, url, fp, errcode, errmsg, headers): 325 """Default error handler: close the connection and raise IOError.""" 326 void = fp.read() 327 fp.close() 328 raise IOError, ('http error', errcode, errmsg, headers) 329 330 if hasattr(socket, "ssl"): 331 def open_https(self, url, data=None): 332 """Use HTTPS protocol.""" 333 import httplib 334 user_passwd = None 335 if type(url) is types.StringType: 336 host, selector = splithost(url) 337 if host: 338 user_passwd, host = splituser(host) 339 host = unquote(host) 340 realhost = host 341 else: 342 host, selector = url 343 urltype, rest = splittype(selector) 344 url = rest 345 user_passwd = None 346 if urltype.lower() != 'https': 347 realhost = None 348 else: 349 realhost, rest = splithost(rest) 350 if realhost: 351 user_passwd, realhost = splituser(realhost) 352 if user_passwd: 353 selector = "%s://%s%s" % (urltype, realhost, rest) 354 #print "proxy via https:", host, selector 355 if not host: raise IOError, ('https error', 'no host given') 356 if user_passwd: 357 import base64 358 auth = base64.encodestring(user_passwd).strip() 359 else: 360 auth = None 361 h = httplib.HTTPS(host, 0, 362 key_file=self.key_file, 363 cert_file=self.cert_file) 364 if data is not None: 365 h.putrequest('POST', selector) 366 h.putheader('Content-type', 367 'application/x-www-form-urlencoded') 368 h.putheader('Content-length', '%d' % len(data)) 369 else: 370 h.putrequest('GET', selector) 371 if auth: h.putheader('Authorization: Basic %s' % auth) 372 if realhost: h.putheader('Host', realhost) 373 for args in self.addheaders: apply(h.putheader, args) 374 h.endheaders() 375 if data is not None: 376 h.send(data) 377 errcode, errmsg, headers = h.getreply() 378 fp = h.getfile() 379 if errcode == 200: 380 return addinfourl(fp, headers, "https:" + url) 381 else: 382 if data is None: 383 return self.http_error(url, fp, errcode, errmsg, headers) 384 else: 385 return self.http_error(url, fp, errcode, errmsg, headers, 386 data) 387 388 def open_gopher(self, url): 389 """Use Gopher protocol.""" 390 import gopherlib 391 host, selector = splithost(url) 392 if not host: raise IOError, ('gopher error', 'no host given') 393 host = unquote(host) 394 type, selector = splitgophertype(selector) 395 selector, query = splitquery(selector) 396 selector = unquote(selector) 397 if query: 398 query = unquote(query) 399 fp = gopherlib.send_query(selector, query, host) 400 else: 401 fp = gopherlib.send_selector(selector, host) 402 return addinfourl(fp, noheaders(), "gopher:" + url) 403 404 def open_file(self, url): 405 """Use local file or FTP depending on form of URL.""" 406 if url[:2] == '//' and url[2:3] != '/': 407 return self.open_ftp(url) 408 else: 409 return self.open_local_file(url) 410 411 def open_local_file(self, url): 412 """Use local file.""" 413 import mimetypes, mimetools, rfc822, StringIO 414 host, file = splithost(url) 415 localname = url2pathname(file) 416 try: 417 stats = os.stat(localname) 418 except OSError, e: 419 raise IOError(e.errno, e.strerror, e.filename) 420 size = stats.st_size 421 modified = rfc822.formatdate(stats.st_mtime) 422 mtype = mimetypes.guess_type(url)[0] 423 headers = mimetools.Message(StringIO.StringIO( 424 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 425 (mtype or 'text/plain', size, modified))) 426 if not host: 427 urlfile = file 428 if file[:1] == '/': 429 urlfile = 'file://' + file 430 return addinfourl(open(localname, 'rb'), 431 headers, urlfile) 432 host, port = splitport(host) 433 if not port \ 434 and socket.gethostbyname(host) in (localhost(), thishost()): 435 urlfile = file 436 if file[:1] == '/': 437 urlfile = 'file://' + file 438 return addinfourl(open(localname, 'rb'), 439 headers, urlfile) 440 raise IOError, ('local file error', 'not on local host') 441 442 def open_ftp(self, url): 443 """Use FTP protocol.""" 444 import mimetypes, mimetools, StringIO 445 host, path = splithost(url) 446 if not host: raise IOError, ('ftp error', 'no host given') 447 host, port = splitport(host) 448 user, host = splituser(host) 449 if user: user, passwd = splitpasswd(user) 450 else: passwd = None 451 host = unquote(host) 452 user = unquote(user or '') 453 passwd = unquote(passwd or '') 454 host = socket.gethostbyname(host) 455 if not port: 456 import ftplib 457 port = ftplib.FTP_PORT 458 else: 459 port = int(port) 460 path, attrs = splitattr(path) 461 path = unquote(path) 462 dirs = path.split('/') 463 dirs, file = dirs[:-1], dirs[-1] 464 if dirs and not dirs[0]: dirs = dirs[1:] 465 if dirs and not dirs[0]: dirs[0] = '/' 466 key = user, host, port, '/'.join(dirs) 467 # XXX thread unsafe! 468 if len(self.ftpcache) > MAXFTPCACHE: 469 # Prune the cache, rather arbitrarily 470 for k in self.ftpcache.keys(): 471 if k != key: 472 v = self.ftpcache[k] 473 del self.ftpcache[k] 474 v.close() 475 try: 476 if not key in self.ftpcache: 477 self.ftpcache[key] = \ 478 ftpwrapper(user, passwd, host, port, dirs) 479 if not file: type = 'D' 480 else: type = 'I' 481 for attr in attrs: 482 attr, value = splitvalue(attr) 483 if attr.lower() == 'type' and \ 484 value in ('a', 'A', 'i', 'I', 'd', 'D'): 485 type = value.upper() 486 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 487 mtype = mimetypes.guess_type("ftp:" + url)[0] 488 headers = "" 489 if mtype: 490 headers += "Content-Type: %s\n" % mtype 491 if retrlen is not None and retrlen >= 0: 492 headers += "Content-Length: %d\n" % retrlen 493 headers = mimetools.Message(StringIO.StringIO(headers)) 494 return addinfourl(fp, headers, "ftp:" + url) 495 except ftperrors(), msg: 496 raise IOError, ('ftp error', msg), sys.exc_info()[2] 497 498 def open_data(self, url, data=None): 499 """Use "data" URL.""" 500 # ignore POSTed data 501 # 502 # syntax of data URLs: 503 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 504 # mediatype := [ type "/" subtype ] *( ";" parameter ) 505 # data := *urlchar 506 # parameter := attribute "=" value 507 import StringIO, mimetools 508 try: 509 [type, data] = url.split(',', 1) 510 except ValueError: 511 raise IOError, ('data error', 'bad data URL') 512 if not type: 513 type = 'text/plain;charset=US-ASCII' 514 semi = type.rfind(';') 515 if semi >= 0 and '=' not in type[semi:]: 516 encoding = type[semi+1:] 517 type = type[:semi] 518 else: 519 encoding = '' 520 msg = [] 521 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', 522 time.gmtime(time.time()))) 523 msg.append('Content-type: %s' % type) 524 if encoding == 'base64': 525 import base64 526 data = base64.decodestring(data) 527 else: 528 data = unquote(data) 529 msg.append('Content-length: %d' % len(data)) 530 msg.append('') 531 msg.append(data) 532 msg = '\n'.join(msg) 533 f = StringIO.StringIO(msg) 534 headers = mimetools.Message(f, 0) 535 f.fileno = None # needed for addinfourl 536 return addinfourl(f, headers, url) 537 538 539class FancyURLopener(URLopener): 540 """Derived class with handlers for errors we can handle (perhaps).""" 541 542 def __init__(self, *args): 543 apply(URLopener.__init__, (self,) + args) 544 self.auth_cache = {} 545 self.tries = 0 546 self.maxtries = 10 547 548 def http_error_default(self, url, fp, errcode, errmsg, headers): 549 """Default error handling -- don't raise an exception.""" 550 return addinfourl(fp, headers, "http:" + url) 551 552 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 553 """Error 302 -- relocated (temporarily).""" 554 self.tries += 1 555 if self.maxtries and self.tries >= self.maxtries: 556 if hasattr(self, "http_error_500"): 557 meth = self.http_error_500 558 else: 559 meth = self.http_error_default 560 self.tries = 0 561 return meth(url, fp, 500, 562 "Internal Server Error: Redirect Recursion", headers) 563 result = self.redirect_internal(url, fp, errcode, errmsg, headers, 564 data) 565 self.tries = 0 566 return result 567 568 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 569 if 'location' in headers: 570 newurl = headers['location'] 571 elif 'uri' in headers: 572 newurl = headers['uri'] 573 else: 574 return 575 void = fp.read() 576 fp.close() 577 # In case the server sent a relative URL, join with original: 578 newurl = basejoin(self.type + ":" + url, newurl) 579 if data is None: 580 return self.open(newurl) 581 else: 582 return self.open(newurl, data) 583 584 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 585 """Error 301 -- also relocated (permanently).""" 586 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 587 588 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): 589 """Error 401 -- authentication required. 590 See this URL for a description of the basic authentication scheme: 591 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt""" 592 if not 'www-authenticate' in headers: 593 URLopener.http_error_default(self, url, fp, 594 errcode, errmsg, headers) 595 stuff = headers['www-authenticate'] 596 import re 597 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 598 if not match: 599 URLopener.http_error_default(self, url, fp, 600 errcode, errmsg, headers) 601 scheme, realm = match.groups() 602 if scheme.lower() != 'basic': 603 URLopener.http_error_default(self, url, fp, 604 errcode, errmsg, headers) 605 name = 'retry_' + self.type + '_basic_auth' 606 if data is None: 607 return getattr(self,name)(url, realm) 608 else: 609 return getattr(self,name)(url, realm, data) 610 611 def retry_http_basic_auth(self, url, realm, data=None): 612 host, selector = splithost(url) 613 i = host.find('@') + 1 614 host = host[i:] 615 user, passwd = self.get_user_passwd(host, realm, i) 616 if not (user or passwd): return None 617 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 618 newurl = 'http://' + host + selector 619 if data is None: 620 return self.open(newurl) 621 else: 622 return self.open(newurl, data) 623 624 def retry_https_basic_auth(self, url, realm, data=None): 625 host, selector = splithost(url) 626 i = host.find('@') + 1 627 host = host[i:] 628 user, passwd = self.get_user_passwd(host, realm, i) 629 if not (user or passwd): return None 630 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 631 newurl = '//' + host + selector 632 return self.open_https(newurl, data) 633 634 def get_user_passwd(self, host, realm, clear_cache = 0): 635 key = realm + '@' + host.lower() 636 if key in self.auth_cache: 637 if clear_cache: 638 del self.auth_cache[key] 639 else: 640 return self.auth_cache[key] 641 user, passwd = self.prompt_user_passwd(host, realm) 642 if user or passwd: self.auth_cache[key] = (user, passwd) 643 return user, passwd 644 645 def prompt_user_passwd(self, host, realm): 646 """Override this in a GUI environment!""" 647 import getpass 648 try: 649 user = raw_input("Enter username for %s at %s: " % (realm, 650 host)) 651 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 652 (user, realm, host)) 653 return user, passwd 654 except KeyboardInterrupt: 655 print 656 return None, None 657 658 659# Utility functions 660 661_localhost = None 662def localhost(): 663 """Return the IP address of the magic hostname 'localhost'.""" 664 global _localhost 665 if not _localhost: 666 _localhost = socket.gethostbyname('localhost') 667 return _localhost 668 669_thishost = None 670def thishost(): 671 """Return the IP address of the current host.""" 672 global _thishost 673 if not _thishost: 674 _thishost = socket.gethostbyname(socket.gethostname()) 675 return _thishost 676 677_ftperrors = None 678def ftperrors(): 679 """Return the set of errors raised by the FTP class.""" 680 global _ftperrors 681 if not _ftperrors: 682 import ftplib 683 _ftperrors = ftplib.all_errors 684 return _ftperrors 685 686_noheaders = None 687def noheaders(): 688 """Return an empty mimetools.Message object.""" 689 global _noheaders 690 if not _noheaders: 691 import mimetools 692 import StringIO 693 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 694 _noheaders.fp.close() # Recycle file descriptor 695 return _noheaders 696 697 698# Utility classes 699 700class ftpwrapper: 701 """Class used by open_ftp() for cache of open FTP connections.""" 702 703 def __init__(self, user, passwd, host, port, dirs): 704 self.user = user 705 self.passwd = passwd 706 self.host = host 707 self.port = port 708 self.dirs = dirs 709 self.init() 710 711 def init(self): 712 import ftplib 713 self.busy = 0 714 self.ftp = ftplib.FTP() 715 self.ftp.connect(self.host, self.port) 716 self.ftp.login(self.user, self.passwd) 717 for dir in self.dirs: 718 self.ftp.cwd(dir) 719 720 def retrfile(self, file, type): 721 import ftplib 722 self.endtransfer() 723 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 724 else: cmd = 'TYPE ' + type; isdir = 0 725 try: 726 self.ftp.voidcmd(cmd) 727 except ftplib.all_errors: 728 self.init() 729 self.ftp.voidcmd(cmd) 730 conn = None 731 if file and not isdir: 732 # Use nlst to see if the file exists at all 733 try: 734 self.ftp.nlst(file) 735 except ftplib.error_perm, reason: 736 raise IOError, ('ftp error', reason), sys.exc_info()[2] 737 # Restore the transfer mode! 738 self.ftp.voidcmd(cmd) 739 # Try to retrieve as a file 740 try: 741 cmd = 'RETR ' + file 742 conn = self.ftp.ntransfercmd(cmd) 743 except ftplib.error_perm, reason: 744 if str(reason)[:3] != '550': 745 raise IOError, ('ftp error', reason), sys.exc_info()[2] 746 if not conn: 747 # Set transfer mode to ASCII! 748 self.ftp.voidcmd('TYPE A') 749 # Try a directory listing 750 if file: cmd = 'LIST ' + file 751 else: cmd = 'LIST' 752 conn = self.ftp.ntransfercmd(cmd) 753 self.busy = 1 754 # Pass back both a suitably decorated object and a retrieval length 755 return (addclosehook(conn[0].makefile('rb'), 756 self.endtransfer), conn[1]) 757 def endtransfer(self): 758 if not self.busy: 759 return 760 self.busy = 0 761 try: 762 self.ftp.voidresp() 763 except ftperrors(): 764 pass 765 766 def close(self): 767 self.endtransfer() 768 try: 769 self.ftp.close() 770 except ftperrors(): 771 pass 772 773class addbase: 774 """Base class for addinfo and addclosehook.""" 775 776 def __init__(self, fp): 777 self.fp = fp 778 self.read = self.fp.read 779 self.readline = self.fp.readline 780 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines 781 if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno 782 783 def __repr__(self): 784 return '<%s at %s whose fp = %s>' % (self.__class__.__name__, 785 `id(self)`, `self.fp`) 786 787 def close(self): 788 self.read = None 789 self.readline = None 790 self.readlines = None 791 self.fileno = None 792 if self.fp: self.fp.close() 793 self.fp = None 794 795class addclosehook(addbase): 796 """Class to add a close hook to an open file.""" 797 798 def __init__(self, fp, closehook, *hookargs): 799 addbase.__init__(self, fp) 800 self.closehook = closehook 801 self.hookargs = hookargs 802 803 def close(self): 804 addbase.close(self) 805 if self.closehook: 806 apply(self.closehook, self.hookargs) 807 self.closehook = None 808 self.hookargs = None 809 810class addinfo(addbase): 811 """class to add an info() method to an open file.""" 812 813 def __init__(self, fp, headers): 814 addbase.__init__(self, fp) 815 self.headers = headers 816 817 def info(self): 818 return self.headers 819 820class addinfourl(addbase): 821 """class to add info() and geturl() methods to an open file.""" 822 823 def __init__(self, fp, headers, url): 824 addbase.__init__(self, fp) 825 self.headers = headers 826 self.url = url 827 828 def info(self): 829 return self.headers 830 831 def geturl(self): 832 return self.url 833 834 835def basejoin(base, url): 836 """Utility to combine a URL with a base URL to form a new URL.""" 837 type, path = splittype(url) 838 if type: 839 # if url is complete (i.e., it contains a type), return it 840 return url 841 host, path = splithost(path) 842 type, basepath = splittype(base) # inherit type from base 843 if host: 844 # if url contains host, just inherit type 845 if type: return type + '://' + host + path 846 else: 847 # no type inherited, so url must have started with // 848 # just return it 849 return url 850 host, basepath = splithost(basepath) # inherit host 851 basepath, basetag = splittag(basepath) # remove extraneous cruft 852 basepath, basequery = splitquery(basepath) # idem 853 if path[:1] != '/': 854 # non-absolute path name 855 if path[:1] in ('#', '?'): 856 # path is just a tag or query, attach to basepath 857 i = len(basepath) 858 else: 859 # else replace last component 860 i = basepath.rfind('/') 861 if i < 0: 862 # basepath not absolute 863 if host: 864 # host present, make absolute 865 basepath = '/' 866 else: 867 # else keep non-absolute 868 basepath = '' 869 else: 870 # remove last file component 871 basepath = basepath[:i+1] 872 # Interpret ../ (important because of symlinks) 873 while basepath and path[:3] == '../': 874 path = path[3:] 875 i = basepath[:-1].rfind('/') 876 if i > 0: 877 basepath = basepath[:i+1] 878 elif i == 0: 879 basepath = '/' 880 break 881 else: 882 basepath = '' 883 884 path = basepath + path 885 if host and path and path[0] != '/': 886 path = '/' + path 887 if type and host: return type + '://' + host + path 888 elif type: return type + ':' + path 889 elif host: return '//' + host + path # don't know what this means 890 else: return path 891 892 893# Utilities to parse URLs (most of these return None for missing parts): 894# unwrap('<URL:type://host/path>') --> 'type://host/path' 895# splittype('type:opaquestring') --> 'type', 'opaquestring' 896# splithost('//host[:port]/path') --> 'host[:port]', '/path' 897# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' 898# splitpasswd('user:passwd') -> 'user', 'passwd' 899# splitport('host:port') --> 'host', 'port' 900# splitquery('/path?query') --> '/path', 'query' 901# splittag('/path#tag') --> '/path', 'tag' 902# splitattr('/path;attr1=value1;attr2=value2;...') -> 903# '/path', ['attr1=value1', 'attr2=value2', ...] 904# splitvalue('attr=value') --> 'attr', 'value' 905# splitgophertype('/Xselector') --> 'X', 'selector' 906# unquote('abc%20def') -> 'abc def' 907# quote('abc def') -> 'abc%20def') 908 909if hasattr(types, "UnicodeType"): 910 def _is_unicode(x): 911 return isinstance(x, unicode) 912else: 913 def _is_unicode(x): 914 return 0 915 916def toBytes(url): 917 """toBytes(u"URL") --> 'URL'.""" 918 # Most URL schemes require ASCII. If that changes, the conversion 919 # can be relaxed 920 if _is_unicode(url): 921 try: 922 url = url.encode("ASCII") 923 except UnicodeError: 924 raise UnicodeError("URL " + repr(url) + 925 " contains non-ASCII characters") 926 return url 927 928def unwrap(url): 929 """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" 930 url = url.strip() 931 if url[:1] == '<' and url[-1:] == '>': 932 url = url[1:-1].strip() 933 if url[:4] == 'URL:': url = url[4:].strip() 934 return url 935 936_typeprog = None 937def splittype(url): 938 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 939 global _typeprog 940 if _typeprog is None: 941 import re 942 _typeprog = re.compile('^([^/:]+):') 943 944 match = _typeprog.match(url) 945 if match: 946 scheme = match.group(1) 947 return scheme.lower(), url[len(scheme) + 1:] 948 return None, url 949 950_hostprog = None 951def splithost(url): 952 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 953 global _hostprog 954 if _hostprog is None: 955 import re 956 _hostprog = re.compile('^//([^/]*)(.*)$') 957 958 match = _hostprog.match(url) 959 if match: return match.group(1, 2) 960 return None, url 961 962_userprog = None 963def splituser(host): 964 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 965 global _userprog 966 if _userprog is None: 967 import re 968 _userprog = re.compile('^([^@]*)@(.*)$') 969 970 match = _userprog.match(host) 971 if match: return map(unquote, match.group(1, 2)) 972 return None, host 973 974_passwdprog = None 975def splitpasswd(user): 976 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 977 global _passwdprog 978 if _passwdprog is None: 979 import re 980 _passwdprog = re.compile('^([^:]*):(.*)$') 981 982 match = _passwdprog.match(user) 983 if match: return match.group(1, 2) 984 return user, None 985 986# splittag('/path#tag') --> '/path', 'tag' 987_portprog = None 988def splitport(host): 989 """splitport('host:port') --> 'host', 'port'.""" 990 global _portprog 991 if _portprog is None: 992 import re 993 _portprog = re.compile('^(.*):([0-9]+)$') 994 995 match = _portprog.match(host) 996 if match: return match.group(1, 2) 997 return host, None 998 999_nportprog = None 1000def splitnport(host, defport=-1): 1001 """Split host and port, returning numeric port. 1002 Return given default port if no ':' found; defaults to -1. 1003 Return numerical port if a valid number are found after ':'. 1004 Return None if ':' but not a valid number.""" 1005 global _nportprog 1006 if _nportprog is None: 1007 import re 1008 _nportprog = re.compile('^(.*):(.*)$') 1009 1010 match = _nportprog.match(host) 1011 if match: 1012 host, port = match.group(1, 2) 1013 try: 1014 if not port: raise ValueError, "no digits" 1015 nport = int(port) 1016 except ValueError: 1017 nport = None 1018 return host, nport 1019 return host, defport 1020 1021_queryprog = None 1022def splitquery(url): 1023 """splitquery('/path?query') --> '/path', 'query'.""" 1024 global _queryprog 1025 if _queryprog is None: 1026 import re 1027 _queryprog = re.compile('^(.*)\?([^?]*)$') 1028 1029 match = _queryprog.match(url) 1030 if match: return match.group(1, 2) 1031 return url, None 1032 1033_tagprog = None 1034def splittag(url): 1035 """splittag('/path#tag') --> '/path', 'tag'.""" 1036 global _tagprog 1037 if _tagprog is None: 1038 import re 1039 _tagprog = re.compile('^(.*)#([^#]*)$') 1040 1041 match = _tagprog.match(url) 1042 if match: return match.group(1, 2) 1043 return url, None 1044 1045def splitattr(url): 1046 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1047 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1048 words = url.split(';') 1049 return words[0], words[1:] 1050 1051_valueprog = None 1052def splitvalue(attr): 1053 """splitvalue('attr=value') --> 'attr', 'value'.""" 1054 global _valueprog 1055 if _valueprog is None: 1056 import re 1057 _valueprog = re.compile('^([^=]*)=(.*)$') 1058 1059 match = _valueprog.match(attr) 1060 if match: return match.group(1, 2) 1061 return attr, None 1062 1063def splitgophertype(selector): 1064 """splitgophertype('/Xselector') --> 'X', 'selector'.""" 1065 if selector[:1] == '/' and selector[1:2]: 1066 return selector[1], selector[2:] 1067 return None, selector 1068 1069def unquote(s): 1070 """unquote('abc%20def') -> 'abc def'.""" 1071 mychr = chr 1072 myatoi = int 1073 list = s.split('%') 1074 res = [list[0]] 1075 myappend = res.append 1076 del list[0] 1077 for item in list: 1078 if item[1:2]: 1079 try: 1080 myappend(mychr(myatoi(item[:2], 16)) 1081 + item[2:]) 1082 except ValueError: 1083 myappend('%' + item) 1084 else: 1085 myappend('%' + item) 1086 return "".join(res) 1087 1088def unquote_plus(s): 1089 """unquote('%7e/abc+def') -> '~/abc def'""" 1090 if '+' in s: 1091 # replace '+' with ' ' 1092 s = ' '.join(s.split('+')) 1093 return unquote(s) 1094 1095always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 1096 'abcdefghijklmnopqrstuvwxyz' 1097 '0123456789' '_.-') 1098 1099_fast_safe_test = always_safe + '/' 1100_fast_safe = None 1101 1102def _fast_quote(s): 1103 global _fast_safe 1104 if _fast_safe is None: 1105 _fast_safe = {} 1106 for c in _fast_safe_test: 1107 _fast_safe[c] = c 1108 res = list(s) 1109 for i in range(len(res)): 1110 c = res[i] 1111 if not c in _fast_safe: 1112 res[i] = '%%%02X' % ord(c) 1113 return ''.join(res) 1114 1115def quote(s, safe = '/'): 1116 """quote('abc def') -> 'abc%20def' 1117 1118 Each part of a URL, e.g. the path info, the query, etc., has a 1119 different set of reserved characters that must be quoted. 1120 1121 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists 1122 the following reserved characters. 1123 1124 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 1125 "$" | "," 1126 1127 Each of these characters is reserved in some component of a URL, 1128 but not necessarily in all of them. 1129 1130 By default, the quote function is intended for quoting the path 1131 section of a URL. Thus, it will not encode '/'. This character 1132 is reserved, but in typical usage the quote function is being 1133 called on a path where the existing slash characters are used as 1134 reserved characters. 1135 """ 1136 safe = always_safe + safe 1137 if _fast_safe_test == safe: 1138 return _fast_quote(s) 1139 res = list(s) 1140 for i in range(len(res)): 1141 c = res[i] 1142 if c not in safe: 1143 res[i] = '%%%02X' % ord(c) 1144 return ''.join(res) 1145 1146def quote_plus(s, safe = ''): 1147 """Quote the query fragment of a URL; replacing ' ' with '+'""" 1148 if ' ' in s: 1149 l = s.split(' ') 1150 for i in range(len(l)): 1151 l[i] = quote(l[i], safe) 1152 return '+'.join(l) 1153 else: 1154 return quote(s, safe) 1155 1156def urlencode(query,doseq=0): 1157 """Encode a sequence of two-element tuples or dictionary into a URL query string. 1158 1159 If any values in the query arg are sequences and doseq is true, each 1160 sequence element is converted to a separate parameter. 1161 1162 If the query arg is a sequence of two-element tuples, the order of the 1163 parameters in the output will match the order of parameters in the 1164 input. 1165 """ 1166 1167 if hasattr(query,"items"): 1168 # mapping objects 1169 query = query.items() 1170 else: 1171 # it's a bother at times that strings and string-like objects are 1172 # sequences... 1173 try: 1174 # non-sequence items should not work with len() 1175 # non-empty strings will fail this 1176 if len(query) and type(query[0]) != types.TupleType: 1177 raise TypeError 1178 # zero-length sequences of all types will get here and succeed, 1179 # but that's a minor nit - since the original implementation 1180 # allowed empty dicts that type of behavior probably should be 1181 # preserved for consistency 1182 except TypeError: 1183 ty,va,tb = sys.exc_info() 1184 raise TypeError, "not a valid non-string sequence or mapping object", tb 1185 1186 l = [] 1187 if not doseq: 1188 # preserve old behavior 1189 for k, v in query: 1190 k = quote_plus(str(k)) 1191 v = quote_plus(str(v)) 1192 l.append(k + '=' + v) 1193 else: 1194 for k, v in query: 1195 k = quote_plus(str(k)) 1196 if type(v) == types.StringType: 1197 v = quote_plus(v) 1198 l.append(k + '=' + v) 1199 elif _is_unicode(v): 1200 # is there a reasonable way to convert to ASCII? 1201 # encode generates a string, but "replace" or "ignore" 1202 # lose information and "strict" can raise UnicodeError 1203 v = quote_plus(v.encode("ASCII","replace")) 1204 l.append(k + '=' + v) 1205 else: 1206 try: 1207 # is this a sufficient test for sequence-ness? 1208 x = len(v) 1209 except TypeError: 1210 # not a sequence 1211 v = quote_plus(str(v)) 1212 l.append(k + '=' + v) 1213 else: 1214 # loop over the sequence 1215 for elt in v: 1216 l.append(k + '=' + quote_plus(str(elt))) 1217 return '&'.join(l) 1218 1219# Proxy handling 1220def getproxies_environment(): 1221 """Return a dictionary of scheme -> proxy server URL mappings. 1222 1223 Scan the environment for variables named <scheme>_proxy; 1224 this seems to be the standard convention. If you need a 1225 different way, you can pass a proxies dictionary to the 1226 [Fancy]URLopener constructor. 1227 1228 """ 1229 proxies = {} 1230 for name, value in os.environ.items(): 1231 name = name.lower() 1232 if value and name[-6:] == '_proxy': 1233 proxies[name[:-6]] = value 1234 return proxies 1235 1236if os.name == 'mac': 1237 def getproxies(): 1238 """Return a dictionary of scheme -> proxy server URL mappings. 1239 1240 By convention the mac uses Internet Config to store 1241 proxies. An HTTP proxy, for instance, is stored under 1242 the HttpProxy key. 1243 1244 """ 1245 try: 1246 import ic 1247 except ImportError: 1248 return {} 1249 1250 try: 1251 config = ic.IC() 1252 except ic.error: 1253 return {} 1254 proxies = {} 1255 # HTTP: 1256 if 'UseHTTPProxy' in config and config['UseHTTPProxy']: 1257 try: 1258 value = config['HTTPProxyHost'] 1259 except ic.error: 1260 pass 1261 else: 1262 proxies['http'] = 'http://%s' % value 1263 # FTP: XXXX To be done. 1264 # Gopher: XXXX To be done. 1265 return proxies 1266 1267 def proxy_bypass(x): 1268 return 0 1269 1270elif os.name == 'nt': 1271 def getproxies_registry(): 1272 """Return a dictionary of scheme -> proxy server URL mappings. 1273 1274 Win32 uses the registry to store proxies. 1275 1276 """ 1277 proxies = {} 1278 try: 1279 import _winreg 1280 except ImportError: 1281 # Std module, so should be around - but you never know! 1282 return proxies 1283 try: 1284 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1285 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1286 proxyEnable = _winreg.QueryValueEx(internetSettings, 1287 'ProxyEnable')[0] 1288 if proxyEnable: 1289 # Returned as Unicode but problems if not converted to ASCII 1290 proxyServer = str(_winreg.QueryValueEx(internetSettings, 1291 'ProxyServer')[0]) 1292 if '=' in proxyServer: 1293 # Per-protocol settings 1294 for p in proxyServer.split(';'): 1295 protocol, address = p.split('=', 1) 1296 # See if address has a type:// prefix 1297 import re 1298 if not re.match('^([^/:]+)://', address): 1299 address = '%s://%s' % (protocol, address) 1300 proxies[protocol] = address 1301 else: 1302 # Use one setting for all protocols 1303 if proxyServer[:5] == 'http:': 1304 proxies['http'] = proxyServer 1305 else: 1306 proxies['http'] = 'http://%s' % proxyServer 1307 proxies['ftp'] = 'ftp://%s' % proxyServer 1308 internetSettings.Close() 1309 except (WindowsError, ValueError, TypeError): 1310 # Either registry key not found etc, or the value in an 1311 # unexpected format. 1312 # proxies already set up to be empty so nothing to do 1313 pass 1314 return proxies 1315 1316 def getproxies(): 1317 """Return a dictionary of scheme -> proxy server URL mappings. 1318 1319 Returns settings gathered from the environment, if specified, 1320 or the registry. 1321 1322 """ 1323 return getproxies_environment() or getproxies_registry() 1324 1325 def proxy_bypass(host): 1326 try: 1327 import _winreg 1328 import re 1329 except ImportError: 1330 # Std modules, so should be around - but you never know! 1331 return 0 1332 try: 1333 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1334 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1335 proxyEnable = _winreg.QueryValueEx(internetSettings, 1336 'ProxyEnable')[0] 1337 proxyOverride = str(_winreg.QueryValueEx(internetSettings, 1338 'ProxyOverride')[0]) 1339 # ^^^^ Returned as Unicode but problems if not converted to ASCII 1340 except WindowsError: 1341 return 0 1342 if not proxyEnable or not proxyOverride: 1343 return 0 1344 # try to make a host list from name and IP address. 1345 host = [host] 1346 try: 1347 addr = socket.gethostbyname(host[0]) 1348 if addr != host: 1349 host.append(addr) 1350 except socket.error: 1351 pass 1352 # make a check value list from the registry entry: replace the 1353 # '<local>' string by the localhost entry and the corresponding 1354 # canonical entry. 1355 proxyOverride = proxyOverride.split(';') 1356 i = 0 1357 while i < len(proxyOverride): 1358 if proxyOverride[i] == '<local>': 1359 proxyOverride[i:i+1] = ['localhost', 1360 '127.0.0.1', 1361 socket.gethostname(), 1362 socket.gethostbyname( 1363 socket.gethostname())] 1364 i += 1 1365 # print proxyOverride 1366 # now check if we match one of the registry values. 1367 for test in proxyOverride: 1368 test = test.replace(".", r"\.") # mask dots 1369 test = test.replace("*", r".*") # change glob sequence 1370 test = test.replace("?", r".") # change glob char 1371 for val in host: 1372 # print "%s <--> %s" %( test, val ) 1373 if re.match(test, val, re.I): 1374 return 1 1375 return 0 1376 1377else: 1378 # By default use environment variables 1379 getproxies = getproxies_environment 1380 1381 def proxy_bypass(host): 1382 return 0 1383 1384# Test and time quote() and unquote() 1385def test1(): 1386 s = '' 1387 for i in range(256): s = s + chr(i) 1388 s = s*4 1389 t0 = time.time() 1390 qs = quote(s) 1391 uqs = unquote(qs) 1392 t1 = time.time() 1393 if uqs != s: 1394 print 'Wrong!' 1395 print `s` 1396 print `qs` 1397 print `uqs` 1398 print round(t1 - t0, 3), 'sec' 1399 1400 1401def reporthook(blocknum, blocksize, totalsize): 1402 # Report during remote transfers 1403 print "Block number: %d, Block size: %d, Total size: %d" % ( 1404 blocknum, blocksize, totalsize) 1405 1406# Test program 1407def test(args=[]): 1408 if not args: 1409 args = [ 1410 '/etc/passwd', 1411 'file:/etc/passwd', 1412 'file://localhost/etc/passwd', 1413 'ftp://ftp.python.org/pub/python/README', 1414## 'gopher://gopher.micro.umn.edu/1/', 1415 'http://www.python.org/index.html', 1416 ] 1417 if hasattr(URLopener, "open_https"): 1418 args.append('https://synergy.as.cmu.edu/~geek/') 1419 try: 1420 for url in args: 1421 print '-'*10, url, '-'*10 1422 fn, h = urlretrieve(url, None, reporthook) 1423 print fn 1424 if h: 1425 print '======' 1426 for k in h.keys(): print k + ':', h[k] 1427 print '======' 1428 fp = open(fn, 'rb') 1429 data = fp.read() 1430 del fp 1431 if '\r' in data: 1432 table = string.maketrans("", "") 1433 data = data.translate(table, "\r") 1434 print data 1435 fn, h = None, None 1436 print '-'*40 1437 finally: 1438 urlcleanup() 1439 1440def main(): 1441 import getopt, sys 1442 try: 1443 opts, args = getopt.getopt(sys.argv[1:], "th") 1444 except getopt.error, msg: 1445 print msg 1446 print "Use -h for help" 1447 return 1448 t = 0 1449 for o, a in opts: 1450 if o == '-t': 1451 t = t + 1 1452 if o == '-h': 1453 print "Usage: python urllib.py [-t] [url ...]" 1454 print "-t runs self-test;", 1455 print "otherwise, contents of urls are printed" 1456 return 1457 if t: 1458 if t > 1: 1459 test1() 1460 test(args) 1461 else: 1462 if not args: 1463 print "Use -h for help" 1464 for url in args: 1465 print urlopen(url).read(), 1466 1467# Run test program when run as a script 1468if __name__ == '__main__': 1469 main() 1470