# urllib.py revision 8c8a02a2588b0219d88fb892635148f9467f6634
1# Open an arbitrary URL 2# 3# See the following document for a tentative description of URLs: 4# Uniform Resource Locators Tim Berners-Lee 5# INTERNET DRAFT CERN 6# IETF URL Working Group 14 July 1993 7# draft-ietf-uri-url-01.txt 8# 9# The object returned by URLopener().open(file) will differ per 10# protocol. All you know is that is has methods read(), readline(), 11# readlines(), fileno(), close() and info(). The read*(), fileno() 12# and close() methods work like those of open files. 13# The info() method returns an mimetools.Message object which can be 14# used to query various info about the object, if available. 15# (mimetools.Message objects are queried with the getheader() method.) 16 17import string 18import socket 19import regex 20import os 21 22 23__version__ = '1.2' # XXXX Should I update this number? -- jack 24 25# Helper for non-unix systems 26if os.name == 'mac': 27 def url2pathname(pathname): 28 "Convert /-delimited pathname to mac pathname" 29 # 30 # XXXX The .. handling should be fixed... 31 # 32 tp = splittype(pathname)[0] 33 if tp and tp <> 'file': 34 raise RuntimeError, 'Cannot convert non-local URL to pathname' 35 components = string.split(pathname, '/') 36 if '..' in components or '.' in components or '' in components[1:-1]: 37 raise RuntimeError, 'Cannot convert URL containing ., .. 
or // to pathname' 38 if not components[0]: 39 # Absolute unix path, don't start with colon 40 return string.join(components[1:], ':') 41 else: 42 # relative unix path, start with colon 43 return ':' + string.join(components, ':') 44 45 def pathname2url(pathname): 46 "convert mac pathname to /-delimited pathname" 47 if '/' in pathname: 48 raise RuntimeError, "Cannot convert pathname containing slashes" 49 components = string.split(pathname, ':') 50 if '' in components[1:-1]: 51 raise RuntimeError, "Cannot convert pathname containing ::" 52 # Truncate names longer than 31 bytes 53 components = map(lambda x: x[:31], components) 54 55 if os.path.isabs(pathname): 56 return '/' + string.join(components, '/') 57 else: 58 return string.join(components, '/') 59else: 60 def url2pathname(pathname): 61 return pathname 62 def pathname2url(pathname): 63 return pathname 64 65# This really consists of two pieces: 66# (1) a class which handles opening of all sorts of URLs 67# (plus assorted utilities etc.) 68# (2) a set of functions for parsing URLs 69# XXX Should these be separated out into different modules? 70 71 72# Shortcut for basic usage 73_urlopener = None 74def urlopen(url): 75 global _urlopener 76 if not _urlopener: 77 _urlopener = FancyURLopener() 78 return _urlopener.open(url) 79def urlretrieve(url): 80 global _urlopener 81 if not _urlopener: 82 _urlopener = FancyURLopener() 83 return _urlopener.retrieve(url) 84def urlcleanup(): 85 if _urlopener: 86 _urlopener.cleanup() 87 88 89# Class to open URLs. 90# This is a class rather than just a subroutine because we may need 91# more than one set of global protocol-specific options. 92# Note -- this is a base class for those who don't want the 93# automatic handling of errors type 302 (relocated) and 401 94# (authorization needed). 
95ftpcache = {} 96class URLopener: 97 98 # Constructor 99 def __init__(self): 100 server_version = "Python-urllib/%s" % __version__ 101 self.addheaders = [('User-agent', server_version)] 102 self.tempcache = None 103 # Undocumented feature: if you assign {} to tempcache, 104 # it is used to cache files retrieved with 105 # self.retrieve(). This is not enabled by default 106 # since it does not work for changing documents (and I 107 # haven't got the logic to check expiration headers 108 # yet). 109 self.ftpcache = ftpcache 110 # Undocumented feature: you can use a different 111 # ftp cache by assigning to the .ftpcache member; 112 # in case you want logically independent URL openers 113 114 def __del__(self): 115 self.close() 116 117 def close(self): 118 self.cleanup() 119 120 def cleanup(self): 121 import os 122 if self.tempcache: 123 for url in self.tempcache.keys(): 124 try: 125 os.unlink(self.tempcache[url][0]) 126 except os.error: 127 pass 128 del self.tempcache[url] 129 130 # Add a header to be used by the HTTP interface only 131 # e.g. 
u.addheader('Accept', 'sound/basic') 132 def addheader(self, *args): 133 self.addheaders.append(args) 134 135 # External interface 136 # Use URLopener().open(file) instead of open(file, 'r') 137 def open(self, fullurl): 138 fullurl = unwrap(fullurl) 139 type, url = splittype(fullurl) 140 if not type: type = 'file' 141 name = 'open_' + type 142 if '-' in name: 143 import regsub 144 name = regsub.gsub('-', '_', name) 145 if not hasattr(self, name): 146 return self.open_unknown(fullurl) 147 try: 148 return getattr(self, name)(url) 149 except socket.error, msg: 150 raise IOError, ('socket error', msg) 151 152 # Overridable interface to open unknown URL type 153 def open_unknown(self, fullurl): 154 type, url = splittype(fullurl) 155 raise IOError, ('url error', 'unknown url type', type) 156 157 # External interface 158 # retrieve(url) returns (filename, None) for a local object 159 # or (tempfilename, headers) for a remote object 160 def retrieve(self, url): 161 if self.tempcache and self.tempcache.has_key(url): 162 return self.tempcache[url] 163 url1 = unwrap(url) 164 if self.tempcache and self.tempcache.has_key(url1): 165 self.tempcache[url] = self.tempcache[url1] 166 return self.tempcache[url1] 167 type, url1 = splittype(url1) 168 if not type or type == 'file': 169 try: 170 fp = self.open_local_file(url1) 171 del fp 172 return url2pathname(splithost(url1)[1]), None 173 except IOError, msg: 174 pass 175 fp = self.open(url) 176 headers = fp.info() 177 import tempfile 178 tfn = tempfile.mktemp() 179 result = tfn, headers 180 if self.tempcache is not None: 181 self.tempcache[url] = result 182 tfp = open(tfn, 'w') 183 bs = 1024*8 184 block = fp.read(bs) 185 while block: 186 tfp.write(block) 187 block = fp.read(bs) 188 del fp 189 del tfp 190 return result 191 192 # Each method named open_<type> knows how to open that type of URL 193 194 # Use HTTP protocol 195 def open_http(self, url): 196 import httplib 197 host, selector = splithost(url) 198 if not host: raise IOError, 
('http error', 'no host given') 199 i = string.find(host, '@') 200 if i >= 0: 201 user_passwd, host = host[:i], host[i+1:] 202 else: 203 user_passwd = None 204 if user_passwd: 205 import base64 206 auth = string.strip(base64.encodestring(user_passwd)) 207 else: 208 auth = None 209 h = httplib.HTTP(host) 210 h.putrequest('GET', selector) 211 if auth: h.putheader('Authorization: Basic %s' % auth) 212 for args in self.addheaders: apply(h.putheader, args) 213 h.endheaders() 214 errcode, errmsg, headers = h.getreply() 215 fp = h.getfile() 216 if errcode == 200: 217 return addinfo(fp, headers) 218 else: 219 return self.http_error(url, 220 fp, errcode, errmsg, headers) 221 222 # Handle http errors. 223 # Derived class can override this, or provide specific handlers 224 # named http_error_DDD where DDD is the 3-digit error code 225 def http_error(self, url, fp, errcode, errmsg, headers): 226 # First check if there's a specific handler for this error 227 name = 'http_error_%d' % errcode 228 if hasattr(self, name): 229 method = getattr(self, name) 230 result = method(url, fp, errcode, errmsg, headers) 231 if result: return result 232 return self.http_error_default( 233 url, fp, errcode, errmsg, headers) 234 235 # Default http error handler: close the connection and raises IOError 236 def http_error_default(self, url, fp, errcode, errmsg, headers): 237 void = fp.read() 238 fp.close() 239 raise IOError, ('http error', errcode, errmsg, headers) 240 241 # Use Gopher protocol 242 def open_gopher(self, url): 243 import gopherlib 244 host, selector = splithost(url) 245 if not host: raise IOError, ('gopher error', 'no host given') 246 type, selector = splitgophertype(selector) 247 selector, query = splitquery(selector) 248 selector = unquote(selector) 249 if query: 250 query = unquote(query) 251 fp = gopherlib.send_query(selector, query, host) 252 else: 253 fp = gopherlib.send_selector(selector, host) 254 return addinfo(fp, noheaders()) 255 256 # Use local file or FTP depending on 
form of URL 257 def open_file(self, url): 258 if url[:2] == '//': 259 return self.open_ftp(url) 260 else: 261 return self.open_local_file(url) 262 263 # Use local file 264 def open_local_file(self, url): 265 host, file = splithost(url) 266 if not host: return addinfo(open(url2pathname(file), 'r'), noheaders()) 267 host, port = splitport(host) 268 if not port and socket.gethostbyname(host) in ( 269 localhost(), thishost()): 270 file = unquote(file) 271 return addinfo(open(url2pathname(file), 'r'), noheaders()) 272 raise IOError, ('local file error', 'not on local host') 273 274 # Use FTP protocol 275 def open_ftp(self, url): 276 host, path = splithost(url) 277 if not host: raise IOError, ('ftp error', 'no host given') 278 host, port = splitport(host) 279 user, host = splituser(host) 280 if user: user, passwd = splitpasswd(user) 281 else: passwd = None 282 host = socket.gethostbyname(host) 283 if not port: 284 import ftplib 285 port = ftplib.FTP_PORT 286 path, attrs = splitattr(path) 287 dirs = string.splitfields(path, '/') 288 dirs, file = dirs[:-1], dirs[-1] 289 if dirs and not dirs[0]: dirs = dirs[1:] 290 key = (user, host, port, string.joinfields(dirs, '/')) 291 try: 292 if not self.ftpcache.has_key(key): 293 self.ftpcache[key] = \ 294 ftpwrapper(user, passwd, 295 host, port, dirs) 296 if not file: type = 'D' 297 else: type = 'I' 298 for attr in attrs: 299 attr, value = splitvalue(attr) 300 if string.lower(attr) == 'type' and \ 301 value in ('a', 'A', 'i', 'I', 'd', 'D'): 302 type = string.upper(value) 303 return addinfo(self.ftpcache[key].retrfile(file, type), 304 noheaders()) 305 except ftperrors(), msg: 306 raise IOError, ('ftp error', msg) 307 308 309# Derived class with handlers for errors we can handle (perhaps) 310class FancyURLopener(URLopener): 311 312 def __init__(self, *args): 313 apply(URLopener.__init__, (self,) + args) 314 self.auth_cache = {} 315 316 # Default error handling -- don't raise an exception 317 def http_error_default(self, url, fp, 
errcode, errmsg, headers): 318 return addinfo(fp, headers) 319 320 # Error 302 -- relocated 321 def http_error_302(self, url, fp, errcode, errmsg, headers): 322 # XXX The server can force infinite recursion here! 323 if headers.has_key('location'): 324 newurl = headers['location'] 325 elif headers.has_key('uri'): 326 newurl = headers['uri'] 327 else: 328 return 329 void = fp.read() 330 fp.close() 331 return self.open(newurl) 332 333 # Error 401 -- authentication required 334 # See this URL for a description of the basic authentication scheme: 335 # http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt 336 def http_error_401(self, url, fp, errcode, errmsg, headers): 337 if headers.has_key('www-authenticate'): 338 stuff = headers['www-authenticate'] 339 p = regex.compile( 340 '[ \t]*\([^ \t]+\)[ \t]+realm="\([^"]*\)"') 341 if p.match(stuff) >= 0: 342 scheme, realm = p.group(1, 2) 343 if string.lower(scheme) == 'basic': 344 return self.retry_http_basic_auth( 345 url, realm) 346 347 def retry_http_basic_auth(self, url, realm): 348 host, selector = splithost(url) 349 i = string.find(host, '@') + 1 350 host = host[i:] 351 user, passwd = self.get_user_passwd(host, realm, i) 352 if not (user or passwd): return None 353 host = user + ':' + passwd + '@' + host 354 newurl = '//' + host + selector 355 return self.open_http(newurl) 356 357 def get_user_passwd(self, host, realm, clear_cache = 0): 358 key = realm + '@' + string.lower(host) 359 if self.auth_cache.has_key(key): 360 if clear_cache: 361 del self.auth_cache[key] 362 else: 363 return self.auth_cache[key] 364 user, passwd = self.prompt_user_passwd(host, realm) 365 if user or passwd: self.auth_cache[key] = (user, passwd) 366 return user, passwd 367 368 def prompt_user_passwd(self, host, realm): 369 # Override this in a GUI environment! 
370 try: 371 user = raw_input("Enter username for %s at %s: " % 372 (realm, host)) 373 self.echo_off() 374 try: 375 passwd = raw_input( 376 "Enter password for %s in %s at %s: " % 377 (user, realm, host)) 378 finally: 379 self.echo_on() 380 return user, passwd 381 except KeyboardInterrupt: 382 return None, None 383 384 def echo_off(self): 385 import os 386 os.system("stty -echo") 387 388 def echo_on(self): 389 import os 390 print 391 os.system("stty echo") 392 393 394# Utility functions 395 396# Return the IP address of the magic hostname 'localhost' 397_localhost = None 398def localhost(): 399 global _localhost 400 if not _localhost: 401 _localhost = socket.gethostbyname('localhost') 402 return _localhost 403 404# Return the IP address of the current host 405_thishost = None 406def thishost(): 407 global _thishost 408 if not _thishost: 409 _thishost = socket.gethostbyname(socket.gethostname()) 410 return _thishost 411 412# Return the set of errors raised by the FTP class 413_ftperrors = None 414def ftperrors(): 415 global _ftperrors 416 if not _ftperrors: 417 import ftplib 418 _ftperrors = (ftplib.error_reply, 419 ftplib.error_temp, 420 ftplib.error_perm, 421 ftplib.error_proto) 422 return _ftperrors 423 424# Return an empty mimetools.Message object 425_noheaders = None 426def noheaders(): 427 global _noheaders 428 if not _noheaders: 429 import mimetools 430 import StringIO 431 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 432 _noheaders.fp.close() # Recycle file descriptor 433 return _noheaders 434 435 436# Utility classes 437 438# Class used by open_ftp() for cache of open FTP connections 439class ftpwrapper: 440 def __init__(self, user, passwd, host, port, dirs): 441 self.user = unquote(user or '') 442 self.passwd = unquote(passwd or '') 443 self.host = host 444 self.port = port 445 self.dirs = [] 446 for dir in dirs: 447 self.dirs.append(unquote(dir)) 448 self.init() 449 def init(self): 450 import ftplib 451 self.ftp = ftplib.FTP() 452 
self.ftp.connect(self.host, self.port) 453 self.ftp.login(self.user, self.passwd) 454 for dir in self.dirs: 455 self.ftp.cwd(dir) 456 def retrfile(self, file, type): 457 import ftplib 458 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 459 else: cmd = 'TYPE ' + type; isdir = 0 460 try: 461 self.ftp.voidcmd(cmd) 462 except ftplib.all_errors: 463 self.init() 464 self.ftp.voidcmd(cmd) 465 conn = None 466 if file and not isdir: 467 try: 468 cmd = 'RETR ' + file 469 conn = self.ftp.transfercmd(cmd) 470 except ftplib.error_perm, reason: 471 if reason[:3] != '550': 472 raise IOError, ('ftp error', reason) 473 if not conn: 474 # Try a directory listing 475 if file: cmd = 'LIST ' + file 476 else: cmd = 'LIST' 477 conn = self.ftp.transfercmd(cmd) 478 return addclosehook(conn.makefile('r'), self.ftp.voidresp) 479 480# Base class for addinfo and addclosehook 481class addbase: 482 def __init__(self, fp): 483 self.fp = fp 484 self.read = self.fp.read 485 self.readline = self.fp.readline 486 self.readlines = self.fp.readlines 487 self.fileno = self.fp.fileno 488 def __repr__(self): 489 return '<%s at %s whose fp = %s>' % ( 490 self.__class__.__name__, `id(self)`, `self.fp`) 491 def close(self): 492 self.read = None 493 self.readline = None 494 self.readlines = None 495 self.fileno = None 496 if self.fp: self.fp.close() 497 self.fp = None 498 499# Class to add a close hook to an open file 500class addclosehook(addbase): 501 def __init__(self, fp, closehook, *hookargs): 502 addbase.__init__(self, fp) 503 self.closehook = closehook 504 self.hookargs = hookargs 505 def close(self): 506 if self.closehook: 507 apply(self.closehook, self.hookargs) 508 self.closehook = None 509 self.hookargs = None 510 addbase.close(self) 511 512# class to add an info() method to an open file 513class addinfo(addbase): 514 def __init__(self, fp, headers): 515 addbase.__init__(self, fp) 516 self.headers = headers 517 def info(self): 518 return self.headers 519 520 521# Utility to combine a URL with a 
# base URL to form a new URL

def basejoin(base, url):
    "Return url interpreted relative to base; a url with its own type is returned unchanged"
    scheme, rest = splittype(url)
    if scheme:
        # url is complete (i.e., it contains a type)
        return url
    host, path = splithost(rest)
    scheme, basepath = splittype(base)  # inherit type from base
    if host:
        if not scheme:
            # no type inherited, so url must have started with //
            return url
        # url contains host, just inherit type
        return scheme + '://' + host + path
    host, basepath = splithost(basepath)  # inherit host
    # strip tag and query from the base path
    basepath = splittag(basepath)[0]
    basepath = splitquery(basepath)[0]
    first = path[:1]
    if first != '/':
        # non-absolute path name
        if first == '#' or first == '?':
            # path is just a tag or query, attach to basepath
            prefix = basepath
        else:
            # replace last component of basepath
            cut = string.rfind(basepath, '/')
            if cut >= 0:
                prefix = basepath[:cut+1]
            elif host:
                # basepath not absolute but host present, make absolute
                prefix = '/'
            else:
                # keep non-absolute
                prefix = ''
        path = prefix + path
    if host:
        if scheme:
            return scheme + '://' + host + path
        return '//' + host + path  # don't know what this means
    if scheme:
        return scheme + ':' + path
    return path


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1',
'attr2=value2', ...] 577# splitvalue('attr=value') --> 'attr', 'value' 578# splitgophertype('/Xselector') --> 'X', 'selector' 579# unquote('abc%20def') -> 'abc def' 580# quote('abc def') -> 'abc%20def') 581 582def unwrap(url): 583 url = string.strip(url) 584 if url[:1] == '<' and url[-1:] == '>': 585 url = string.strip(url[1:-1]) 586 if url[:4] == 'URL:': url = string.strip(url[4:]) 587 return url 588 589_typeprog = regex.compile('^\([^/:]+\):\(.*\)$') 590def splittype(url): 591 if _typeprog.match(url) >= 0: return _typeprog.group(1, 2) 592 return None, url 593 594_hostprog = regex.compile('^//\([^/]+\)\(.*\)$') 595def splithost(url): 596 if _hostprog.match(url) >= 0: return _hostprog.group(1, 2) 597 return None, url 598 599_userprog = regex.compile('^\([^@]*\)@\(.*\)$') 600def splituser(host): 601 if _userprog.match(host) >= 0: return _userprog.group(1, 2) 602 return None, host 603 604_passwdprog = regex.compile('^\([^:]*\):\(.*\)$') 605def splitpasswd(user): 606 if _passwdprog.match(user) >= 0: return _passwdprog.group(1, 2) 607 return user, None 608 609_portprog = regex.compile('^\(.*\):\([0-9]+\)$') 610def splitport(host): 611 if _portprog.match(host) >= 0: return _portprog.group(1, 2) 612 return host, None 613 614_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$') 615def splitquery(url): 616 if _queryprog.match(url) >= 0: return _queryprog.group(1, 2) 617 return url, None 618 619_tagprog = regex.compile('^\(.*\)#\([^#]*\)$') 620def splittag(url): 621 if _tagprog.match(url) >= 0: return _tagprog.group(1, 2) 622 return url, None 623 624def splitattr(url): 625 words = string.splitfields(url, ';') 626 return words[0], words[1:] 627 628_valueprog = regex.compile('^\([^=]*\)=\(.*\)$') 629def splitvalue(attr): 630 if _valueprog.match(attr) >= 0: return _valueprog.group(1, 2) 631 return attr, None 632 633def splitgophertype(selector): 634 if selector[:1] == '/' and selector[1:2]: 635 return selector[1], selector[2:] 636 return None, selector 637 638_quoteprog = 
regex.compile('%[0-9a-fA-F][0-9a-fA-F]') 639def unquote(s): 640 i = 0 641 n = len(s) 642 res = '' 643 while 0 <= i < n: 644 j = _quoteprog.search(s, i) 645 if j < 0: 646 res = res + s[i:] 647 break 648 res = res + (s[i:j] + chr(string.atoi(s[j+1:j+3], 16))) 649 i = j+3 650 return res 651 652always_safe = string.letters + string.digits + '_,.-' 653def quote(s, safe = '/'): 654 safe = always_safe + safe 655 res = '' 656 for c in s: 657 if c in safe: 658 res = res + c 659 else: 660 res = res + '%%%02x' % ord(c) 661 return res 662 663# Test and time quote() and unquote() 664def test1(): 665 import time 666 s = '' 667 for i in range(256): s = s + chr(i) 668 s = s*4 669 t0 = time.time() 670 qs = quote(s) 671 uqs = unquote(qs) 672 t1 = time.time() 673 if uqs != s: 674 print 'Wrong!' 675 print `s` 676 print `qs` 677 print `uqs` 678 print round(t1 - t0, 3), 'sec' 679 680 681# Test program 682def test(): 683 import sys 684 import regsub 685 args = sys.argv[1:] 686 if not args: 687 args = [ 688 '/etc/passwd', 689 'file:/etc/passwd', 690 'file://localhost/etc/passwd', 691 'ftp://ftp.cwi.nl/etc/passwd', 692 'gopher://gopher.cwi.nl/11/', 693 'http://www.cwi.nl/index.html', 694 ] 695 try: 696 for url in args: 697 print '-'*10, url, '-'*10 698 fn, h = urlretrieve(url) 699 print fn, h 700 if h: 701 print '======' 702 for k in h.keys(): print k + ':', h[k] 703 print '======' 704 fp = open(fn, 'r') 705 data = fp.read() 706 del fp 707 print regsub.gsub('\r', '', data) 708 fn, h = None, None 709 print '-'*40 710 finally: 711 urlcleanup() 712 713# Run test program when run as a script 714if __name__ == '__main__': 715## test1() 716 test() 717