# urllib.py revision b030bc026eb57861568fdc9310512185c95a6f11
# Open an arbitrary URL
#
# See the following document for a tentative description of URLs:
#     Uniform Resource Locators              Tim Berners-Lee
#     INTERNET DRAFT                                    CERN
#     IETF URL Working Group                    14 July 1993
#     draft-ietf-uri-url-01.txt
#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns a mimetools.Message object which can be
# used to query various info about the object, if available.
# (mimetools.Message objects are queried with the getheader() method.)

import string
import socket
import regex
import os


__version__ = '1.5'

# Helper for non-unix systems: pick the platform's converters between
# URL paths and local path names.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Return pathname unchanged (URL paths are used as local paths)."""
        return pathname
    def pathname2url(pathname):
        """Return pathname unchanged (local paths are used as URL paths)."""
        return pathname

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None

def _default_opener():
    """Return the shared FancyURLopener, creating it on first use."""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener

def urlopen(url):
    """Open url with the shared opener and return the file-like object."""
    return _default_opener().open(url)

def urlretrieve(url, filename=None):
    """Retrieve url with the shared opener.

    The filename argument is only passed on when it is true; the
    result is whatever the opener's retrieve() method returns.
    """
    opener = _default_opener()
    if not filename:
        return opener.retrieve(url)
    return opener.retrieve(url, filename)

def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    if not _urlopener:
        return
    _urlopener.cleanup()


# Class to open URLs.
# This is a class rather than just a subroutine because we may need
# more than one set of global protocol-specific options.
66# Note -- this is a base class for those who don't want the 67# automatic handling of errors type 302 (relocated) and 401 68# (authorization needed). 69ftpcache = {} 70class URLopener: 71 72 # Constructor 73 def __init__(self, proxies=None): 74 if proxies is None: 75 proxies = getproxies() 76 self.proxies = proxies 77 server_version = "Python-urllib/%s" % __version__ 78 self.addheaders = [('User-agent', server_version)] 79 self.tempcache = None 80 # Undocumented feature: if you assign {} to tempcache, 81 # it is used to cache files retrieved with 82 # self.retrieve(). This is not enabled by default 83 # since it does not work for changing documents (and I 84 # haven't got the logic to check expiration headers 85 # yet). 86 self.ftpcache = ftpcache 87 # Undocumented feature: you can use a different 88 # ftp cache by assigning to the .ftpcache member; 89 # in case you want logically independent URL openers 90 91 def __del__(self): 92 self.close() 93 94 def close(self): 95 self.cleanup() 96 97 def cleanup(self): 98 import os 99 if self.tempcache: 100 for url in self.tempcache.keys(): 101 try: 102 os.unlink(self.tempcache[url][0]) 103 except os.error: 104 pass 105 del self.tempcache[url] 106 107 # Add a header to be used by the HTTP interface only 108 # e.g. 
u.addheader('Accept', 'sound/basic') 109 def addheader(self, *args): 110 self.addheaders.append(args) 111 112 # External interface 113 # Use URLopener().open(file) instead of open(file, 'r') 114 def open(self, fullurl): 115 fullurl = unwrap(fullurl) 116 type, url = splittype(fullurl) 117 if not type: type = 'file' 118 self.openedurl = '%s:%s' % (type, url) 119 if self.proxies.has_key(type): 120 proxy = self.proxies[type] 121 type, proxy = splittype(proxy) 122 host, selector = splithost(proxy) 123 url = (host, fullurl) # Signal special case to open_*() 124 name = 'open_' + type 125 if '-' in name: 126 import regsub 127 name = regsub.gsub('-', '_', name) 128 if not hasattr(self, name): 129 return self.open_unknown(fullurl) 130 try: 131 return getattr(self, name)(url) 132 except socket.error, msg: 133 raise IOError, ('socket error', msg) 134 135 # Overridable interface to open unknown URL type 136 def open_unknown(self, fullurl): 137 type, url = splittype(fullurl) 138 raise IOError, ('url error', 'unknown url type', type) 139 140 # External interface 141 # retrieve(url) returns (filename, None) for a local object 142 # or (tempfilename, headers) for a remote object 143 def retrieve(self, url, filename=None): 144 if self.tempcache and self.tempcache.has_key(url): 145 return self.tempcache[url] 146 url1 = unwrap(url) 147 if self.tempcache and self.tempcache.has_key(url1): 148 self.tempcache[url] = self.tempcache[url1] 149 return self.tempcache[url1] 150 type, url1 = splittype(url1) 151 if not filename and (not type or type == 'file'): 152 try: 153 fp = self.open_local_file(url1) 154 del fp 155 return url2pathname(splithost(url1)[1]), None 156 except IOError, msg: 157 pass 158 fp = self.open(url) 159 headers = fp.info() 160 if not filename: 161 import tempfile 162 filename = tempfile.mktemp() 163 result = filename, headers 164 if self.tempcache is not None: 165 self.tempcache[url] = result 166 tfp = open(filename, 'w') 167 bs = 1024*8 168 block = fp.read(bs) 169 while 
block: 170 tfp.write(block) 171 block = fp.read(bs) 172 del fp 173 del tfp 174 return result 175 176 # Each method named open_<type> knows how to open that type of URL 177 178 # Use HTTP protocol 179 def open_http(self, url): 180 import httplib 181 if type(url) is type(""): 182 host, selector = splithost(url) 183 user_passwd, host = splituser(host) 184 else: 185 host, selector = url 186 urltype, rest = splittype(selector) 187 if string.lower(urltype) == 'http': 188 realhost, rest = splithost(rest) 189 user_passwd, realhost = splituser(realhost) 190 if user_passwd: 191 selector = "%s://%s%s" % (urltype, 192 realhost, rest) 193 print "proxy via http:", host, selector 194 if not host: raise IOError, ('http error', 'no host given') 195 if user_passwd: 196 import base64 197 auth = string.strip(base64.encodestring(user_passwd)) 198 else: 199 auth = None 200 h = httplib.HTTP(host) 201 h.putrequest('GET', selector) 202 if auth: h.putheader('Authorization: Basic %s' % auth) 203 for args in self.addheaders: apply(h.putheader, args) 204 h.endheaders() 205 errcode, errmsg, headers = h.getreply() 206 fp = h.getfile() 207 if errcode == 200: 208 return addinfourl(fp, headers, self.openedurl) 209 else: 210 return self.http_error(url, 211 fp, errcode, errmsg, headers) 212 213 # Handle http errors. 
214 # Derived class can override this, or provide specific handlers 215 # named http_error_DDD where DDD is the 3-digit error code 216 def http_error(self, url, fp, errcode, errmsg, headers): 217 # First check if there's a specific handler for this error 218 name = 'http_error_%d' % errcode 219 if hasattr(self, name): 220 method = getattr(self, name) 221 result = method(url, fp, errcode, errmsg, headers) 222 if result: return result 223 return self.http_error_default( 224 url, fp, errcode, errmsg, headers) 225 226 # Default http error handler: close the connection and raises IOError 227 def http_error_default(self, url, fp, errcode, errmsg, headers): 228 void = fp.read() 229 fp.close() 230 raise IOError, ('http error', errcode, errmsg, headers) 231 232 # Use Gopher protocol 233 def open_gopher(self, url): 234 import gopherlib 235 host, selector = splithost(url) 236 if not host: raise IOError, ('gopher error', 'no host given') 237 type, selector = splitgophertype(selector) 238 selector, query = splitquery(selector) 239 selector = unquote(selector) 240 if query: 241 query = unquote(query) 242 fp = gopherlib.send_query(selector, query, host) 243 else: 244 fp = gopherlib.send_selector(selector, host) 245 return addinfourl(fp, noheaders(), self.openedurl) 246 247 # Use local file or FTP depending on form of URL 248 def open_file(self, url): 249 if url[:2] == '//': 250 return self.open_ftp(url) 251 else: 252 return self.open_local_file(url) 253 254 # Use local file 255 def open_local_file(self, url): 256 host, file = splithost(url) 257 if not host: 258 return addinfourl(open(url2pathname(file), 'r'), noheaders(), 'file:'+file) 259 host, port = splitport(host) 260 if not port and socket.gethostbyname(host) in ( 261 localhost(), thishost()): 262 file = unquote(file) 263 return addinfourl(open(url2pathname(file), 'r'), noheaders(), 'file:'+file) 264 raise IOError, ('local file error', 'not on local host') 265 266 # Use FTP protocol 267 def open_ftp(self, url): 268 host, 
path = splithost(url) 269 if not host: raise IOError, ('ftp error', 'no host given') 270 host, port = splitport(host) 271 user, host = splituser(host) 272 if user: user, passwd = splitpasswd(user) 273 else: passwd = None 274 host = socket.gethostbyname(host) 275 if not port: 276 import ftplib 277 port = ftplib.FTP_PORT 278 path, attrs = splitattr(path) 279 dirs = string.splitfields(path, '/') 280 dirs, file = dirs[:-1], dirs[-1] 281 if dirs and not dirs[0]: dirs = dirs[1:] 282 key = (user, host, port, string.joinfields(dirs, '/')) 283 try: 284 if not self.ftpcache.has_key(key): 285 self.ftpcache[key] = \ 286 ftpwrapper(user, passwd, 287 host, port, dirs) 288 if not file: type = 'D' 289 else: type = 'I' 290 for attr in attrs: 291 attr, value = splitvalue(attr) 292 if string.lower(attr) == 'type' and \ 293 value in ('a', 'A', 'i', 'I', 'd', 'D'): 294 type = string.upper(value) 295 return addinfourl(self.ftpcache[key].retrfile(file, type), 296 noheaders(), self.openedurl) 297 except ftperrors(), msg: 298 raise IOError, ('ftp error', msg) 299 300 301# Derived class with handlers for errors we can handle (perhaps) 302class FancyURLopener(URLopener): 303 304 def __init__(self, *args): 305 apply(URLopener.__init__, (self,) + args) 306 self.auth_cache = {} 307 308 # Default error handling -- don't raise an exception 309 def http_error_default(self, url, fp, errcode, errmsg, headers): 310 return addinfourl(fp, headers, self.openedurl) 311 312 # Error 302 -- relocated (temporarily) 313 def http_error_302(self, url, fp, errcode, errmsg, headers): 314 # XXX The server can force infinite recursion here! 
315 if headers.has_key('location'): 316 newurl = headers['location'] 317 elif headers.has_key('uri'): 318 newurl = headers['uri'] 319 else: 320 return 321 void = fp.read() 322 fp.close() 323 return self.open(newurl) 324 325 # Error 301 -- also relocated (permanently) 326 http_error_301 = http_error_302 327 328 # Error 401 -- authentication required 329 # See this URL for a description of the basic authentication scheme: 330 # http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt 331 def http_error_401(self, url, fp, errcode, errmsg, headers): 332 if headers.has_key('www-authenticate'): 333 stuff = headers['www-authenticate'] 334 p = regex.compile( 335 '[ \t]*\([^ \t]+\)[ \t]+realm="\([^"]*\)"') 336 if p.match(stuff) >= 0: 337 scheme, realm = p.group(1, 2) 338 if string.lower(scheme) == 'basic': 339 return self.retry_http_basic_auth( 340 url, realm) 341 342 def retry_http_basic_auth(self, url, realm): 343 host, selector = splithost(url) 344 i = string.find(host, '@') + 1 345 host = host[i:] 346 user, passwd = self.get_user_passwd(host, realm, i) 347 if not (user or passwd): return None 348 host = user + ':' + passwd + '@' + host 349 newurl = '//' + host + selector 350 return self.open_http(newurl) 351 352 def get_user_passwd(self, host, realm, clear_cache = 0): 353 key = realm + '@' + string.lower(host) 354 if self.auth_cache.has_key(key): 355 if clear_cache: 356 del self.auth_cache[key] 357 else: 358 return self.auth_cache[key] 359 user, passwd = self.prompt_user_passwd(host, realm) 360 if user or passwd: self.auth_cache[key] = (user, passwd) 361 return user, passwd 362 363 def prompt_user_passwd(self, host, realm): 364 # Override this in a GUI environment! 
365 try: 366 user = raw_input("Enter username for %s at %s: " % 367 (realm, host)) 368 self.echo_off() 369 try: 370 passwd = raw_input( 371 "Enter password for %s in %s at %s: " % 372 (user, realm, host)) 373 finally: 374 self.echo_on() 375 return user, passwd 376 except KeyboardInterrupt: 377 return None, None 378 379 def echo_off(self): 380 import os 381 os.system("stty -echo") 382 383 def echo_on(self): 384 import os 385 print 386 os.system("stty echo") 387 388 389# Utility functions 390 391# Return the IP address of the magic hostname 'localhost' 392_localhost = None 393def localhost(): 394 global _localhost 395 if not _localhost: 396 _localhost = socket.gethostbyname('localhost') 397 return _localhost 398 399# Return the IP address of the current host 400_thishost = None 401def thishost(): 402 global _thishost 403 if not _thishost: 404 _thishost = socket.gethostbyname(socket.gethostname()) 405 return _thishost 406 407# Return the set of errors raised by the FTP class 408_ftperrors = None 409def ftperrors(): 410 global _ftperrors 411 if not _ftperrors: 412 import ftplib 413 _ftperrors = (ftplib.error_reply, 414 ftplib.error_temp, 415 ftplib.error_perm, 416 ftplib.error_proto) 417 return _ftperrors 418 419# Return an empty mimetools.Message object 420_noheaders = None 421def noheaders(): 422 global _noheaders 423 if not _noheaders: 424 import mimetools 425 import StringIO 426 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 427 _noheaders.fp.close() # Recycle file descriptor 428 return _noheaders 429 430 431# Utility classes 432 433# Class used by open_ftp() for cache of open FTP connections 434class ftpwrapper: 435 def __init__(self, user, passwd, host, port, dirs): 436 self.user = unquote(user or '') 437 self.passwd = unquote(passwd or '') 438 self.host = host 439 self.port = port 440 self.dirs = [] 441 for dir in dirs: 442 self.dirs.append(unquote(dir)) 443 self.init() 444 def init(self): 445 import ftplib 446 self.ftp = ftplib.FTP() 447 
self.ftp.connect(self.host, self.port) 448 self.ftp.login(self.user, self.passwd) 449 for dir in self.dirs: 450 self.ftp.cwd(dir) 451 def retrfile(self, file, type): 452 import ftplib 453 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 454 else: cmd = 'TYPE ' + type; isdir = 0 455 try: 456 self.ftp.voidcmd(cmd) 457 except ftplib.all_errors: 458 self.init() 459 self.ftp.voidcmd(cmd) 460 conn = None 461 if file and not isdir: 462 try: 463 cmd = 'RETR ' + file 464 conn = self.ftp.transfercmd(cmd) 465 except ftplib.error_perm, reason: 466 if reason[:3] != '550': 467 raise IOError, ('ftp error', reason) 468 if not conn: 469 # Try a directory listing 470 if file: cmd = 'LIST ' + file 471 else: cmd = 'LIST' 472 conn = self.ftp.transfercmd(cmd) 473 return addclosehook(conn.makefile('rb'), self.ftp.voidresp) 474 475# Base class for addinfo and addclosehook 476class addbase: 477 def __init__(self, fp): 478 self.fp = fp 479 self.read = self.fp.read 480 self.readline = self.fp.readline 481 self.readlines = self.fp.readlines 482 self.fileno = self.fp.fileno 483 def __repr__(self): 484 return '<%s at %s whose fp = %s>' % ( 485 self.__class__.__name__, `id(self)`, `self.fp`) 486 def close(self): 487 self.read = None 488 self.readline = None 489 self.readlines = None 490 self.fileno = None 491 if self.fp: self.fp.close() 492 self.fp = None 493 494# Class to add a close hook to an open file 495class addclosehook(addbase): 496 def __init__(self, fp, closehook, *hookargs): 497 addbase.__init__(self, fp) 498 self.closehook = closehook 499 self.hookargs = hookargs 500 def close(self): 501 if self.closehook: 502 apply(self.closehook, self.hookargs) 503 self.closehook = None 504 self.hookargs = None 505 addbase.close(self) 506 507# class to add an info() method to an open file 508class addinfo(addbase): 509 def __init__(self, fp, headers): 510 addbase.__init__(self, fp) 511 self.headers = headers 512 def info(self): 513 return self.headers 514 515# class to add info() and geturl() 
# methods to an open file
class addinfourl(addbase):
    """File wrapper that also remembers headers and the URL opened."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.url = url
        self.headers = headers

    def geturl(self):
        """Return the URL given to the constructor."""
        return self.url

    def info(self):
        """Return the headers given to the constructor."""
        return self.headers


# Utility to combine a URL with a base URL to form a new URL

def basejoin(base, url):
    """Resolve url against base and return the combined URL.

    A url with its own type is returned unchanged.  A url with a
    host but no type inherits only the type of base.  Otherwise type
    and host come from base, and a relative path replaces the last
    path component of base (a url that is only a '#tag' or '?query'
    is appended to the whole base path).  Any tag or query on base
    is dropped.
    """
    scheme, rest = splittype(url)
    if scheme:
        # url is complete (i.e., it contains a type)
        return url
    urlhost, relpath = splithost(rest)
    scheme, basepath = splittype(base)
    if urlhost:
        if not scheme:
            # nothing to inherit, so url must have started with //
            return url
        return scheme + '://' + urlhost + relpath
    host, basepath = splithost(basepath)
    # strip the tag first, then the query, from the base path
    basepath = splittag(basepath)[0]
    basepath = splitquery(basepath)[0]
    first = relpath[:1]
    if first != '/':
        # relative path: work out how much of basepath to keep
        if first == '#' or first == '?':
            cut = len(basepath)
        else:
            cut = string.rfind(basepath, '/')
        if cut >= 0:
            # keep everything up to and including position cut
            basepath = basepath[:cut+1]
        elif host:
            # base path not absolute but host present: make absolute
            basepath = '/'
        else:
            basepath = ''
        relpath = basepath + relpath
    if scheme:
        if host:
            return scheme + '://' + host + relpath
        return scheme + ':' + relpath
    if host:
        return '//' + host + relpath # don't know what this means
    return relpath


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') -->
'user[:passwd]', 'host[:port]' 577# splitpasswd('user:passwd') -> 'user', 'passwd' 578# splitport('host:port') --> 'host', 'port' 579# splitquery('/path?query') --> '/path', 'query' 580# splittag('/path#tag') --> '/path', 'tag' 581# splitattr('/path;attr1=value1;attr2=value2;...') -> 582# '/path', ['attr1=value1', 'attr2=value2', ...] 583# splitvalue('attr=value') --> 'attr', 'value' 584# splitgophertype('/Xselector') --> 'X', 'selector' 585# unquote('abc%20def') -> 'abc def' 586# quote('abc def') -> 'abc%20def') 587 588def unwrap(url): 589 url = string.strip(url) 590 if url[:1] == '<' and url[-1:] == '>': 591 url = string.strip(url[1:-1]) 592 if url[:4] == 'URL:': url = string.strip(url[4:]) 593 return url 594 595_typeprog = regex.compile('^\([^/:]+\):\(.*\)$') 596def splittype(url): 597 if _typeprog.match(url) >= 0: return _typeprog.group(1, 2) 598 return None, url 599 600_hostprog = regex.compile('^//\([^/]+\)\(.*\)$') 601def splithost(url): 602 if _hostprog.match(url) >= 0: return _hostprog.group(1, 2) 603 return None, url 604 605_userprog = regex.compile('^\([^@]*\)@\(.*\)$') 606def splituser(host): 607 if _userprog.match(host) >= 0: return _userprog.group(1, 2) 608 return None, host 609 610_passwdprog = regex.compile('^\([^:]*\):\(.*\)$') 611def splitpasswd(user): 612 if _passwdprog.match(user) >= 0: return _passwdprog.group(1, 2) 613 return user, None 614 615_portprog = regex.compile('^\(.*\):\([0-9]+\)$') 616def splitport(host): 617 if _portprog.match(host) >= 0: return _portprog.group(1, 2) 618 return host, None 619 620# Split host and port, returning numeric port. 621# Return given default port if no ':' found; defaults to -1. 622# Return numerical port if a valid number are found after ':'. 623# Return None if ':' but not a valid number. 
624_nportprog = regex.compile('^\(.*\):\(.*\)$') 625def splitnport(host, defport=-1): 626 if _nportprog.match(host) >= 0: 627 host, port = _nportprog.group(1, 2) 628 try: 629 if not port: raise string.atoi_error, "no digits" 630 nport = string.atoi(port) 631 except string.atoi_error: 632 nport = None 633 return host, nport 634 return host, defport 635 636_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$') 637def splitquery(url): 638 if _queryprog.match(url) >= 0: return _queryprog.group(1, 2) 639 return url, None 640 641_tagprog = regex.compile('^\(.*\)#\([^#]*\)$') 642def splittag(url): 643 if _tagprog.match(url) >= 0: return _tagprog.group(1, 2) 644 return url, None 645 646def splitattr(url): 647 words = string.splitfields(url, ';') 648 return words[0], words[1:] 649 650_valueprog = regex.compile('^\([^=]*\)=\(.*\)$') 651def splitvalue(attr): 652 if _valueprog.match(attr) >= 0: return _valueprog.group(1, 2) 653 return attr, None 654 655def splitgophertype(selector): 656 if selector[:1] == '/' and selector[1:2]: 657 return selector[1], selector[2:] 658 return None, selector 659 660_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]') 661def unquote(s): 662 i = 0 663 n = len(s) 664 res = [] 665 while 0 <= i < n: 666 j = _quoteprog.search(s, i) 667 if j < 0: 668 res.append(s[i:]) 669 break 670 res.append(s[i:j] + chr(string.atoi(s[j+1:j+3], 16))) 671 i = j+3 672 return string.joinfields(res, '') 673 674always_safe = string.letters + string.digits + '_,.-' 675def quote(s, safe = '/'): 676 safe = always_safe + safe 677 res = [] 678 for c in s: 679 if c in safe: 680 res.append(c) 681 else: 682 res.append('%%%02x' % ord(c)) 683 return string.joinfields(res, '') 684 685 686# Proxy handling 687def getproxies(): 688 """Return a dictionary of protocol scheme -> proxy server URL mappings. 689 690 Scan the environment for variables named <scheme>_proxy; 691 this seems to be the standard convention. 
If you need a 692 different way, you can pass a proxies dictionary to the 693 [Fancy]URLopener constructor. 694 695 """ 696 proxies = {} 697 for name, value in os.environ.items(): 698 if value and name[-6:] == '_proxy': 699 proxies[name[:-6]] = value 700 return proxies 701 702 703# Test and time quote() and unquote() 704def test1(): 705 import time 706 s = '' 707 for i in range(256): s = s + chr(i) 708 s = s*4 709 t0 = time.time() 710 qs = quote(s) 711 uqs = unquote(qs) 712 t1 = time.time() 713 if uqs != s: 714 print 'Wrong!' 715 print `s` 716 print `qs` 717 print `uqs` 718 print round(t1 - t0, 3), 'sec' 719 720 721# Test program 722def test(): 723 import sys 724 import regsub 725 args = sys.argv[1:] 726 if not args: 727 args = [ 728 '/etc/passwd', 729 'file:/etc/passwd', 730 'file://localhost/etc/passwd', 731 'ftp://ftp.cwi.nl/etc/passwd', 732 'gopher://gopher.cwi.nl/11/', 733 'http://www.cwi.nl/index.html', 734 ] 735 try: 736 for url in args: 737 print '-'*10, url, '-'*10 738 fn, h = urlretrieve(url) 739 print fn, h 740 if h: 741 print '======' 742 for k in h.keys(): print k + ':', h[k] 743 print '======' 744 fp = open(fn, 'r') 745 data = fp.read() 746 del fp 747 print regsub.gsub('\r', '', data) 748 fn, h = None, None 749 print '-'*40 750 finally: 751 urlcleanup() 752 753# Run test program when run as a script 754if __name__ == '__main__': 755## test1() 756 test() 757