urllib.py revision 6cb15a0572b0a8ca32016e18bea5c7924303ee3b
1# Open an arbitrary URL 2# 3# See the following document for a tentative description of URLs: 4# Uniform Resource Locators Tim Berners-Lee 5# INTERNET DRAFT CERN 6# IETF URL Working Group 14 July 1993 7# draft-ietf-uri-url-01.txt 8# 9# The object returned by URLopener().open(file) will differ per 10# protocol. All you know is that is has methods read(), readline(), 11# readlines(), fileno(), close() and info(). The read*(), fileno() 12# and close() methods work like those of open files. 13# The info() method returns an rfc822.Message object which can be 14# used to query various info about the object, if available. 15# (rfc822.Message objects are queried with the getheader() method.) 16 17import string 18import socket 19import regex 20 21 22__version__ = '1.0' 23 24 25# This really consists of two pieces: 26# (1) a class which handles opening of all sorts of URLs 27# (plus assorted utilities etc.) 28# (2) a set of functions for parsing URLs 29# XXX Should these be separated out into different modules? 30 31 32# Shortcut for basic usage 33_urlopener = None 34def urlopen(url): 35 global _urlopener 36 if not _urlopener: 37 _urlopener = URLopener() 38 return _urlopener.open(url) 39def urlretrieve(url): 40 global _urlopener 41 if not _urlopener: 42 _urlopener = URLopener() 43 return _urlopener.retrieve(url) 44def urlcleanup(): 45 if _urlopener: 46 _urlopener.cleanup() 47 48 49# Class to open URLs. 50# This is a class rather than just a subroutine because we may need 51# more than one set of global protocol-specific options. 52ftpcache = {} 53class URLopener: 54 55 # Constructor 56 def __init__(self): 57 server_version = "Python-urllib/%s" % __version__ 58 self.addheaders = [('User-agent', server_version)] 59 self.tempcache = None 60 # Undocumented feature: if you assign {} to tempcache, 61 # it is used to cache files retrieved with 62 # self.retrieve(). This is not enabled by default 63 # since it does not work for changing documents (and I 64 # haven't got the logic to check expiration headers 65 # yet). 66 self.ftpcache = ftpcache 67 # Undocumented feature: you can use a different 68 # ftp cache by assigning to the .ftpcache member; 69 # in case you want logically independent URL openers 70 71 def __del__(self): 72 self.close() 73 74 def close(self): 75 self.cleanup() 76 77 def cleanup(self): 78 import os 79 if self.tempcache: 80 for url in self.tempcache.keys(): 81 try: 82 os.unlink(self.tempcache[url][0]) 83 except os.error: 84 pass 85 del self.tempcache[url] 86 87 # Add a header to be used by the HTTP interface only 88 # e.g. u.addheader('Accept', 'sound/basic') 89 def addheader(self, *args): 90 self.addheaders.append(args) 91 92 # External interface 93 # Use URLopener().open(file) instead of open(file, 'r') 94 def open(self, url): 95 type, url = splittype(unwrap(url)) 96 if not type: type = 'file' 97 name = 'open_' + type 98 if '-' in name: 99 import regsub 100 name = regsub.gsub('-', '_', name) 101 if not hasattr(self, name): 102 raise IOError, ('url error', 'unknown url type', type) 103 try: 104 return getattr(self, name)(url) 105 except socket.error, msg: 106 raise IOError, ('socket error', msg) 107 108 # External interface 109 # retrieve(url) returns (filename, None) for a local object 110 # or (tempfilename, headers) for a remote object 111 def retrieve(self, url): 112 if self.tempcache and self.tempcache.has_key(url): 113 return self.tempcache[url] 114 url1 = unwrap(url) 115 if self.tempcache and self.tempcache.has_key(url1): 116 self.tempcache[url] = self.tempcache[url1] 117 return self.tempcache[url1] 118 type, url1 = splittype(url1) 119 if not type or type == 'file': 120 try: 121 fp = self.open_local_file(url1) 122 del fp 123 return splithost(url1)[1], None 124 except IOError, msg: 125 pass 126 fp = self.open(url) 127 headers = fp.info() 128 import tempfile 129 tfn = tempfile.mktemp() 130 result = tfn, headers 131 if self.tempcache is not None: 132 self.tempcache[url] = result 133 tfp = open(tfn, 'w') 134 bs = 1024*8 135 block = fp.read(bs) 136 while block: 137 tfp.write(block) 138 block = fp.read(bs) 139 del fp 140 del tfp 141 return result 142 143 # Each method named open_<type> knows how to open that type of URL 144 145 # Use HTTP protocol 146 def open_http(self, url): 147 import httplib 148 host, selector = splithost(url) 149 if not host: raise IOError, ('http error', 'no host given') 150 h = httplib.HTTP(host) 151 h.putrequest('GET', selector) 152 for args in self.addheaders: apply(h.putheader, args) 153 h.endheaders() 154 errcode, errmsg, headers = h.getreply() 155 fp = h.getfile() 156 if errcode == 200: 157 return addinfo(fp, headers) 158 else: 159 n = len(fp.read()) 160 fp.close() 161 raise IOError, ('http error', errcode, errmsg, headers) 162 163 # Use Gopher protocol 164 def open_gopher(self, url): 165 import gopherlib 166 host, selector = splithost(url) 167 if not host: raise IOError, ('gopher error', 'no host given') 168 type, selector = splitgophertype(selector) 169 selector, query = splitquery(selector) 170 selector = unquote(selector) 171 if query: 172 query = unquote(query) 173 fp = gopherlib.send_query(selector, query, host) 174 else: 175 fp = gopherlib.send_selector(selector, host) 176 return addinfo(fp, noheaders()) 177 178 # Use local file or FTP depending on form of URL 179 def open_file(self, url): 180 try: 181 return self.open_local_file(url) 182 except IOError: 183 return self.open_ftp(url) 184 185 # Use local file 186 def open_local_file(self, url): 187 host, file = splithost(url) 188 if not host: return addinfo(open(file, 'r'), noheaders()) 189 host, port = splitport(host) 190 if not port and socket.gethostbyname(host) in ( 191 localhost(), thishost()): 192 file = unquote(file) 193 return addinfo(open(file, 'r'), noheaders()) 194 raise IOError, ('local file error', 'not on local host') 195 196 # Use FTP protocol 197 def open_ftp(self, url): 198 host, path = splithost(url) 199 if not host: raise IOError, ('ftp error', 'no host given') 200 host, port = splitport(host) 201 user, host = splituser(host) 202 if user: user, passwd = splitpasswd(user) 203 else: passwd = None 204 host = socket.gethostbyname(host) 205 if not port: 206 import ftplib 207 port = ftplib.FTP_PORT 208 path, attrs = splitattr(path) 209 dirs = string.splitfields(path, '/') 210 dirs, file = dirs[:-1], dirs[-1] 211 if dirs and not dirs[0]: dirs = dirs[1:] 212 key = (user, host, port, string.joinfields(dirs, '/')) 213## print 'key =', key 214 try: 215 if not self.ftpcache.has_key(key): 216 self.ftpcache[key] = \ 217 ftpwrapper(user, passwd, 218 host, port, dirs) 219 if not file: type = 'D' 220 else: type = 'I' 221 for attr in attrs: 222 attr, value = splitvalue(attr) 223 if string.lower(attr) == 'type' and \ 224 value in ('a', 'A', 'i', 'I', 'd', 'D'): 225 type = string.upper(value) 226 return addinfo(self.ftpcache[key].retrfile(file, type), 227 noheaders()) 228 except ftperrors(), msg: 229 raise IOError, ('ftp error', msg) 230 231 232# Utility functions 233 234# Return the IP address of the magic hostname 'localhost' 235_localhost = None 236def localhost(): 237 global _localhost 238 if not _localhost: 239 _localhost = socket.gethostbyname('localhost') 240 return _localhost 241 242# Return the IP address of the current host 243_thishost = None 244def thishost(): 245 global _thishost 246 if not _thishost: 247 _thishost = socket.gethostbyname(socket.gethostname()) 248 return _thishost 249 250# Return the set of errors raised by the FTP class 251_ftperrors = None 252def ftperrors(): 253 global _ftperrors 254 if not _ftperrors: 255 import ftplib 256 _ftperrors = (ftplib.error_reply, 257 ftplib.error_temp, 258 ftplib.error_perm, 259 ftplib.error_proto) 260 return _ftperrors 261 262# Return an empty rfc822.Message object 263_noheaders = None 264def noheaders(): 265 global _noheaders 266 if not _noheaders: 267 import rfc822 268 _noheaders = rfc822.Message(open('/dev/null', 'r')) 269 _noheaders.fp.close() # Recycle file descriptor 270 return _noheaders 271 272 273# Utility classes 274 275# Class used by open_ftp() for cache of open FTP connections 276class ftpwrapper: 277 def __init__(self, user, passwd, host, port, dirs): 278 self.user = unquote(user or '') 279 self.passwd = unquote(passwd or '') 280 self.host = host 281 self.port = port 282 self.dirs = [] 283 for dir in dirs: 284 self.dirs.append(unquote(dir)) 285 self.init() 286 def init(self): 287 import ftplib 288 self.ftp = ftplib.FTP() 289 self.ftp.connect(self.host, self.port) 290 self.ftp.login(self.user, self.passwd) 291 for dir in self.dirs: 292 self.ftp.cwd(dir) 293 def retrfile(self, file, type): 294 import ftplib 295 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 296 else: cmd = 'TYPE ' + type; isdir = 0 297 try: 298 self.ftp.voidcmd(cmd) 299 except ftplib.all_errors: 300 self.init() 301 self.ftp.voidcmd(cmd) 302 conn = None 303 if file and not isdir: 304 try: 305 cmd = 'RETR ' + file 306 conn = self.ftp.transfercmd(cmd) 307 except ftplib.error_perm, reason: 308 if reason[:3] != '550': 309 raise IOError, ('ftp error', reason) 310 if not conn: 311 # Try a directory listing 312 if file: cmd = 'LIST ' + file 313 else: cmd = 'LIST' 314 conn = self.ftp.transfercmd(cmd) 315 return addclosehook(conn.makefile('r'), self.ftp.voidresp) 316 317# Base class for addinfo and addclosehook 318class addbase: 319 def __init__(self, fp): 320 self.fp = fp 321 self.read = self.fp.read 322 self.readline = self.fp.readline 323 self.readlines = self.fp.readlines 324 self.fileno = self.fp.fileno 325 def __repr__(self): 326 return '<%s at %s whose fp = %s>' % ( 327 self.__class__.__name__, `id(self)`, `self.fp`) 328 def __del__(self): 329 self.close() 330 def close(self): 331 self.read = None 332 self.readline = None 333 self.readlines = None 334 self.fileno = None 335 if self.fp: self.fp.close() 336 self.fp = None 337 338# Class to add a close hook to an open file 339class addclosehook(addbase): 340 def __init__(self, fp, closehook, *hookargs): 341 addbase.__init__(self, fp) 342 self.closehook = closehook 343 self.hookargs = hookargs 344 def close(self): 345 if self.closehook: 346 apply(self.closehook, self.hookargs) 347 self.closehook = None 348 self.hookargs = None 349 addbase.close(self) 350 351# class to add an info() method to an open file 352class addinfo(addbase): 353 def __init__(self, fp, headers): 354 addbase.__init__(self, fp) 355 self.headers = headers 356 def info(self): 357 return self.headers 358 359 360# Utility to combine a URL with a base URL to form a new URL 361 362def basejoin(base, url): 363 type, path = splittype(url) 364 host, path = splithost(path) 365 if type and host: return url 366 basetype, basepath = splittype(base) 367 basehost, basepath = splithost(basepath) 368 basepath, basetag = splittag(basepath) 369 basepath, basequery = splitquery(basepath) 370 if not type: type = basetype or 'file' 371 if path[:1] != '/': 372 i = string.rfind(basepath, '/') 373 if i < 0: basepath = '/' 374 else: basepath = basepath[:i+1] 375 path = basepath + path 376 if not host: host = basehost 377 if host: return type + '://' + host + path 378 else: return type + ':' + path 379 380 381# Utilities to parse URLs (most of these return None for missing parts): 382# unwrap('<URL:type//host/path>') --> 'type//host/path' 383# splittype('type:opaquestring') --> 'type', 'opaquestring' 384# splithost('//host[:port]/path') --> 'host[:port]', '/path' 385# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' 386# splitpasswd('user:passwd') -> 'user', 'passwd' 387# splitport('host:port') --> 'host', 'port' 388# splitquery('/path?query') --> '/path', 'query' 389# splittag('/path#tag') --> '/path', 'tag' 390# splitattr('/path;attr1=value1;attr2=value2;...') -> 391# '/path', ['attr1=value1', 'attr2=value2', ...] 392# splitvalue('attr=value') --> 'attr', 'value' 393# splitgophertype('/Xselector') --> 'X', 'selector' 394# unquote('abc%20def') -> 'abc def' 395# quote('abc def') -> 'abc%20def') 396 397def unwrap(url): 398 url = string.strip(url) 399 if url[:1] == '<' and url[-1:] == '>': 400 url = string.strip(url[1:-1]) 401 if url[:4] == 'URL:': url = string.strip(url[4:]) 402 return url 403 404_typeprog = regex.compile('^\([^/:]+\):\(.*\)$') 405def splittype(url): 406 if _typeprog.match(url) >= 0: return _typeprog.group(1, 2) 407 return None, url 408 409_hostprog = regex.compile('^//\([^/]+\)\(.*\)$') 410def splithost(url): 411 if _hostprog.match(url) >= 0: return _hostprog.group(1, 2) 412 return None, url 413 414_userprog = regex.compile('^\([^@]*\)@\(.*\)$') 415def splituser(host): 416 if _userprog.match(host) >= 0: return _userprog.group(1, 2) 417 return None, host 418 419_passwdprog = regex.compile('^\([^:]*\):\(.*\)$') 420def splitpasswd(user): 421 if _passwdprog.match(user) >= 0: return _passwdprog.group(1, 2) 422 return user, None 423 424_portprog = regex.compile('^\(.*\):\([0-9]+\)$') 425def splitport(host): 426 if _portprog.match(host) >= 0: return _portprog.group(1, 2) 427 return host, None 428 429_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$') 430def splitquery(url): 431 if _queryprog.match(url) >= 0: return _queryprog.group(1, 2) 432 return url, None 433 434_tagprog = regex.compile('^\(.*\)#\([^#]*\)$') 435def splittag(url): 436 if _tagprog.match(url) >= 0: return _tagprog.group(1, 2) 437 return url, None 438 439def splitattr(url): 440 words = string.splitfields(url, ';') 441 return words[0], words[1:] 442 443_valueprog = regex.compile('^\([^=]*\)=\(.*\)$') 444def splitvalue(attr): 445 if _valueprog.match(attr) >= 0: return _valueprog.group(1, 2) 446 return attr, None 447 448def splitgophertype(selector): 449 if selector[:1] == '/' and selector[1:2]: 450 return selector[1], selector[2:] 451 return None, selector 452 453_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]') 454def unquote(s): 455 i = 0 456 n = len(s) 457 res = '' 458 while 0 <= i < n: 459 j = _quoteprog.search(s, i) 460 if j < 0: 461 res = res + s[i:] 462 break 463 res = res + (s[i:j] + chr(eval('0x' + s[j+1:j+3]))) 464 i = j+3 465 return res 466 467always_safe = string.letters + string.digits + '_,.-' 468def quote(s, safe = '/'): 469 safe = always_safe + safe 470 res = '' 471 for c in s: 472 if c in safe: 473 res = res + c 474 else: 475 res = res + '%%%02x' % ord(c) 476 return res 477 478# Test and time quote() and unquote() 479def test1(): 480 import time 481 s = '' 482 for i in range(256): s = s + chr(i) 483 s = s*4 484 t0 = time.time() 485 qs = quote(s) 486 uqs = unquote(qs) 487 t1 = time.time() 488 if uqs != s: 489 print 'Wrong!' 490 print `s` 491 print `qs` 492 print `uqs` 493 print round(t1 - t0, 3), 'sec' 494 495 496# Test program 497def test(): 498 import sys 499 import regsub 500 args = sys.argv[1:] 501 if not args: 502 args = [ 503 '/etc/passwd', 504 'file:/etc/passwd', 505 'file://localhost/etc/passwd', 506 'ftp://ftp.cwi.nl/etc/passwd', 507 'gopher://gopher.cwi.nl/11/', 508 'http://www.cwi.nl/index.html', 509 ] 510 try: 511 for url in args: 512 print '-'*10, url, '-'*10 513 fn, h = urlretrieve(url) 514 print fn, h 515 if h: 516 print '======' 517 for k in h.keys(): print k + ':', h[k] 518 print '======' 519 fp = open(fn, 'r') 520 data = fp.read() 521 del fp 522 print regsub.gsub('\r', '', data) 523 fn, h = None, None 524 print '-'*40 525 finally: 526 urlcleanup() 527 528# Run test program when run as a script 529if __name__ == '__main__': 530## test1() 531 test() 532