urllib.py revision ae9ee7329d350b57a2d7b835e4e77a00b67e21b6
# Open an arbitrary URL
#
# See the following document for more info on URLs:
# "Names and Addresses, URIs, URLs, URNs, URCs", at
# http://www.w3.org/pub/WWW/Addressing/Overview.html
#
# See also the HTTP spec (from which the error codes are derived):
# "HTTP - Hypertext Transfer Protocol", at
# http://www.w3.org/pub/WWW/Protocols/
#
# Related standards and specs:
# - RFC1808: the "relative URL" spec. (authoritative status)
# - RFC1738 - the "URL standard". (authoritative status)
# - RFC1630 - the "URI spec". (informational status)
#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns a mimetools.Message object which can be
# used to query various info about the object, if available.
# (mimetools.Message objects are queried with the getheader() method.)

import string
import socket
import os
import sys


__version__ = '1.10'

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Path <-> URL helpers.  The defaults below are identity functions;
# on 'mac' and 'nt' they are replaced by the platform-specific
# converters imported right after.
def url2pathname(pathname):
    return pathname

def pathname2url(pathname):
    return pathname

if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?
50 51 52# Shortcut for basic usage 53_urlopener = None 54def urlopen(url, data=None): 55 global _urlopener 56 if not _urlopener: 57 _urlopener = FancyURLopener() 58 if data is None: 59 return _urlopener.open(url) 60 else: 61 return _urlopener.open(url, data) 62def urlretrieve(url, filename=None): 63 global _urlopener 64 if not _urlopener: 65 _urlopener = FancyURLopener() 66 if filename: 67 return _urlopener.retrieve(url, filename) 68 else: 69 return _urlopener.retrieve(url) 70def urlcleanup(): 71 if _urlopener: 72 _urlopener.cleanup() 73 74 75# Class to open URLs. 76# This is a class rather than just a subroutine because we may need 77# more than one set of global protocol-specific options. 78# Note -- this is a base class for those who don't want the 79# automatic handling of errors type 302 (relocated) and 401 80# (authorization needed). 81ftpcache = {} 82class URLopener: 83 84 __tempfiles = None 85 86 # Constructor 87 def __init__(self, proxies=None): 88 if proxies is None: 89 proxies = getproxies() 90 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 91 self.proxies = proxies 92 server_version = "Python-urllib/%s" % __version__ 93 self.addheaders = [('User-agent', server_version)] 94 self.__tempfiles = [] 95 self.__unlink = os.unlink # See cleanup() 96 self.tempcache = None 97 # Undocumented feature: if you assign {} to tempcache, 98 # it is used to cache files retrieved with 99 # self.retrieve(). This is not enabled by default 100 # since it does not work for changing documents (and I 101 # haven't got the logic to check expiration headers 102 # yet). 
103 self.ftpcache = ftpcache 104 # Undocumented feature: you can use a different 105 # ftp cache by assigning to the .ftpcache member; 106 # in case you want logically independent URL openers 107 108 def __del__(self): 109 self.close() 110 111 def close(self): 112 self.cleanup() 113 114 def cleanup(self): 115 # This code sometimes runs when the rest of this module 116 # has already been deleted, so it can't use any globals 117 # or import anything. 118 if self.__tempfiles: 119 for file in self.__tempfiles: 120 try: 121 self.__unlink(file) 122 except: 123 pass 124 del self.__tempfiles[:] 125 if self.tempcache: 126 self.tempcache.clear() 127 128 # Add a header to be used by the HTTP interface only 129 # e.g. u.addheader('Accept', 'sound/basic') 130 def addheader(self, *args): 131 self.addheaders.append(args) 132 133 # External interface 134 # Use URLopener().open(file) instead of open(file, 'r') 135 def open(self, fullurl, data=None): 136 fullurl = unwrap(fullurl) 137 if self.tempcache and self.tempcache.has_key(fullurl): 138 filename, headers = self.tempcache[fullurl] 139 fp = open(filename, 'rb') 140 return addinfourl(fp, headers, fullurl) 141 type, url = splittype(fullurl) 142 if not type: type = 'file' 143 if self.proxies.has_key(type): 144 proxy = self.proxies[type] 145 type, proxy = splittype(proxy) 146 host, selector = splithost(proxy) 147 url = (host, fullurl) # Signal special case to open_*() 148 name = 'open_' + type 149 if '-' in name: 150 # replace - with _ 151 name = string.join(string.split(name, '-'), '_') 152 if not hasattr(self, name): 153 if data is None: 154 return self.open_unknown(fullurl) 155 else: 156 return self.open_unknown(fullurl, data) 157 try: 158 if data is None: 159 return getattr(self, name)(url) 160 else: 161 return getattr(self, name)(url, data) 162 except socket.error, msg: 163 raise IOError, ('socket error', msg), sys.exc_info()[2] 164 165 # Overridable interface to open unknown URL type 166 def open_unknown(self, fullurl, 
data=None): 167 type, url = splittype(fullurl) 168 raise IOError, ('url error', 'unknown url type', type) 169 170 # External interface 171 # retrieve(url) returns (filename, None) for a local object 172 # or (tempfilename, headers) for a remote object 173 def retrieve(self, url, filename=None): 174 url = unwrap(url) 175 if self.tempcache and self.tempcache.has_key(url): 176 return self.tempcache[url] 177 type, url1 = splittype(url) 178 if not filename and (not type or type == 'file'): 179 try: 180 fp = self.open_local_file(url1) 181 hdrs = fp.info() 182 del fp 183 return url2pathname(splithost(url1)[1]), hdrs 184 except IOError, msg: 185 pass 186 fp = self.open(url) 187 headers = fp.info() 188 if not filename: 189 import tempfile 190 garbage, path = splittype(url) 191 garbage, path = splithost(path or "") 192 path, garbage = splitquery(path or "") 193 path, garbage = splitattr(path or "") 194 suffix = os.path.splitext(path)[1] 195 filename = tempfile.mktemp(suffix) 196 self.__tempfiles.append(filename) 197 result = filename, headers 198 if self.tempcache is not None: 199 self.tempcache[url] = result 200 tfp = open(filename, 'wb') 201 bs = 1024*8 202 block = fp.read(bs) 203 while block: 204 tfp.write(block) 205 block = fp.read(bs) 206 fp.close() 207 tfp.close() 208 del fp 209 del tfp 210 return result 211 212 # Each method named open_<type> knows how to open that type of URL 213 214 # Use HTTP protocol 215 def open_http(self, url, data=None): 216 import httplib 217 user_passwd = None 218 if type(url) is type(""): 219 host, selector = splithost(url) 220 if host: 221 user_passwd, host = splituser(host) 222 realhost = host 223 else: 224 host, selector = url 225 urltype, rest = splittype(selector) 226 url = rest 227 user_passwd = None 228 if string.lower(urltype) != 'http': 229 realhost = None 230 else: 231 realhost, rest = splithost(rest) 232 if realhost: 233 user_passwd, realhost = \ 234 splituser(realhost) 235 if user_passwd: 236 selector = "%s://%s%s" % (urltype, 
237 realhost, 238 rest) 239 #print "proxy via http:", host, selector 240 if not host: raise IOError, ('http error', 'no host given') 241 if user_passwd: 242 import base64 243 auth = string.strip(base64.encodestring(user_passwd)) 244 else: 245 auth = None 246 h = httplib.HTTP(host) 247 if data is not None: 248 h.putrequest('POST', selector) 249 h.putheader('Content-type', 250 'application/x-www-form-urlencoded') 251 h.putheader('Content-length', '%d' % len(data)) 252 else: 253 h.putrequest('GET', selector) 254 if auth: h.putheader('Authorization', 'Basic %s' % auth) 255 if realhost: h.putheader('Host', realhost) 256 for args in self.addheaders: apply(h.putheader, args) 257 h.endheaders() 258 if data is not None: 259 h.send(data + '\r\n') 260 errcode, errmsg, headers = h.getreply() 261 fp = h.getfile() 262 if errcode == 200: 263 return addinfourl(fp, headers, "http:" + url) 264 else: 265 return self.http_error(url, 266 fp, errcode, errmsg, headers) 267 268 # Handle http errors. 269 # Derived class can override this, or provide specific handlers 270 # named http_error_DDD where DDD is the 3-digit error code 271 def http_error(self, url, fp, errcode, errmsg, headers): 272 # First check if there's a specific handler for this error 273 name = 'http_error_%d' % errcode 274 if hasattr(self, name): 275 method = getattr(self, name) 276 result = method(url, fp, errcode, errmsg, headers) 277 if result: return result 278 return self.http_error_default( 279 url, fp, errcode, errmsg, headers) 280 281 # Default http error handler: close the connection and raises IOError 282 def http_error_default(self, url, fp, errcode, errmsg, headers): 283 void = fp.read() 284 fp.close() 285 raise IOError, ('http error', errcode, errmsg, headers) 286 287 # Use Gopher protocol 288 def open_gopher(self, url): 289 import gopherlib 290 host, selector = splithost(url) 291 if not host: raise IOError, ('gopher error', 'no host given') 292 type, selector = splitgophertype(selector) 293 selector, query = 
splitquery(selector) 294 selector = unquote(selector) 295 if query: 296 query = unquote(query) 297 fp = gopherlib.send_query(selector, query, host) 298 else: 299 fp = gopherlib.send_selector(selector, host) 300 return addinfourl(fp, noheaders(), "gopher:" + url) 301 302 # Use local file or FTP depending on form of URL 303 def open_file(self, url): 304 if url[:2] == '//' and url[2:3] != '/': 305 return self.open_ftp(url) 306 else: 307 return self.open_local_file(url) 308 309 # Use local file 310 def open_local_file(self, url): 311 import mimetypes, mimetools, StringIO 312 mtype = mimetypes.guess_type(url)[0] 313 headers = mimetools.Message(StringIO.StringIO( 314 'Content-Type: %s\n' % (mtype or 'text/plain'))) 315 host, file = splithost(url) 316 if not host: 317 return addinfourl( 318 open(url2pathname(file), 'rb'), 319 headers, 'file:'+file) 320 host, port = splitport(host) 321 if not port and socket.gethostbyname(host) in ( 322 localhost(), thishost()): 323 file = unquote(file) 324 return addinfourl( 325 open(url2pathname(file), 'rb'), 326 headers, 'file:'+file) 327 raise IOError, ('local file error', 'not on local host') 328 329 # Use FTP protocol 330 def open_ftp(self, url): 331 host, path = splithost(url) 332 if not host: raise IOError, ('ftp error', 'no host given') 333 host, port = splitport(host) 334 user, host = splituser(host) 335 if user: user, passwd = splitpasswd(user) 336 else: passwd = None 337 host = socket.gethostbyname(host) 338 if not port: 339 import ftplib 340 port = ftplib.FTP_PORT 341 else: 342 port = int(port) 343 path, attrs = splitattr(path) 344 dirs = string.splitfields(path, '/') 345 dirs, file = dirs[:-1], dirs[-1] 346 if dirs and not dirs[0]: dirs = dirs[1:] 347 key = (user, host, port, string.joinfields(dirs, '/')) 348 if len(self.ftpcache) > MAXFTPCACHE: 349 # Prune the cache, rather arbitrarily 350 for k in self.ftpcache.keys(): 351 if k != key: 352 v = self.ftpcache[k] 353 del self.ftpcache[k] 354 v.close() 355 try: 356 if not 
self.ftpcache.has_key(key): 357 self.ftpcache[key] = \ 358 ftpwrapper(user, passwd, 359 host, port, dirs) 360 if not file: type = 'D' 361 else: type = 'I' 362 for attr in attrs: 363 attr, value = splitvalue(attr) 364 if string.lower(attr) == 'type' and \ 365 value in ('a', 'A', 'i', 'I', 'd', 'D'): 366 type = string.upper(value) 367 return addinfourl( 368 self.ftpcache[key].retrfile(file, type), 369 noheaders(), "ftp:" + url) 370 except ftperrors(), msg: 371 raise IOError, ('ftp error', msg), sys.exc_info()[2] 372 373 # Use "data" URL 374 def open_data(self, url, data=None): 375 # ignore POSTed data 376 # 377 # syntax of data URLs: 378 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 379 # mediatype := [ type "/" subtype ] *( ";" parameter ) 380 # data := *urlchar 381 # parameter := attribute "=" value 382 import StringIO, mimetools, time 383 try: 384 [type, data] = string.split(url, ',', 1) 385 except ValueError: 386 raise IOError, ('data error', 'bad data URL') 387 if not type: 388 type = 'text/plain;charset=US-ASCII' 389 semi = string.rfind(type, ';') 390 if semi >= 0 and '=' not in type[semi:]: 391 encoding = type[semi+1:] 392 type = type[:semi] 393 else: 394 encoding = '' 395 msg = [] 396 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', 397 time.gmtime(time.time()))) 398 msg.append('Content-type: %s' % type) 399 if encoding == 'base64': 400 import base64 401 data = base64.decodestring(data) 402 else: 403 data = unquote(data) 404 msg.append('Content-length: %d' % len(data)) 405 msg.append('') 406 msg.append(data) 407 msg = string.join(msg, '\n') 408 f = StringIO.StringIO(msg) 409 headers = mimetools.Message(f, 0) 410 f.fileno = None # needed for addinfourl 411 return addinfourl(f, headers, url) 412 413 414# Derived class with handlers for errors we can handle (perhaps) 415class FancyURLopener(URLopener): 416 417 def __init__(self, *args): 418 apply(URLopener.__init__, (self,) + args) 419 self.auth_cache = {} 420 421 # Default error handling 
-- don't raise an exception 422 def http_error_default(self, url, fp, errcode, errmsg, headers): 423 return addinfourl(fp, headers, "http:" + url) 424 425 # Error 302 -- relocated (temporarily) 426 def http_error_302(self, url, fp, errcode, errmsg, headers): 427 # XXX The server can force infinite recursion here! 428 if headers.has_key('location'): 429 newurl = headers['location'] 430 elif headers.has_key('uri'): 431 newurl = headers['uri'] 432 else: 433 return 434 void = fp.read() 435 fp.close() 436 return self.open(newurl) 437 438 # Error 301 -- also relocated (permanently) 439 http_error_301 = http_error_302 440 441 # Error 401 -- authentication required 442 # See this URL for a description of the basic authentication scheme: 443 # http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt 444 def http_error_401(self, url, fp, errcode, errmsg, headers): 445 if headers.has_key('www-authenticate'): 446 stuff = headers['www-authenticate'] 447 import re 448 match = re.match( 449 '[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 450 if match: 451 scheme, realm = match.groups() 452 if string.lower(scheme) == 'basic': 453 return self.retry_http_basic_auth( 454 url, realm) 455 456 def retry_http_basic_auth(self, url, realm): 457 host, selector = splithost(url) 458 i = string.find(host, '@') + 1 459 host = host[i:] 460 user, passwd = self.get_user_passwd(host, realm, i) 461 if not (user or passwd): return None 462 host = user + ':' + passwd + '@' + host 463 newurl = 'http://' + host + selector 464 return self.open(newurl) 465 466 def get_user_passwd(self, host, realm, clear_cache = 0): 467 key = realm + '@' + string.lower(host) 468 if self.auth_cache.has_key(key): 469 if clear_cache: 470 del self.auth_cache[key] 471 else: 472 return self.auth_cache[key] 473 user, passwd = self.prompt_user_passwd(host, realm) 474 if user or passwd: self.auth_cache[key] = (user, passwd) 475 return user, passwd 476 477 def prompt_user_passwd(self, host, realm): 478 # Override this in 
a GUI environment! 479 import getpass 480 try: 481 user = raw_input("Enter username for %s at %s: " % 482 (realm, host)) 483 passwd = getpass.getpass( 484 "Enter password for %s in %s at %s: " % 485 (user, realm, host)) 486 return user, passwd 487 except KeyboardInterrupt: 488 print 489 return None, None 490 491 492# Utility functions 493 494# Return the IP address of the magic hostname 'localhost' 495_localhost = None 496def localhost(): 497 global _localhost 498 if not _localhost: 499 _localhost = socket.gethostbyname('localhost') 500 return _localhost 501 502# Return the IP address of the current host 503_thishost = None 504def thishost(): 505 global _thishost 506 if not _thishost: 507 _thishost = socket.gethostbyname(socket.gethostname()) 508 return _thishost 509 510# Return the set of errors raised by the FTP class 511_ftperrors = None 512def ftperrors(): 513 global _ftperrors 514 if not _ftperrors: 515 import ftplib 516 _ftperrors = ftplib.all_errors 517 return _ftperrors 518 519# Return an empty mimetools.Message object 520_noheaders = None 521def noheaders(): 522 global _noheaders 523 if not _noheaders: 524 import mimetools 525 import StringIO 526 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 527 _noheaders.fp.close() # Recycle file descriptor 528 return _noheaders 529 530 531# Utility classes 532 533# Class used by open_ftp() for cache of open FTP connections 534class ftpwrapper: 535 def __init__(self, user, passwd, host, port, dirs): 536 self.user = unquote(user or '') 537 self.passwd = unquote(passwd or '') 538 self.host = host 539 self.port = port 540 self.dirs = [] 541 for dir in dirs: 542 self.dirs.append(unquote(dir)) 543 self.init() 544 def init(self): 545 import ftplib 546 self.busy = 0 547 self.ftp = ftplib.FTP() 548 self.ftp.connect(self.host, self.port) 549 self.ftp.login(self.user, self.passwd) 550 for dir in self.dirs: 551 self.ftp.cwd(dir) 552 def retrfile(self, file, type): 553 import ftplib 554 self.endtransfer() 555 if type in 
('d', 'D'): cmd = 'TYPE A'; isdir = 1 556 else: cmd = 'TYPE ' + type; isdir = 0 557 try: 558 self.ftp.voidcmd(cmd) 559 except ftplib.all_errors: 560 self.init() 561 self.ftp.voidcmd(cmd) 562 conn = None 563 if file and not isdir: 564 # Use nlst to see if the file exists at all 565 try: 566 self.ftp.nlst(file) 567 except ftplib.error_perm, reason: 568 raise IOError, ('ftp error', reason), \ 569 sys.exc_info()[2] 570 # Restore the transfer mode! 571 self.ftp.voidcmd(cmd) 572 # Try to retrieve as a file 573 try: 574 cmd = 'RETR ' + file 575 conn = self.ftp.transfercmd(cmd) 576 except ftplib.error_perm, reason: 577 if reason[:3] != '550': 578 raise IOError, ('ftp error', reason), \ 579 sys.exc_info()[2] 580 if not conn: 581 # Set transfer mode to ASCII! 582 self.ftp.voidcmd('TYPE A') 583 # Try a directory listing 584 if file: cmd = 'LIST ' + file 585 else: cmd = 'LIST' 586 conn = self.ftp.transfercmd(cmd) 587 self.busy = 1 588 return addclosehook(conn.makefile('rb'), self.endtransfer) 589 def endtransfer(self): 590 if not self.busy: 591 return 592 self.busy = 0 593 try: 594 self.ftp.voidresp() 595 except ftperrors(): 596 pass 597 def close(self): 598 self.endtransfer() 599 try: 600 self.ftp.close() 601 except ftperrors(): 602 pass 603 604# Base class for addinfo and addclosehook 605class addbase: 606 def __init__(self, fp): 607 self.fp = fp 608 self.read = self.fp.read 609 self.readline = self.fp.readline 610 self.readlines = self.fp.readlines 611 self.fileno = self.fp.fileno 612 def __repr__(self): 613 return '<%s at %s whose fp = %s>' % ( 614 self.__class__.__name__, `id(self)`, `self.fp`) 615 def close(self): 616 self.read = None 617 self.readline = None 618 self.readlines = None 619 self.fileno = None 620 if self.fp: self.fp.close() 621 self.fp = None 622 623# Class to add a close hook to an open file 624class addclosehook(addbase): 625 def __init__(self, fp, closehook, *hookargs): 626 addbase.__init__(self, fp) 627 self.closehook = closehook 628 self.hookargs = 
hookargs 629 def close(self): 630 if self.closehook: 631 apply(self.closehook, self.hookargs) 632 self.closehook = None 633 self.hookargs = None 634 addbase.close(self) 635 636# class to add an info() method to an open file 637class addinfo(addbase): 638 def __init__(self, fp, headers): 639 addbase.__init__(self, fp) 640 self.headers = headers 641 def info(self): 642 return self.headers 643 644# class to add info() and geturl() methods to an open file 645class addinfourl(addbase): 646 def __init__(self, fp, headers, url): 647 addbase.__init__(self, fp) 648 self.headers = headers 649 self.url = url 650 def info(self): 651 return self.headers 652 def geturl(self): 653 return self.url 654 655 656# Utility to combine a URL with a base URL to form a new URL 657 658def basejoin(base, url): 659 type, path = splittype(url) 660 if type: 661 # if url is complete (i.e., it contains a type), return it 662 return url 663 host, path = splithost(path) 664 type, basepath = splittype(base) # inherit type from base 665 if host: 666 # if url contains host, just inherit type 667 if type: return type + '://' + host + path 668 else: 669 # no type inherited, so url must have started with // 670 # just return it 671 return url 672 host, basepath = splithost(basepath) # inherit host 673 basepath, basetag = splittag(basepath) # remove extraneuous cruft 674 basepath, basequery = splitquery(basepath) # idem 675 if path[:1] != '/': 676 # non-absolute path name 677 if path[:1] in ('#', '?'): 678 # path is just a tag or query, attach to basepath 679 i = len(basepath) 680 else: 681 # else replace last component 682 i = string.rfind(basepath, '/') 683 if i < 0: 684 # basepath not absolute 685 if host: 686 # host present, make absolute 687 basepath = '/' 688 else: 689 # else keep non-absolute 690 basepath = '' 691 else: 692 # remove last file component 693 basepath = basepath[:i+1] 694 # Interpret ../ (important because of symlinks) 695 while basepath and path[:3] == '../': 696 path = path[3:] 697 
i = string.rfind(basepath[:-1], '/') 698 if i > 0: 699 basepath = basepath[:i+1] 700 elif i == 0: 701 basepath = '/' 702 break 703 else: 704 basepath = '' 705 706 path = basepath + path 707 if type and host: return type + '://' + host + path 708 elif type: return type + ':' + path 709 elif host: return '//' + host + path # don't know what this means 710 else: return path 711 712 713# Utilities to parse URLs (most of these return None for missing parts): 714# unwrap('<URL:type://host/path>') --> 'type://host/path' 715# splittype('type:opaquestring') --> 'type', 'opaquestring' 716# splithost('//host[:port]/path') --> 'host[:port]', '/path' 717# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' 718# splitpasswd('user:passwd') -> 'user', 'passwd' 719# splitport('host:port') --> 'host', 'port' 720# splitquery('/path?query') --> '/path', 'query' 721# splittag('/path#tag') --> '/path', 'tag' 722# splitattr('/path;attr1=value1;attr2=value2;...') -> 723# '/path', ['attr1=value1', 'attr2=value2', ...] 
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

# Strip surrounding whitespace, an enclosing <...> pair and a
# leading 'URL:' marker, in that order.
def unwrap(url):
    text = string.strip(url)
    bracketed = text[:1] == '<' and text[-1:] == '>'
    if bracketed:
        text = string.strip(text[1:-1])
    if text[:4] != 'URL:':
        return text
    return string.strip(text[4:])

# Each split*() helper below compiles its pattern on first use and
# keeps it in a module-level variable.

# 'type:rest' --> ('type', 'rest'); (None, url) without a type
_typeprog = None
def splittype(url):
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    # The match ends just past the ':' that follows the scheme
    return found.group(1), url[found.end():]

# '//host/path' --> ('host', '/path'); (None, url) without '//host'
_hostprog = None
def splithost(url):
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]+)(.*)$')

    found = _hostprog.match(url)
    if not found:
        return None, url
    return found.groups()

# 'user@host' --> ('user', 'host'); (None, host) without an '@'
_userprog = None
def splituser(host):
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return found.groups()

# 'user:passwd' --> ('user', 'passwd'); (user, None) without a ':'
_passwdprog = None
def splitpasswd(user):
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if not found:
        return user, None
    return found.groups()

# 'host:port' --> ('host', 'port'), the port being a string of
# digits; (host, None) when there is no ':digits' suffix
_portprog = None
def splitport(host):
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if not found:
        return host, None
    return found.groups()

# Split host and port, returning numeric port.
# Return given default port if no ':' found; defaults to -1.
# Return numerical port if a valid number is found after ':'.
# Return None if ':' but not a valid number.
797_nportprog = None 798def splitnport(host, defport=-1): 799 global _nportprog 800 if _nportprog is None: 801 import re 802 _nportprog = re.compile('^(.*):(.*)$') 803 804 match = _nportprog.match(host) 805 if match: 806 host, port = match.group(1, 2) 807 try: 808 if not port: raise string.atoi_error, "no digits" 809 nport = string.atoi(port) 810 except string.atoi_error: 811 nport = None 812 return host, nport 813 return host, defport 814 815_queryprog = None 816def splitquery(url): 817 global _queryprog 818 if _queryprog is None: 819 import re 820 _queryprog = re.compile('^(.*)\?([^?]*)$') 821 822 match = _queryprog.match(url) 823 if match: return match.group(1, 2) 824 return url, None 825 826_tagprog = None 827def splittag(url): 828 global _tagprog 829 if _tagprog is None: 830 import re 831 _tagprog = re.compile('^(.*)#([^#]*)$') 832 833 match = _tagprog.match(url) 834 if match: return match.group(1, 2) 835 return url, None 836 837def splitattr(url): 838 words = string.splitfields(url, ';') 839 return words[0], words[1:] 840 841_valueprog = None 842def splitvalue(attr): 843 global _valueprog 844 if _valueprog is None: 845 import re 846 _valueprog = re.compile('^([^=]*)=(.*)$') 847 848 match = _valueprog.match(attr) 849 if match: return match.group(1, 2) 850 return attr, None 851 852def splitgophertype(selector): 853 if selector[:1] == '/' and selector[1:2]: 854 return selector[1], selector[2:] 855 return None, selector 856 857_quoteprog = None 858def unquote(s): 859 global _quoteprog 860 if _quoteprog is None: 861 import re 862 _quoteprog = re.compile('%[0-9a-fA-F][0-9a-fA-F]') 863 864 i = 0 865 n = len(s) 866 res = [] 867 while 0 <= i < n: 868 match = _quoteprog.search(s, i) 869 if not match: 870 res.append(s[i:]) 871 break 872 j = match.start(0) 873 res.append(s[i:j] + chr(string.atoi(s[j+1:j+3], 16))) 874 i = j+3 875 return string.joinfields(res, '') 876 877def unquote_plus(s): 878 if '+' in s: 879 # replace '+' with ' ' 880 s = string.join(string.split(s, 
'+'), ' ') 881 return unquote(s) 882 883always_safe = string.letters + string.digits + '_,.-' 884def quote(s, safe = '/'): 885 safe = always_safe + safe 886 res = [] 887 for c in s: 888 if c in safe: 889 res.append(c) 890 else: 891 res.append('%%%02x' % ord(c)) 892 return string.joinfields(res, '') 893 894def quote_plus(s, safe = '/'): 895 if ' ' in s: 896 # replace ' ' with '+' 897 s = string.join(string.split(s, ' '), '+') 898 return quote(s, safe + '+') 899 else: 900 return quote(s, safe) 901 902 903# Proxy handling 904def getproxies(): 905 """Return a dictionary of protocol scheme -> proxy server URL mappings. 906 907 Scan the environment for variables named <scheme>_proxy; 908 this seems to be the standard convention. If you need a 909 different way, you can pass a proxies dictionary to the 910 [Fancy]URLopener constructor. 911 912 """ 913 proxies = {} 914 for name, value in os.environ.items(): 915 name = string.lower(name) 916 if value and name[-6:] == '_proxy': 917 proxies[name[:-6]] = value 918 return proxies 919 920 921# Test and time quote() and unquote() 922def test1(): 923 import time 924 s = '' 925 for i in range(256): s = s + chr(i) 926 s = s*4 927 t0 = time.time() 928 qs = quote(s) 929 uqs = unquote(qs) 930 t1 = time.time() 931 if uqs != s: 932 print 'Wrong!' 
933 print `s` 934 print `qs` 935 print `uqs` 936 print round(t1 - t0, 3), 'sec' 937 938 939# Test program 940def test(): 941 import sys 942 args = sys.argv[1:] 943 if not args: 944 args = [ 945 '/etc/passwd', 946 'file:/etc/passwd', 947 'file://localhost/etc/passwd', 948 'ftp://ftp.python.org/etc/passwd', 949 'gopher://gopher.micro.umn.edu/1/', 950 'http://www.python.org/index.html', 951 ] 952 try: 953 for url in args: 954 print '-'*10, url, '-'*10 955 fn, h = urlretrieve(url) 956 print fn, h 957 if h: 958 print '======' 959 for k in h.keys(): print k + ':', h[k] 960 print '======' 961 fp = open(fn, 'rb') 962 data = fp.read() 963 del fp 964 if '\r' in data: 965 table = string.maketrans("", "") 966 data = string.translate(data, table, "\r") 967 print data 968 fn, h = None, None 969 print '-'*40 970 finally: 971 urlcleanup() 972 973# Run test program when run as a script 974if __name__ == '__main__': 975 test1() 976 test() 977