# urllib.py revision 332e14437c6e7461e9756f75e4fac3f9d2043023
# Open an arbitrary URL
#
# See the following document for more info on URLs:
# "Names and Addresses, URIs, URLs, URNs, URCs", at
# http://www.w3.org/pub/WWW/Addressing/Overview.html
#
# See also the HTTP spec (from which the error codes are derived):
# "HTTP - Hypertext Transfer Protocol", at
# http://www.w3.org/pub/WWW/Protocols/
#
# Related standards and specs:
# - RFC1808: the "relative URL" spec. (authoritative status)
# - RFC1738 - the "URL standard". (authoritative status)
# - RFC1630 - the "URI spec". (informational status)
#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns a mimetools.Message object which can be
# used to query various info about the object, if available.
# (mimetools.Message objects are queried with the getheader() method.)

import string
import socket
import os
import sys


__version__ = '1.8'

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems.
# url2pathname() maps the path part of a URL to a local pathname and
# pathname2url() does the reverse.  On 'mac' and 'nt' the conversion is
# delegated to a platform module; everywhere else both functions return
# their argument unchanged (no quoting or unquoting is done here).
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return pathname
    def pathname2url(pathname):
        return pathname

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?
50 51 52# Shortcut for basic usage 53_urlopener = None 54def urlopen(url, data=None): 55 global _urlopener 56 if not _urlopener: 57 _urlopener = FancyURLopener() 58 if data is None: 59 return _urlopener.open(url) 60 else: 61 return _urlopener.open(url, data) 62def urlretrieve(url, filename=None): 63 global _urlopener 64 if not _urlopener: 65 _urlopener = FancyURLopener() 66 if filename: 67 return _urlopener.retrieve(url, filename) 68 else: 69 return _urlopener.retrieve(url) 70def urlcleanup(): 71 if _urlopener: 72 _urlopener.cleanup() 73 74 75# Class to open URLs. 76# This is a class rather than just a subroutine because we may need 77# more than one set of global protocol-specific options. 78# Note -- this is a base class for those who don't want the 79# automatic handling of errors type 302 (relocated) and 401 80# (authorization needed). 81ftpcache = {} 82class URLopener: 83 84 __tempfiles = [] 85 86 # Constructor 87 def __init__(self, proxies=None): 88 if proxies is None: 89 proxies = getproxies() 90 self.proxies = proxies 91 server_version = "Python-urllib/%s" % __version__ 92 self.addheaders = [('User-agent', server_version)] 93 self.__tempfiles = [] 94 self.tempcache = None 95 # Undocumented feature: if you assign {} to tempcache, 96 # it is used to cache files retrieved with 97 # self.retrieve(). This is not enabled by default 98 # since it does not work for changing documents (and I 99 # haven't got the logic to check expiration headers 100 # yet). 
101 self.ftpcache = ftpcache 102 # Undocumented feature: you can use a different 103 # ftp cache by assigning to the .ftpcache member; 104 # in case you want logically independent URL openers 105 106 def __del__(self): 107 self.close() 108 109 def close(self): 110 self.cleanup() 111 112 def cleanup(self): 113 if self.__tempfiles: 114 import os 115 for file in self.__tempfiles: 116 try: 117 os.unlink(file) 118 except os.error: 119 pass 120 URLopener.__tempfiles = [] 121 self.tempcache = None 122 123 # Add a header to be used by the HTTP interface only 124 # e.g. u.addheader('Accept', 'sound/basic') 125 def addheader(self, *args): 126 self.addheaders.append(args) 127 128 # External interface 129 # Use URLopener().open(file) instead of open(file, 'r') 130 def open(self, fullurl, data=None): 131 fullurl = unwrap(fullurl) 132 type, url = splittype(fullurl) 133 if not type: type = 'file' 134 self.openedurl = '%s:%s' % (type, url) 135 if self.proxies.has_key(type): 136 proxy = self.proxies[type] 137 type, proxy = splittype(proxy) 138 host, selector = splithost(proxy) 139 url = (host, fullurl) # Signal special case to open_*() 140 name = 'open_' + type 141 if '-' in name: 142 # replace - with _ 143 name = string.join(string.split(name, '-'), '_') 144 if not hasattr(self, name): 145 if data is None: 146 return self.open_unknown(fullurl) 147 else: 148 return self.open_unknown(fullurl, data) 149 try: 150 if data is None: 151 return getattr(self, name)(url) 152 else: 153 return getattr(self, name)(url, data) 154 except socket.error, msg: 155 raise IOError, ('socket error', msg), sys.exc_info()[2] 156 157 # Overridable interface to open unknown URL type 158 def open_unknown(self, fullurl, data=None): 159 type, url = splittype(fullurl) 160 raise IOError, ('url error', 'unknown url type', type) 161 162 # External interface 163 # retrieve(url) returns (filename, None) for a local object 164 # or (tempfilename, headers) for a remote object 165 def retrieve(self, url, 
filename=None): 166 if self.tempcache and self.tempcache.has_key(url): 167 return self.tempcache[url] 168 url1 = unwrap(url) 169 self.openedurl = url1 170 if self.tempcache and self.tempcache.has_key(url1): 171 self.tempcache[url] = self.tempcache[url1] 172 return self.tempcache[url1] 173 type, url1 = splittype(url1) 174 if not filename and (not type or type == 'file'): 175 try: 176 fp = self.open_local_file(url1) 177 del fp 178 return url2pathname(splithost(url1)[1]), None 179 except IOError, msg: 180 pass 181 fp = self.open(url) 182 headers = fp.info() 183 if not filename: 184 import tempfile 185 filename = tempfile.mktemp() 186 self.__tempfiles.append(filename) 187 result = filename, headers 188 if self.tempcache is not None: 189 self.tempcache[url] = result 190 tfp = open(filename, 'wb') 191 bs = 1024*8 192 block = fp.read(bs) 193 while block: 194 tfp.write(block) 195 block = fp.read(bs) 196 fp.close() 197 tfp.close() 198 del fp 199 del tfp 200 return result 201 202 # Each method named open_<type> knows how to open that type of URL 203 204 # Use HTTP protocol 205 def open_http(self, url, data=None): 206 import httplib 207 if type(url) is type(""): 208 host, selector = splithost(url) 209 user_passwd, host = splituser(host) 210 realhost = host 211 else: 212 host, selector = url 213 urltype, rest = splittype(selector) 214 user_passwd = None 215 if string.lower(urltype) != 'http': 216 realhost = None 217 else: 218 realhost, rest = splithost(rest) 219 user_passwd, realhost = splituser(realhost) 220 if user_passwd: 221 selector = "%s://%s%s" % (urltype, 222 realhost, rest) 223 #print "proxy via http:", host, selector 224 if not host: raise IOError, ('http error', 'no host given') 225 if user_passwd: 226 import base64 227 auth = string.strip(base64.encodestring(user_passwd)) 228 else: 229 auth = None 230 h = httplib.HTTP(host) 231 if data is not None: 232 h.putrequest('POST', selector) 233 h.putheader('Content-type', 234 'application/x-www-form-urlencoded') 235 
h.putheader('Content-length', '%d' % len(data)) 236 else: 237 h.putrequest('GET', selector) 238 if auth: h.putheader('Authorization', 'Basic %s' % auth) 239 if realhost: h.putheader('Host', realhost) 240 for args in self.addheaders: apply(h.putheader, args) 241 h.endheaders() 242 if data is not None: 243 h.send(data + '\r\n') 244 errcode, errmsg, headers = h.getreply() 245 fp = h.getfile() 246 if errcode == 200: 247 return addinfourl(fp, headers, self.openedurl) 248 else: 249 return self.http_error(url, 250 fp, errcode, errmsg, headers) 251 252 # Handle http errors. 253 # Derived class can override this, or provide specific handlers 254 # named http_error_DDD where DDD is the 3-digit error code 255 def http_error(self, url, fp, errcode, errmsg, headers): 256 # First check if there's a specific handler for this error 257 name = 'http_error_%d' % errcode 258 if hasattr(self, name): 259 method = getattr(self, name) 260 result = method(url, fp, errcode, errmsg, headers) 261 if result: return result 262 return self.http_error_default( 263 url, fp, errcode, errmsg, headers) 264 265 # Default http error handler: close the connection and raises IOError 266 def http_error_default(self, url, fp, errcode, errmsg, headers): 267 void = fp.read() 268 fp.close() 269 raise IOError, ('http error', errcode, errmsg, headers) 270 271 # Use Gopher protocol 272 def open_gopher(self, url): 273 import gopherlib 274 host, selector = splithost(url) 275 if not host: raise IOError, ('gopher error', 'no host given') 276 type, selector = splitgophertype(selector) 277 selector, query = splitquery(selector) 278 selector = unquote(selector) 279 if query: 280 query = unquote(query) 281 fp = gopherlib.send_query(selector, query, host) 282 else: 283 fp = gopherlib.send_selector(selector, host) 284 return addinfourl(fp, noheaders(), self.openedurl) 285 286 # Use local file or FTP depending on form of URL 287 def open_file(self, url): 288 if url[:2] == '//' and url[2:3] != '/': 289 return 
self.open_ftp(url) 290 else: 291 return self.open_local_file(url) 292 293 # Use local file 294 def open_local_file(self, url): 295 host, file = splithost(url) 296 if not host: 297 return addinfourl( 298 open(url2pathname(file), 'rb'), 299 noheaders(), 'file:'+file) 300 host, port = splitport(host) 301 if not port and socket.gethostbyname(host) in ( 302 localhost(), thishost()): 303 file = unquote(file) 304 return addinfourl( 305 open(url2pathname(file), 'rb'), 306 noheaders(), 'file:'+file) 307 raise IOError, ('local file error', 'not on local host') 308 309 # Use FTP protocol 310 def open_ftp(self, url): 311 host, path = splithost(url) 312 if not host: raise IOError, ('ftp error', 'no host given') 313 host, port = splitport(host) 314 user, host = splituser(host) 315 if user: user, passwd = splitpasswd(user) 316 else: passwd = None 317 host = socket.gethostbyname(host) 318 if not port: 319 import ftplib 320 port = ftplib.FTP_PORT 321 path, attrs = splitattr(path) 322 dirs = string.splitfields(path, '/') 323 dirs, file = dirs[:-1], dirs[-1] 324 if dirs and not dirs[0]: dirs = dirs[1:] 325 key = (user, host, port, string.joinfields(dirs, '/')) 326 if len(self.ftpcache) > MAXFTPCACHE: 327 # Prune the cache, rather arbitrarily 328 for k in self.ftpcache.keys(): 329 if k != key: 330 v = self.ftpcache[k] 331 del self.ftpcache[k] 332 v.close() 333 try: 334 if not self.ftpcache.has_key(key): 335 self.ftpcache[key] = \ 336 ftpwrapper(user, passwd, 337 host, port, dirs) 338 if not file: type = 'D' 339 else: type = 'I' 340 for attr in attrs: 341 attr, value = splitvalue(attr) 342 if string.lower(attr) == 'type' and \ 343 value in ('a', 'A', 'i', 'I', 'd', 'D'): 344 type = string.upper(value) 345 return addinfourl( 346 self.ftpcache[key].retrfile(file, type), 347 noheaders(), self.openedurl) 348 except ftperrors(), msg: 349 raise IOError, ('ftp error', msg), sys.exc_info()[2] 350 351 352# Derived class with handlers for errors we can handle (perhaps) 353class 
FancyURLopener(URLopener): 354 355 def __init__(self, *args): 356 apply(URLopener.__init__, (self,) + args) 357 self.auth_cache = {} 358 359 # Default error handling -- don't raise an exception 360 def http_error_default(self, url, fp, errcode, errmsg, headers): 361 return addinfourl(fp, headers, self.openedurl) 362 363 # Error 302 -- relocated (temporarily) 364 def http_error_302(self, url, fp, errcode, errmsg, headers): 365 # XXX The server can force infinite recursion here! 366 if headers.has_key('location'): 367 newurl = headers['location'] 368 elif headers.has_key('uri'): 369 newurl = headers['uri'] 370 else: 371 return 372 void = fp.read() 373 fp.close() 374 return self.open(newurl) 375 376 # Error 301 -- also relocated (permanently) 377 http_error_301 = http_error_302 378 379 # Error 401 -- authentication required 380 # See this URL for a description of the basic authentication scheme: 381 # http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt 382 def http_error_401(self, url, fp, errcode, errmsg, headers): 383 if headers.has_key('www-authenticate'): 384 stuff = headers['www-authenticate'] 385 import re 386 match = re.match( 387 '[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 388 if match: 389 scheme, realm = match.group() 390 if string.lower(scheme) == 'basic': 391 return self.retry_http_basic_auth( 392 url, realm) 393 394 def retry_http_basic_auth(self, url, realm): 395 host, selector = splithost(url) 396 i = string.find(host, '@') + 1 397 host = host[i:] 398 user, passwd = self.get_user_passwd(host, realm, i) 399 if not (user or passwd): return None 400 host = user + ':' + passwd + '@' + host 401 newurl = '//' + host + selector 402 return self.open_http(newurl) 403 404 def get_user_passwd(self, host, realm, clear_cache = 0): 405 key = realm + '@' + string.lower(host) 406 if self.auth_cache.has_key(key): 407 if clear_cache: 408 del self.auth_cache[key] 409 else: 410 return self.auth_cache[key] 411 user, passwd = self.prompt_user_passwd(host, 
realm) 412 if user or passwd: self.auth_cache[key] = (user, passwd) 413 return user, passwd 414 415 def prompt_user_passwd(self, host, realm): 416 # Override this in a GUI environment! 417 try: 418 user = raw_input("Enter username for %s at %s: " % 419 (realm, host)) 420 self.echo_off() 421 try: 422 passwd = raw_input( 423 "Enter password for %s in %s at %s: " % 424 (user, realm, host)) 425 finally: 426 self.echo_on() 427 return user, passwd 428 except KeyboardInterrupt: 429 return None, None 430 431 def echo_off(self): 432 import os 433 os.system("stty -echo") 434 435 def echo_on(self): 436 import os 437 print 438 os.system("stty echo") 439 440 441# Utility functions 442 443# Return the IP address of the magic hostname 'localhost' 444_localhost = None 445def localhost(): 446 global _localhost 447 if not _localhost: 448 _localhost = socket.gethostbyname('localhost') 449 return _localhost 450 451# Return the IP address of the current host 452_thishost = None 453def thishost(): 454 global _thishost 455 if not _thishost: 456 _thishost = socket.gethostbyname(socket.gethostname()) 457 return _thishost 458 459# Return the set of errors raised by the FTP class 460_ftperrors = None 461def ftperrors(): 462 global _ftperrors 463 if not _ftperrors: 464 import ftplib 465 _ftperrors = ftplib.all_errors 466 return _ftperrors 467 468# Return an empty mimetools.Message object 469_noheaders = None 470def noheaders(): 471 global _noheaders 472 if not _noheaders: 473 import mimetools 474 import StringIO 475 _noheaders = mimetools.Message(StringIO.StringIO(), 0) 476 _noheaders.fp.close() # Recycle file descriptor 477 return _noheaders 478 479 480# Utility classes 481 482# Class used by open_ftp() for cache of open FTP connections 483class ftpwrapper: 484 def __init__(self, user, passwd, host, port, dirs): 485 self.user = unquote(user or '') 486 self.passwd = unquote(passwd or '') 487 self.host = host 488 self.port = port 489 self.dirs = [] 490 for dir in dirs: 491 
self.dirs.append(unquote(dir)) 492 self.init() 493 def init(self): 494 import ftplib 495 self.ftp = ftplib.FTP() 496 self.ftp.connect(self.host, self.port) 497 self.ftp.login(self.user, self.passwd) 498 for dir in self.dirs: 499 self.ftp.cwd(dir) 500 def retrfile(self, file, type): 501 import ftplib 502 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 503 else: cmd = 'TYPE ' + type; isdir = 0 504 try: 505 self.ftp.voidcmd(cmd) 506 except ftplib.all_errors: 507 self.init() 508 self.ftp.voidcmd(cmd) 509 conn = None 510 if file and not isdir: 511 try: 512 cmd = 'RETR ' + file 513 conn = self.ftp.transfercmd(cmd) 514 except ftplib.error_perm, reason: 515 if reason[:3] != '550': 516 raise IOError, ('ftp error', reason), \ 517 sys.exc_info()[2] 518 if not conn: 519 # Try a directory listing 520 if file: cmd = 'LIST ' + file 521 else: cmd = 'LIST' 522 conn = self.ftp.transfercmd(cmd) 523 return addclosehook(conn.makefile('rb'), self.endtransfer) 524 def endtransfer(self): 525 try: 526 self.ftp.voidresp() 527 except ftperrors(): 528 pass 529 def close(self): 530 try: 531 self.ftp.close() 532 except ftperrors(): 533 pass 534 535# Base class for addinfo and addclosehook 536class addbase: 537 def __init__(self, fp): 538 self.fp = fp 539 self.read = self.fp.read 540 self.readline = self.fp.readline 541 self.readlines = self.fp.readlines 542 self.fileno = self.fp.fileno 543 def __repr__(self): 544 return '<%s at %s whose fp = %s>' % ( 545 self.__class__.__name__, `id(self)`, `self.fp`) 546 def close(self): 547 self.read = None 548 self.readline = None 549 self.readlines = None 550 self.fileno = None 551 if self.fp: self.fp.close() 552 self.fp = None 553 554# Class to add a close hook to an open file 555class addclosehook(addbase): 556 def __init__(self, fp, closehook, *hookargs): 557 addbase.__init__(self, fp) 558 self.closehook = closehook 559 self.hookargs = hookargs 560 def close(self): 561 if self.closehook: 562 apply(self.closehook, self.hookargs) 563 self.closehook = None 
564 self.hookargs = None 565 addbase.close(self) 566 567# class to add an info() method to an open file 568class addinfo(addbase): 569 def __init__(self, fp, headers): 570 addbase.__init__(self, fp) 571 self.headers = headers 572 def info(self): 573 return self.headers 574 575# class to add info() and geturl() methods to an open file 576class addinfourl(addbase): 577 def __init__(self, fp, headers, url): 578 addbase.__init__(self, fp) 579 self.headers = headers 580 self.url = url 581 def info(self): 582 return self.headers 583 def geturl(self): 584 return self.url 585 586 587# Utility to combine a URL with a base URL to form a new URL 588 589def basejoin(base, url): 590 type, path = splittype(url) 591 if type: 592 # if url is complete (i.e., it contains a type), return it 593 return url 594 host, path = splithost(path) 595 type, basepath = splittype(base) # inherit type from base 596 if host: 597 # if url contains host, just inherit type 598 if type: return type + '://' + host + path 599 else: 600 # no type inherited, so url must have started with // 601 # just return it 602 return url 603 host, basepath = splithost(basepath) # inherit host 604 basepath, basetag = splittag(basepath) # remove extraneuous cruft 605 basepath, basequery = splitquery(basepath) # idem 606 if path[:1] != '/': 607 # non-absolute path name 608 if path[:1] in ('#', '?'): 609 # path is just a tag or query, attach to basepath 610 i = len(basepath) 611 else: 612 # else replace last component 613 i = string.rfind(basepath, '/') 614 if i < 0: 615 # basepath not absolute 616 if host: 617 # host present, make absolute 618 basepath = '/' 619 else: 620 # else keep non-absolute 621 basepath = '' 622 else: 623 # remove last file component 624 basepath = basepath[:i+1] 625 # Interpret ../ (important because of symlinks) 626 while basepath and path[:3] == '../': 627 path = path[3:] 628 i = string.rfind(basepath[:-1], '/') 629 if i > 0: 630 basepath = basepath[:i+1] 631 elif i == 0: 632 basepath = '/' 633 
break 634 else: 635 basepath = '' 636 637 path = basepath + path 638 if type and host: return type + '://' + host + path 639 elif type: return type + ':' + path 640 elif host: return '//' + host + path # don't know what this means 641 else: return path 642 643 644# Utilities to parse URLs (most of these return None for missing parts): 645# unwrap('<URL:type://host/path>') --> 'type://host/path' 646# splittype('type:opaquestring') --> 'type', 'opaquestring' 647# splithost('//host[:port]/path') --> 'host[:port]', '/path' 648# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' 649# splitpasswd('user:passwd') -> 'user', 'passwd' 650# splitport('host:port') --> 'host', 'port' 651# splitquery('/path?query') --> '/path', 'query' 652# splittag('/path#tag') --> '/path', 'tag' 653# splitattr('/path;attr1=value1;attr2=value2;...') -> 654# '/path', ['attr1=value1', 'attr2=value2', ...] 655# splitvalue('attr=value') --> 'attr', 'value' 656# splitgophertype('/Xselector') --> 'X', 'selector' 657# unquote('abc%20def') -> 'abc def' 658# quote('abc def') -> 'abc%20def') 659 660def unwrap(url): 661 url = string.strip(url) 662 if url[:1] == '<' and url[-1:] == '>': 663 url = string.strip(url[1:-1]) 664 if url[:4] == 'URL:': url = string.strip(url[4:]) 665 return url 666 667_typeprog = None 668def splittype(url): 669 global _typeprog 670 if _typeprog is None: 671 import re 672 _typeprog = re.compile('^([^/:]+):') 673 674 match = _typeprog.match(url) 675 if match: 676 scheme = match.group(1) 677 return scheme, url[len(scheme) + 1:] 678 return None, url 679 680_hostprog = None 681def splithost(url): 682 global _hostprog 683 if _hostprog is None: 684 import re 685 _hostprog = re.compile('^//([^/]+)(.*)$') 686 687 match = _hostprog.match(url) 688 if match: return match.group(1, 2) 689 return None, url 690 691_userprog = None 692def splituser(host): 693 global _userprog 694 if _userprog is None: 695 import re 696 _userprog = re.compile('^([^@]*)@(.*)$') 697 698 match 
= _userprog.match(host) 699 if match: return match.group(1, 2) 700 return None, host 701 702_passwdprog = None 703def splitpasswd(user): 704 global _passwdprog 705 if _passwdprog is None: 706 import re 707 _passwdprog = re.compile('^([^:]*):(.*)$') 708 709 match = _passwdprog.match(host) 710 if match: return match.group(1, 2) 711 return user, None 712 713_portprog = None 714def splitport(host): 715 global _portprog 716 if _portprog is None: 717 import re 718 _portprog = re.compile('^(.*):([0-9]+)$') 719 720 match = _portprog.match(host) 721 if match: return match.group(1, 2) 722 return host, None 723 724# Split host and port, returning numeric port. 725# Return given default port if no ':' found; defaults to -1. 726# Return numerical port if a valid number are found after ':'. 727# Return None if ':' but not a valid number. 728_nportprog = None 729def splitnport(host, defport=-1): 730 global _nportprog 731 if _nportprog is None: 732 import re 733 _nportprog = re.compile('^(.*):(.*)$') 734 735 match = _nportprog.match(host) 736 if match: 737 host, port = match.group(1, 2) 738 try: 739 if not port: raise string.atoi_error, "no digits" 740 nport = string.atoi(port) 741 except string.atoi_error: 742 nport = None 743 return host, nport 744 return host, defport 745 746_queryprog = None 747def splitquery(url): 748 global _queryprog 749 if _queryprog is None: 750 import re 751 _queryprog = re.compile('^(.*)\?([^?]*)$') 752 753 match = _queryprog.match(url) 754 if match: return match.group(1, 2) 755 return url, None 756 757_tagprog = None 758def splittag(url): 759 global _tagprog 760 if _tagprog is None: 761 import re 762 _tagprog = re.compile('^(.*)#([^#]*)$') 763 764 match = _tagprog.match(url) 765 if match: return match.group(1, 2) 766 return url, None 767 768def splitattr(url): 769 words = string.splitfields(url, ';') 770 return words[0], words[1:] 771 772_valueprog = None 773def splitvalue(attr): 774 global _valueprog 775 if _valueprog is None: 776 import re 777 
_valueprog = re.compile('^([^=]*)=(.*)$') 778 779 match = _valueprog.match(attr) 780 if match: return match.group(1, 2) 781 return attr, None 782 783def splitgophertype(selector): 784 if selector[:1] == '/' and selector[1:2]: 785 return selector[1], selector[2:] 786 return None, selector 787 788_quoteprog = None 789def unquote(s): 790 global _quoteprog 791 if _quoteprog is None: 792 import re 793 _quoteprog = re.compile('%[0-9a-fA-F][0-9a-fA-F]') 794 795 i = 0 796 n = len(s) 797 res = [] 798 while 0 <= i < n: 799 match = _quoteprog.search(s, i) 800 if not match: 801 res.append(s[i:]) 802 break 803 j = match.start(0) 804 res.append(s[i:j] + chr(string.atoi(s[j+1:j+3], 16))) 805 i = j+3 806 return string.joinfields(res, '') 807 808def unquote_plus(s): 809 if '+' in s: 810 # replace '+' with ' ' 811 s = string.join(string.split(s, '+'), ' ') 812 return unquote(s) 813 814always_safe = string.letters + string.digits + '_,.-' 815def quote(s, safe = '/'): 816 safe = always_safe + safe 817 res = [] 818 for c in s: 819 if c in safe: 820 res.append(c) 821 else: 822 res.append('%%%02x' % ord(c)) 823 return string.joinfields(res, '') 824 825def quote_plus(s, safe = '/'): 826 if ' ' in s: 827 # replace ' ' with '+' 828 s = string.join(string.split(s, ' '), '+') 829 return quote(s, safe + '+') 830 else: 831 return quote(s, safe) 832 833 834# Proxy handling 835def getproxies(): 836 """Return a dictionary of protocol scheme -> proxy server URL mappings. 837 838 Scan the environment for variables named <scheme>_proxy; 839 this seems to be the standard convention. If you need a 840 different way, you can pass a proxies dictionary to the 841 [Fancy]URLopener constructor. 
842 843 """ 844 proxies = {} 845 for name, value in os.environ.items(): 846 name = string.lower(name) 847 if value and name[-6:] == '_proxy': 848 proxies[name[:-6]] = value 849 return proxies 850 851 852# Test and time quote() and unquote() 853def test1(): 854 import time 855 s = '' 856 for i in range(256): s = s + chr(i) 857 s = s*4 858 t0 = time.time() 859 qs = quote(s) 860 uqs = unquote(qs) 861 t1 = time.time() 862 if uqs != s: 863 print 'Wrong!' 864 print `s` 865 print `qs` 866 print `uqs` 867 print round(t1 - t0, 3), 'sec' 868 869 870# Test program 871def test(): 872 import sys 873 args = sys.argv[1:] 874 if not args: 875 args = [ 876 '/etc/passwd', 877 'file:/etc/passwd', 878 'file://localhost/etc/passwd', 879 'ftp://ftp.python.org/etc/passwd', 880 'gopher://gopher.micro.umn.edu/1/', 881 'http://www.python.org/index.html', 882 ] 883 try: 884 for url in args: 885 print '-'*10, url, '-'*10 886 fn, h = urlretrieve(url) 887 print fn, h 888 if h: 889 print '======' 890 for k in h.keys(): print k + ':', h[k] 891 print '======' 892 fp = open(fn, 'rb') 893 data = fp.read() 894 del fp 895 if '\r' in data: 896 table = string.maketrans("", "") 897 data = string.translate(data, table, "\r") 898 print data 899 fn, h = None, None 900 print '-'*40 901 finally: 902 urlcleanup() 903 904# Run test program when run as a script 905if __name__ == '__main__': 906 test1() 907 test() 908