urllib.py revision 1c24592b925ba4716f2c0cec10bfe59cef2eed30
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that is has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""

import string
import socket
import os
import time
import sys
from urlparse import urljoin as basejoin

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "getproxies"]

__version__ = '1.17'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from."""
    from warnings import warnpy3k
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)

    global _urlopener
    if proxies is not None:
        # An explicit proxy mapping gets its own opener; the shared one
        # is reused (and lazily created) otherwise.
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url into filename (or a temporary file) via the shared
    opener; returns (filename, headers)."""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Clean up temporary files and module-level caches left behind by
    earlier urlopen/urlretrieve calls."""
    if _urlopener:
        _urlopener.cleanup()
    _safe_quoters.clear()
    ftpcache.clear()

# check for SSL
try:
    import ssl
except:
    _have_ssl = False
else:
    _have_ssl = True

# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """Raised by retrieve() when the downloaded size does not match the
    Content-Length header; .content holds the partial result."""
    def __init__(self, message, content):
        IOError.__init__(self, message)
        self.content = content

ftpcache = {}
class URLopener:
    """Class to open URLs.
117 This is a class rather than just a subroutine because we may need 118 more than one set of global protocol-specific options. 119 Note -- this is a base class for those who don't want the 120 automatic handling of errors type 302 (relocated) and 401 121 (authorization needed).""" 122 123 __tempfiles = None 124 125 version = "Python-urllib/%s" % __version__ 126 127 # Constructor 128 def __init__(self, proxies=None, **x509): 129 if proxies is None: 130 proxies = getproxies() 131 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 132 self.proxies = proxies 133 self.key_file = x509.get('key_file') 134 self.cert_file = x509.get('cert_file') 135 self.addheaders = [('User-Agent', self.version)] 136 self.__tempfiles = [] 137 self.__unlink = os.unlink # See cleanup() 138 self.tempcache = None 139 # Undocumented feature: if you assign {} to tempcache, 140 # it is used to cache files retrieved with 141 # self.retrieve(). This is not enabled by default 142 # since it does not work for changing documents (and I 143 # haven't got the logic to check expiration headers 144 # yet). 145 self.ftpcache = ftpcache 146 # Undocumented feature: you can use a different 147 # ftp cache by assigning to the .ftpcache member; 148 # in case you want logically independent URL openers 149 # XXX This is not threadsafe. Bah. 150 151 def __del__(self): 152 self.close() 153 154 def close(self): 155 self.cleanup() 156 157 def cleanup(self): 158 # This code sometimes runs when the rest of this module 159 # has already been deleted, so it can't use any globals 160 # or import anything. 161 if self.__tempfiles: 162 for file in self.__tempfiles: 163 try: 164 self.__unlink(file) 165 except OSError: 166 pass 167 del self.__tempfiles[:] 168 if self.tempcache: 169 self.tempcache.clear() 170 171 def addheader(self, *args): 172 """Add a header to be used by the HTTP interface only 173 e.g. 
u.addheader('Accept', 'sound/basic')""" 174 self.addheaders.append(args) 175 176 # External interface 177 def open(self, fullurl, data=None): 178 """Use URLopener().open(file) instead of open(file, 'r').""" 179 fullurl = unwrap(toBytes(fullurl)) 180 # percent encode url. fixing lame server errors like space within url 181 # parts 182 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 183 if self.tempcache and fullurl in self.tempcache: 184 filename, headers = self.tempcache[fullurl] 185 fp = open(filename, 'rb') 186 return addinfourl(fp, headers, fullurl) 187 urltype, url = splittype(fullurl) 188 if not urltype: 189 urltype = 'file' 190 if urltype in self.proxies: 191 proxy = self.proxies[urltype] 192 urltype, proxyhost = splittype(proxy) 193 host, selector = splithost(proxyhost) 194 url = (host, fullurl) # Signal special case to open_*() 195 else: 196 proxy = None 197 name = 'open_' + urltype 198 self.type = urltype 199 name = name.replace('-', '_') 200 if not hasattr(self, name): 201 if proxy: 202 return self.open_unknown_proxy(proxy, fullurl, data) 203 else: 204 return self.open_unknown(fullurl, data) 205 try: 206 if data is None: 207 return getattr(self, name)(url) 208 else: 209 return getattr(self, name)(url, data) 210 except socket.error, msg: 211 raise IOError, ('socket error', msg), sys.exc_info()[2] 212 213 def open_unknown(self, fullurl, data=None): 214 """Overridable interface to open unknown URL type.""" 215 type, url = splittype(fullurl) 216 raise IOError, ('url error', 'unknown url type', type) 217 218 def open_unknown_proxy(self, proxy, fullurl, data=None): 219 """Overridable interface to open unknown URL type.""" 220 type, url = splittype(fullurl) 221 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy) 222 223 # External interface 224 def retrieve(self, url, filename=None, reporthook=None, data=None): 225 """retrieve(url) returns (filename, headers) for a local object 226 or (tempfilename, headers) for a remote object.""" 227 url 
= unwrap(toBytes(url)) 228 if self.tempcache and url in self.tempcache: 229 return self.tempcache[url] 230 type, url1 = splittype(url) 231 if filename is None and (not type or type == 'file'): 232 try: 233 fp = self.open_local_file(url1) 234 hdrs = fp.info() 235 del fp 236 return url2pathname(splithost(url1)[1]), hdrs 237 except IOError, msg: 238 pass 239 fp = self.open(url, data) 240 try: 241 headers = fp.info() 242 if filename: 243 tfp = open(filename, 'wb') 244 else: 245 import tempfile 246 garbage, path = splittype(url) 247 garbage, path = splithost(path or "") 248 path, garbage = splitquery(path or "") 249 path, garbage = splitattr(path or "") 250 suffix = os.path.splitext(path)[1] 251 (fd, filename) = tempfile.mkstemp(suffix) 252 self.__tempfiles.append(filename) 253 tfp = os.fdopen(fd, 'wb') 254 try: 255 result = filename, headers 256 if self.tempcache is not None: 257 self.tempcache[url] = result 258 bs = 1024*8 259 size = -1 260 read = 0 261 blocknum = 0 262 if reporthook: 263 if "content-length" in headers: 264 size = int(headers["Content-Length"]) 265 reporthook(blocknum, bs, size) 266 while 1: 267 block = fp.read(bs) 268 if block == "": 269 break 270 read += len(block) 271 tfp.write(block) 272 blocknum += 1 273 if reporthook: 274 reporthook(blocknum, bs, size) 275 finally: 276 tfp.close() 277 finally: 278 fp.close() 279 del fp 280 del tfp 281 282 # raise exception if actual size does not match content-length header 283 if size >= 0 and read < size: 284 raise ContentTooShortError("retrieval incomplete: got only %i out " 285 "of %i bytes" % (read, size), result) 286 287 return result 288 289 # Each method named open_<type> knows how to open that type of URL 290 291 def open_http(self, url, data=None): 292 """Use HTTP protocol.""" 293 import httplib 294 user_passwd = None 295 proxy_passwd= None 296 if isinstance(url, str): 297 host, selector = splithost(url) 298 if host: 299 user_passwd, host = splituser(host) 300 host = unquote(host) 301 realhost = host 
302 else: 303 host, selector = url 304 # check whether the proxy contains authorization information 305 proxy_passwd, host = splituser(host) 306 # now we proceed with the url we want to obtain 307 urltype, rest = splittype(selector) 308 url = rest 309 user_passwd = None 310 if urltype.lower() != 'http': 311 realhost = None 312 else: 313 realhost, rest = splithost(rest) 314 if realhost: 315 user_passwd, realhost = splituser(realhost) 316 if user_passwd: 317 selector = "%s://%s%s" % (urltype, realhost, rest) 318 if proxy_bypass(realhost): 319 host = realhost 320 321 #print "proxy via http:", host, selector 322 if not host: raise IOError, ('http error', 'no host given') 323 324 if proxy_passwd: 325 import base64 326 proxy_auth = base64.b64encode(proxy_passwd).strip() 327 else: 328 proxy_auth = None 329 330 if user_passwd: 331 import base64 332 auth = base64.b64encode(user_passwd).strip() 333 else: 334 auth = None 335 h = httplib.HTTP(host) 336 if data is not None: 337 h.putrequest('POST', selector) 338 h.putheader('Content-Type', 'application/x-www-form-urlencoded') 339 h.putheader('Content-Length', '%d' % len(data)) 340 else: 341 h.putrequest('GET', selector) 342 if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth) 343 if auth: h.putheader('Authorization', 'Basic %s' % auth) 344 if realhost: h.putheader('Host', realhost) 345 for args in self.addheaders: h.putheader(*args) 346 h.endheaders() 347 if data is not None: 348 h.send(data) 349 errcode, errmsg, headers = h.getreply() 350 fp = h.getfile() 351 if errcode == -1: 352 if fp: fp.close() 353 # something went wrong with the HTTP status line 354 raise IOError, ('http protocol error', 0, 355 'got a bad status line', None) 356 # According to RFC 2616, "2xx" code indicates that the client's 357 # request was successfully received, understood, and accepted. 
358 if (200 <= errcode < 300): 359 return addinfourl(fp, headers, "http:" + url, errcode) 360 else: 361 if data is None: 362 return self.http_error(url, fp, errcode, errmsg, headers) 363 else: 364 return self.http_error(url, fp, errcode, errmsg, headers, data) 365 366 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 367 """Handle http errors. 368 Derived class can override this, or provide specific handlers 369 named http_error_DDD where DDD is the 3-digit error code.""" 370 # First check if there's a specific handler for this error 371 name = 'http_error_%d' % errcode 372 if hasattr(self, name): 373 method = getattr(self, name) 374 if data is None: 375 result = method(url, fp, errcode, errmsg, headers) 376 else: 377 result = method(url, fp, errcode, errmsg, headers, data) 378 if result: return result 379 return self.http_error_default(url, fp, errcode, errmsg, headers) 380 381 def http_error_default(self, url, fp, errcode, errmsg, headers): 382 """Default error handler: close the connection and raise IOError.""" 383 void = fp.read() 384 fp.close() 385 raise IOError, ('http error', errcode, errmsg, headers) 386 387 if _have_ssl: 388 def open_https(self, url, data=None): 389 """Use HTTPS protocol.""" 390 391 import httplib 392 user_passwd = None 393 proxy_passwd = None 394 if isinstance(url, str): 395 host, selector = splithost(url) 396 if host: 397 user_passwd, host = splituser(host) 398 host = unquote(host) 399 realhost = host 400 else: 401 host, selector = url 402 # here, we determine, whether the proxy contains authorization information 403 proxy_passwd, host = splituser(host) 404 urltype, rest = splittype(selector) 405 url = rest 406 user_passwd = None 407 if urltype.lower() != 'https': 408 realhost = None 409 else: 410 realhost, rest = splithost(rest) 411 if realhost: 412 user_passwd, realhost = splituser(realhost) 413 if user_passwd: 414 selector = "%s://%s%s" % (urltype, realhost, rest) 415 #print "proxy via https:", host, selector 416 if 
not host: raise IOError, ('https error', 'no host given') 417 if proxy_passwd: 418 import base64 419 proxy_auth = base64.b64encode(proxy_passwd).strip() 420 else: 421 proxy_auth = None 422 if user_passwd: 423 import base64 424 auth = base64.b64encode(user_passwd).strip() 425 else: 426 auth = None 427 h = httplib.HTTPS(host, 0, 428 key_file=self.key_file, 429 cert_file=self.cert_file) 430 if data is not None: 431 h.putrequest('POST', selector) 432 h.putheader('Content-Type', 433 'application/x-www-form-urlencoded') 434 h.putheader('Content-Length', '%d' % len(data)) 435 else: 436 h.putrequest('GET', selector) 437 if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth) 438 if auth: h.putheader('Authorization', 'Basic %s' % auth) 439 if realhost: h.putheader('Host', realhost) 440 for args in self.addheaders: h.putheader(*args) 441 h.endheaders() 442 if data is not None: 443 h.send(data) 444 errcode, errmsg, headers = h.getreply() 445 fp = h.getfile() 446 if errcode == -1: 447 if fp: fp.close() 448 # something went wrong with the HTTP status line 449 raise IOError, ('http protocol error', 0, 450 'got a bad status line', None) 451 # According to RFC 2616, "2xx" code indicates that the client's 452 # request was successfully received, understood, and accepted. 
453 if (200 <= errcode < 300): 454 return addinfourl(fp, headers, "https:" + url, errcode) 455 else: 456 if data is None: 457 return self.http_error(url, fp, errcode, errmsg, headers) 458 else: 459 return self.http_error(url, fp, errcode, errmsg, headers, 460 data) 461 462 def open_file(self, url): 463 """Use local file or FTP depending on form of URL.""" 464 if not isinstance(url, str): 465 raise IOError, ('file error', 'proxy support for file protocol currently not implemented') 466 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': 467 return self.open_ftp(url) 468 else: 469 return self.open_local_file(url) 470 471 def open_local_file(self, url): 472 """Use local file.""" 473 import mimetypes, mimetools, email.utils 474 try: 475 from cStringIO import StringIO 476 except ImportError: 477 from StringIO import StringIO 478 host, file = splithost(url) 479 localname = url2pathname(file) 480 try: 481 stats = os.stat(localname) 482 except OSError, e: 483 raise IOError(e.errno, e.strerror, e.filename) 484 size = stats.st_size 485 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 486 mtype = mimetypes.guess_type(url)[0] 487 headers = mimetools.Message(StringIO( 488 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 489 (mtype or 'text/plain', size, modified))) 490 if not host: 491 urlfile = file 492 if file[:1] == '/': 493 urlfile = 'file://' + file 494 return addinfourl(open(localname, 'rb'), 495 headers, urlfile) 496 host, port = splitport(host) 497 if not port \ 498 and socket.gethostbyname(host) in (localhost(), thishost()): 499 urlfile = file 500 if file[:1] == '/': 501 urlfile = 'file://' + file 502 return addinfourl(open(localname, 'rb'), 503 headers, urlfile) 504 raise IOError, ('local file error', 'not on local host') 505 506 def open_ftp(self, url): 507 """Use FTP protocol.""" 508 if not isinstance(url, str): 509 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented') 510 import 
mimetypes, mimetools 511 try: 512 from cStringIO import StringIO 513 except ImportError: 514 from StringIO import StringIO 515 host, path = splithost(url) 516 if not host: raise IOError, ('ftp error', 'no host given') 517 host, port = splitport(host) 518 user, host = splituser(host) 519 if user: user, passwd = splitpasswd(user) 520 else: passwd = None 521 host = unquote(host) 522 user = unquote(user or '') 523 passwd = unquote(passwd or '') 524 host = socket.gethostbyname(host) 525 if not port: 526 import ftplib 527 port = ftplib.FTP_PORT 528 else: 529 port = int(port) 530 path, attrs = splitattr(path) 531 path = unquote(path) 532 dirs = path.split('/') 533 dirs, file = dirs[:-1], dirs[-1] 534 if dirs and not dirs[0]: dirs = dirs[1:] 535 if dirs and not dirs[0]: dirs[0] = '/' 536 key = user, host, port, '/'.join(dirs) 537 # XXX thread unsafe! 538 if len(self.ftpcache) > MAXFTPCACHE: 539 # Prune the cache, rather arbitrarily 540 for k in self.ftpcache.keys(): 541 if k != key: 542 v = self.ftpcache[k] 543 del self.ftpcache[k] 544 v.close() 545 try: 546 if not key in self.ftpcache: 547 self.ftpcache[key] = \ 548 ftpwrapper(user, passwd, host, port, dirs) 549 if not file: type = 'D' 550 else: type = 'I' 551 for attr in attrs: 552 attr, value = splitvalue(attr) 553 if attr.lower() == 'type' and \ 554 value in ('a', 'A', 'i', 'I', 'd', 'D'): 555 type = value.upper() 556 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 557 mtype = mimetypes.guess_type("ftp:" + url)[0] 558 headers = "" 559 if mtype: 560 headers += "Content-Type: %s\n" % mtype 561 if retrlen is not None and retrlen >= 0: 562 headers += "Content-Length: %d\n" % retrlen 563 headers = mimetools.Message(StringIO(headers)) 564 return addinfourl(fp, headers, "ftp:" + url) 565 except ftperrors(), msg: 566 raise IOError, ('ftp error', msg), sys.exc_info()[2] 567 568 def open_data(self, url, data=None): 569 """Use "data" URL.""" 570 if not isinstance(url, str): 571 raise IOError, ('data error', 'proxy support 
for data protocol currently not implemented') 572 # ignore POSTed data 573 # 574 # syntax of data URLs: 575 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 576 # mediatype := [ type "/" subtype ] *( ";" parameter ) 577 # data := *urlchar 578 # parameter := attribute "=" value 579 import mimetools 580 try: 581 from cStringIO import StringIO 582 except ImportError: 583 from StringIO import StringIO 584 try: 585 [type, data] = url.split(',', 1) 586 except ValueError: 587 raise IOError, ('data error', 'bad data URL') 588 if not type: 589 type = 'text/plain;charset=US-ASCII' 590 semi = type.rfind(';') 591 if semi >= 0 and '=' not in type[semi:]: 592 encoding = type[semi+1:] 593 type = type[:semi] 594 else: 595 encoding = '' 596 msg = [] 597 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', 598 time.gmtime(time.time()))) 599 msg.append('Content-type: %s' % type) 600 if encoding == 'base64': 601 import base64 602 data = base64.decodestring(data) 603 else: 604 data = unquote(data) 605 msg.append('Content-Length: %d' % len(data)) 606 msg.append('') 607 msg.append(data) 608 msg = '\n'.join(msg) 609 f = StringIO(msg) 610 headers = mimetools.Message(f, 0) 611 #f.fileno = None # needed for addinfourl 612 return addinfourl(f, headers, url) 613 614 615class FancyURLopener(URLopener): 616 """Derived class with handlers for errors we can handle (perhaps).""" 617 618 def __init__(self, *args, **kwargs): 619 URLopener.__init__(self, *args, **kwargs) 620 self.auth_cache = {} 621 self.tries = 0 622 self.maxtries = 10 623 624 def http_error_default(self, url, fp, errcode, errmsg, headers): 625 """Default error handling -- don't raise an exception.""" 626 return addinfourl(fp, headers, "http:" + url, errcode) 627 628 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 629 """Error 302 -- relocated (temporarily).""" 630 self.tries += 1 631 if self.maxtries and self.tries >= self.maxtries: 632 if hasattr(self, "http_error_500"): 633 meth = 
                self.http_error_500
            else:
                meth = self.http_error_default
            # Too many redirects: report it as a server error and reset
            # the counter for the next top-level request.
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a redirect: prefer the Location header, fall back to URI;
        # give up (return None) when neither is present.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        # NOTE: the redirected request is reissued without the POST data.
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # URLopener.http_error_default raises IOError, so each failed
        # precondition below terminates the handler.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        # Dispatch on the current request's scheme, e.g.
        # retry_http_basic_auth or retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Ask the user for proxy credentials and retry via the http proxy.
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already embedded in the proxy host; a
        # nonzero i (stale credentials were present) makes
        # get_user_passwd drop its cached entry and re-prompt.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # https twin of retry_proxy_http_basic_auth.
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Ask the user for credentials and retry the request itself with
        # them embedded in the URL's host part.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # https twin of retry_http_basic_auth.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        # Credentials are cached per (realm, host); a true clear_cache
        # discards the cached pair and re-prompts.
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if
clear_cache: 780 del self.auth_cache[key] 781 else: 782 return self.auth_cache[key] 783 user, passwd = self.prompt_user_passwd(host, realm) 784 if user or passwd: self.auth_cache[key] = (user, passwd) 785 return user, passwd 786 787 def prompt_user_passwd(self, host, realm): 788 """Override this in a GUI environment!""" 789 import getpass 790 try: 791 user = raw_input("Enter username for %s at %s: " % (realm, 792 host)) 793 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 794 (user, realm, host)) 795 return user, passwd 796 except KeyboardInterrupt: 797 print 798 return None, None 799 800 801# Utility functions 802 803_localhost = None 804def localhost(): 805 """Return the IP address of the magic hostname 'localhost'.""" 806 global _localhost 807 if _localhost is None: 808 _localhost = socket.gethostbyname('localhost') 809 return _localhost 810 811_thishost = None 812def thishost(): 813 """Return the IP address of the current host.""" 814 global _thishost 815 if _thishost is None: 816 _thishost = socket.gethostbyname(socket.gethostname()) 817 return _thishost 818 819_ftperrors = None 820def ftperrors(): 821 """Return the set of errors raised by the FTP class.""" 822 global _ftperrors 823 if _ftperrors is None: 824 import ftplib 825 _ftperrors = ftplib.all_errors 826 return _ftperrors 827 828_noheaders = None 829def noheaders(): 830 """Return an empty mimetools.Message object.""" 831 global _noheaders 832 if _noheaders is None: 833 import mimetools 834 try: 835 from cStringIO import StringIO 836 except ImportError: 837 from StringIO import StringIO 838 _noheaders = mimetools.Message(StringIO(), 0) 839 _noheaders.fp.close() # Recycle file descriptor 840 return _noheaders 841 842 843# Utility classes 844 845class ftpwrapper: 846 """Class used by open_ftp() for cache of open FTP connections.""" 847 848 def __init__(self, user, passwd, host, port, dirs, 849 timeout=socket._GLOBAL_DEFAULT_TIMEOUT): 850 self.user = user 851 self.passwd = passwd 852 
self.host = host 853 self.port = port 854 self.dirs = dirs 855 self.timeout = timeout 856 self.init() 857 858 def init(self): 859 import ftplib 860 self.busy = 0 861 self.ftp = ftplib.FTP() 862 self.ftp.connect(self.host, self.port, self.timeout) 863 self.ftp.login(self.user, self.passwd) 864 for dir in self.dirs: 865 self.ftp.cwd(dir) 866 867 def retrfile(self, file, type): 868 import ftplib 869 self.endtransfer() 870 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 871 else: cmd = 'TYPE ' + type; isdir = 0 872 try: 873 self.ftp.voidcmd(cmd) 874 except ftplib.all_errors: 875 self.init() 876 self.ftp.voidcmd(cmd) 877 conn = None 878 if file and not isdir: 879 # Try to retrieve as a file 880 try: 881 cmd = 'RETR ' + file 882 conn = self.ftp.ntransfercmd(cmd) 883 except ftplib.error_perm, reason: 884 if str(reason)[:3] != '550': 885 raise IOError, ('ftp error', reason), sys.exc_info()[2] 886 if not conn: 887 # Set transfer mode to ASCII! 888 self.ftp.voidcmd('TYPE A') 889 # Try a directory listing. Verify that directory exists. 
            if file:
                # LIST of a specific name: cd into it first to verify the
                # directory actually exists, restoring the old cwd after.
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm, reason:
                        raise IOError, ('ftp error', reason), sys.exc_info()[2]
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        # Drain the pending end-of-transfer response, if any; best-effort,
        # FTP errors are ignored.
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        # Finish any pending transfer, then close the control connection.
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        # Delegate the file-object protocol to the wrapped fp, exposing
        # only the methods fp actually provides.
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            self.fileno = lambda: None
        if hasattr(self.fp, "__iter__"):
            self.__iter__ = self.fp.__iter__
            if hasattr(self.fp, "next"):
                self.next = self.fp.next

    def __repr__(self):
        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
                                             id(self), self.fp)

    def close(self):
        # Drop the delegated methods so stale references fail fast, then
        # close the underlying file object.
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook  # callable invoked once after close()
        self.hookargs = hookargs    # positional arguments for the hook

    def close(self):
        addbase.close(self)
        if self.closehook:
            self.closehook(*self.hookargs)
            # Run the hook at most once.
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open
    file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        # Headers object handed back verbatim by info().
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url, code=None):
        addbase.__init__(self, fp)
        self.headers = headers  # response headers, returned by info()
        self.url = url          # the URL actually fetched, from geturl()
        self.code = code        # status code or None, from getcode()

    def info(self):
        return self.headers

    def getcode(self):
        return self.code

    def geturl(self):
        return self.url


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def')

try:
    unicode
except NameError:
    # No `unicode` builtin in this interpreter: nothing is a unicode string.
    def _is_unicode(x):
        return 0
else:
    def _is_unicode(x):
        return isinstance(x, unicode)

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII.
If that changes, the conversion 1023 # can be relaxed 1024 if _is_unicode(url): 1025 try: 1026 url = url.encode("ASCII") 1027 except UnicodeError: 1028 raise UnicodeError("URL " + repr(url) + 1029 " contains non-ASCII characters") 1030 return url 1031 1032def unwrap(url): 1033 """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" 1034 url = url.strip() 1035 if url[:1] == '<' and url[-1:] == '>': 1036 url = url[1:-1].strip() 1037 if url[:4] == 'URL:': url = url[4:].strip() 1038 return url 1039 1040_typeprog = None 1041def splittype(url): 1042 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 1043 global _typeprog 1044 if _typeprog is None: 1045 import re 1046 _typeprog = re.compile('^([^/:]+):') 1047 1048 match = _typeprog.match(url) 1049 if match: 1050 scheme = match.group(1) 1051 return scheme.lower(), url[len(scheme) + 1:] 1052 return None, url 1053 1054_hostprog = None 1055def splithost(url): 1056 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 1057 global _hostprog 1058 if _hostprog is None: 1059 import re 1060 _hostprog = re.compile('^//([^/?]*)(.*)$') 1061 1062 match = _hostprog.match(url) 1063 if match: return match.group(1, 2) 1064 return None, url 1065 1066_userprog = None 1067def splituser(host): 1068 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 1069 global _userprog 1070 if _userprog is None: 1071 import re 1072 _userprog = re.compile('^(.*)@(.*)$') 1073 1074 match = _userprog.match(host) 1075 if match: return map(unquote, match.group(1, 2)) 1076 return None, host 1077 1078_passwdprog = None 1079def splitpasswd(user): 1080 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 1081 global _passwdprog 1082 if _passwdprog is None: 1083 import re 1084 _passwdprog = re.compile('^([^:]*):(.*)$') 1085 1086 match = _passwdprog.match(user) 1087 if match: return match.group(1, 2) 1088 return user, None 1089 1090# splittag('/path#tag') --> '/path', 'tag' 1091_portprog = None 1092def 
splitport(host): 1093 """splitport('host:port') --> 'host', 'port'.""" 1094 global _portprog 1095 if _portprog is None: 1096 import re 1097 _portprog = re.compile('^(.*):([0-9]+)$') 1098 1099 match = _portprog.match(host) 1100 if match: return match.group(1, 2) 1101 return host, None 1102 1103_nportprog = None 1104def splitnport(host, defport=-1): 1105 """Split host and port, returning numeric port. 1106 Return given default port if no ':' found; defaults to -1. 1107 Return numerical port if a valid number are found after ':'. 1108 Return None if ':' but not a valid number.""" 1109 global _nportprog 1110 if _nportprog is None: 1111 import re 1112 _nportprog = re.compile('^(.*):(.*)$') 1113 1114 match = _nportprog.match(host) 1115 if match: 1116 host, port = match.group(1, 2) 1117 try: 1118 if not port: raise ValueError, "no digits" 1119 nport = int(port) 1120 except ValueError: 1121 nport = None 1122 return host, nport 1123 return host, defport 1124 1125_queryprog = None 1126def splitquery(url): 1127 """splitquery('/path?query') --> '/path', 'query'.""" 1128 global _queryprog 1129 if _queryprog is None: 1130 import re 1131 _queryprog = re.compile('^(.*)\?([^?]*)$') 1132 1133 match = _queryprog.match(url) 1134 if match: return match.group(1, 2) 1135 return url, None 1136 1137_tagprog = None 1138def splittag(url): 1139 """splittag('/path#tag') --> '/path', 'tag'.""" 1140 global _tagprog 1141 if _tagprog is None: 1142 import re 1143 _tagprog = re.compile('^(.*)#([^#]*)$') 1144 1145 match = _tagprog.match(url) 1146 if match: return match.group(1, 2) 1147 return url, None 1148 1149def splitattr(url): 1150 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1151 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1152 words = url.split(';') 1153 return words[0], words[1:] 1154 1155_valueprog = None 1156def splitvalue(attr): 1157 """splitvalue('attr=value') --> 'attr', 'value'.""" 1158 global _valueprog 1159 if _valueprog is None: 1160 import re 1161 _valueprog = 
re.compile('^([^=]*)=(.*)$') 1162 1163 match = _valueprog.match(attr) 1164 if match: return match.group(1, 2) 1165 return attr, None 1166 1167_hexdig = '0123456789ABCDEFabcdef' 1168_hextochr = dict((a + b, chr(int(a + b, 16))) 1169 for a in _hexdig for b in _hexdig) 1170 1171def unquote(s): 1172 """unquote('abc%20def') -> 'abc def'.""" 1173 res = s.split('%') 1174 # fastpath 1175 if len(res) == 1: 1176 return s 1177 s = res[0] 1178 for item in res[1:]: 1179 try: 1180 s += _hextochr[item[:2]] + item[2:] 1181 except KeyError: 1182 s += '%' + item 1183 except UnicodeDecodeError: 1184 s += unichr(int(item[:2], 16)) + item[2:] 1185 return s 1186 1187def unquote_plus(s): 1188 """unquote('%7e/abc+def') -> '~/abc def'""" 1189 s = s.replace('+', ' ') 1190 return unquote(s) 1191 1192always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 1193 'abcdefghijklmnopqrstuvwxyz' 1194 '0123456789' '_.-') 1195_safe_map = {} 1196for i, c in zip(xrange(256), str(bytearray(xrange(256)))): 1197 _safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i) 1198_safe_quoters = {} 1199 1200def quote(s, safe='/'): 1201 """quote('abc def') -> 'abc%20def' 1202 1203 Each part of a URL, e.g. the path info, the query, etc., has a 1204 different set of reserved characters that must be quoted. 1205 1206 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists 1207 the following reserved characters. 1208 1209 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 1210 "$" | "," 1211 1212 Each of these characters is reserved in some component of a URL, 1213 but not necessarily in all of them. 1214 1215 By default, the quote function is intended for quoting the path 1216 section of a URL. Thus, it will not encode '/'. This character 1217 is reserved, but in typical usage the quote function is being 1218 called on a path where the existing slash characters are used as 1219 reserved characters. 
1220 """ 1221 # fastpath 1222 if not s: 1223 return s 1224 cachekey = (safe, always_safe) 1225 try: 1226 (quoter, safe) = _safe_quoters[cachekey] 1227 except KeyError: 1228 safe_map = _safe_map.copy() 1229 safe_map.update([(c, c) for c in safe]) 1230 quoter = safe_map.__getitem__ 1231 safe = always_safe + safe 1232 _safe_quoters[cachekey] = (quoter, safe) 1233 if not s.rstrip(safe): 1234 return s 1235 return ''.join(map(quoter, s)) 1236 1237def quote_plus(s, safe=''): 1238 """Quote the query fragment of a URL; replacing ' ' with '+'""" 1239 if ' ' in s: 1240 s = quote(s, safe + ' ') 1241 return s.replace(' ', '+') 1242 return quote(s, safe) 1243 1244def urlencode(query, doseq=0): 1245 """Encode a sequence of two-element tuples or dictionary into a URL query string. 1246 1247 If any values in the query arg are sequences and doseq is true, each 1248 sequence element is converted to a separate parameter. 1249 1250 If the query arg is a sequence of two-element tuples, the order of the 1251 parameters in the output will match the order of parameters in the 1252 input. 1253 """ 1254 1255 if hasattr(query,"items"): 1256 # mapping objects 1257 query = query.items() 1258 else: 1259 # it's a bother at times that strings and string-like objects are 1260 # sequences... 
1261 try: 1262 # non-sequence items should not work with len() 1263 # non-empty strings will fail this 1264 if len(query) and not isinstance(query[0], tuple): 1265 raise TypeError 1266 # zero-length sequences of all types will get here and succeed, 1267 # but that's a minor nit - since the original implementation 1268 # allowed empty dicts that type of behavior probably should be 1269 # preserved for consistency 1270 except TypeError: 1271 ty,va,tb = sys.exc_info() 1272 raise TypeError, "not a valid non-string sequence or mapping object", tb 1273 1274 l = [] 1275 if not doseq: 1276 # preserve old behavior 1277 for k, v in query: 1278 k = quote_plus(str(k)) 1279 v = quote_plus(str(v)) 1280 l.append(k + '=' + v) 1281 else: 1282 for k, v in query: 1283 k = quote_plus(str(k)) 1284 if isinstance(v, str): 1285 v = quote_plus(v) 1286 l.append(k + '=' + v) 1287 elif _is_unicode(v): 1288 # is there a reasonable way to convert to ASCII? 1289 # encode generates a string, but "replace" or "ignore" 1290 # lose information and "strict" can raise UnicodeError 1291 v = quote_plus(v.encode("ASCII","replace")) 1292 l.append(k + '=' + v) 1293 else: 1294 try: 1295 # is this a sufficient test for sequence-ness? 1296 x = len(v) 1297 except TypeError: 1298 # not a sequence 1299 v = quote_plus(str(v)) 1300 l.append(k + '=' + v) 1301 else: 1302 # loop over the sequence 1303 for elt in v: 1304 l.append(k + '=' + quote_plus(str(elt))) 1305 return '&'.join(l) 1306 1307# Proxy handling 1308def getproxies_environment(): 1309 """Return a dictionary of scheme -> proxy server URL mappings. 1310 1311 Scan the environment for variables named <scheme>_proxy; 1312 this seems to be the standard convention. If you need a 1313 different way, you can pass a proxies dictionary to the 1314 [Fancy]URLopener constructor. 
1315 1316 """ 1317 proxies = {} 1318 for name, value in os.environ.items(): 1319 name = name.lower() 1320 if value and name[-6:] == '_proxy': 1321 proxies[name[:-6]] = value 1322 return proxies 1323 1324def proxy_bypass_environment(host): 1325 """Test if proxies should not be used for a particular host. 1326 1327 Checks the environment for a variable named no_proxy, which should 1328 be a list of DNS suffixes separated by commas, or '*' for all hosts. 1329 """ 1330 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '') 1331 # '*' is special case for always bypass 1332 if no_proxy == '*': 1333 return 1 1334 # strip port off host 1335 hostonly, port = splitport(host) 1336 # check if the host ends with any of the DNS suffixes 1337 for name in no_proxy.split(','): 1338 if name and (hostonly.endswith(name) or host.endswith(name)): 1339 return 1 1340 # otherwise, don't bypass 1341 return 0 1342 1343 1344if sys.platform == 'darwin': 1345 from _scproxy import _get_proxy_settings, _get_proxies 1346 1347 def proxy_bypass_macosx_sysconf(host): 1348 """ 1349 Return True iff this host shouldn't be accessed using a proxy 1350 1351 This function uses the MacOSX framework SystemConfiguration 1352 to fetch the proxy information. 1353 """ 1354 import re 1355 import socket 1356 from fnmatch import fnmatch 1357 1358 hostonly, port = splitport(host) 1359 1360 def ip2num(ipAddr): 1361 parts = ipAddr.split('.') 1362 parts = map(int, parts) 1363 if len(parts) != 4: 1364 parts = (parts + [0, 0, 0, 0])[:4] 1365 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] 1366 1367 proxy_settings = _get_proxy_settings() 1368 1369 # Check for simple host names: 1370 if '.' 
not in host: 1371 if proxy_settings['exclude_simple']: 1372 return True 1373 1374 hostIP = None 1375 1376 for value in proxy_settings.get('exceptions', ()): 1377 # Items in the list are strings like these: *.local, 169.254/16 1378 if not value: continue 1379 1380 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) 1381 if m is not None: 1382 if hostIP is None: 1383 try: 1384 hostIP = socket.gethostbyname(hostonly) 1385 hostIP = ip2num(hostIP) 1386 except socket.error: 1387 continue 1388 1389 base = ip2num(m.group(1)) 1390 mask = m.group(2) 1391 if mask is None: 1392 mask = 8 * (m.group(1).count('.') + 1) 1393 1394 else: 1395 mask = int(mask[1:]) 1396 mask = 32 - mask 1397 1398 if (hostIP >> mask) == (base >> mask): 1399 return True 1400 1401 elif fnmatch(host, value): 1402 return True 1403 1404 return False 1405 1406 def getproxies_macosx_sysconf(): 1407 """Return a dictionary of scheme -> proxy server URL mappings. 1408 1409 This function uses the MacOSX framework SystemConfiguration 1410 to fetch the proxy information. 1411 """ 1412 return _get_proxies() 1413 1414 def proxy_bypass(host): 1415 if getproxies_environment(): 1416 return proxy_bypass_environment(host) 1417 else: 1418 return proxy_bypass_macosx_sysconf(host) 1419 1420 def getproxies(): 1421 return getproxies_environment() or getproxies_macosx_sysconf() 1422 1423elif os.name == 'nt': 1424 def getproxies_registry(): 1425 """Return a dictionary of scheme -> proxy server URL mappings. 1426 1427 Win32 uses the registry to store proxies. 1428 1429 """ 1430 proxies = {} 1431 try: 1432 import _winreg 1433 except ImportError: 1434 # Std module, so should be around - but you never know! 
1435 return proxies 1436 try: 1437 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1438 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1439 proxyEnable = _winreg.QueryValueEx(internetSettings, 1440 'ProxyEnable')[0] 1441 if proxyEnable: 1442 # Returned as Unicode but problems if not converted to ASCII 1443 proxyServer = str(_winreg.QueryValueEx(internetSettings, 1444 'ProxyServer')[0]) 1445 if '=' in proxyServer: 1446 # Per-protocol settings 1447 for p in proxyServer.split(';'): 1448 protocol, address = p.split('=', 1) 1449 # See if address has a type:// prefix 1450 import re 1451 if not re.match('^([^/:]+)://', address): 1452 address = '%s://%s' % (protocol, address) 1453 proxies[protocol] = address 1454 else: 1455 # Use one setting for all protocols 1456 if proxyServer[:5] == 'http:': 1457 proxies['http'] = proxyServer 1458 else: 1459 proxies['http'] = 'http://%s' % proxyServer 1460 proxies['https'] = 'https://%s' % proxyServer 1461 proxies['ftp'] = 'ftp://%s' % proxyServer 1462 internetSettings.Close() 1463 except (WindowsError, ValueError, TypeError): 1464 # Either registry key not found etc, or the value in an 1465 # unexpected format. 1466 # proxies already set up to be empty so nothing to do 1467 pass 1468 return proxies 1469 1470 def getproxies(): 1471 """Return a dictionary of scheme -> proxy server URL mappings. 1472 1473 Returns settings gathered from the environment, if specified, 1474 or the registry. 1475 1476 """ 1477 return getproxies_environment() or getproxies_registry() 1478 1479 def proxy_bypass_registry(host): 1480 try: 1481 import _winreg 1482 import re 1483 except ImportError: 1484 # Std modules, so should be around - but you never know! 
1485 return 0 1486 try: 1487 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1488 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1489 proxyEnable = _winreg.QueryValueEx(internetSettings, 1490 'ProxyEnable')[0] 1491 proxyOverride = str(_winreg.QueryValueEx(internetSettings, 1492 'ProxyOverride')[0]) 1493 # ^^^^ Returned as Unicode but problems if not converted to ASCII 1494 except WindowsError: 1495 return 0 1496 if not proxyEnable or not proxyOverride: 1497 return 0 1498 # try to make a host list from name and IP address. 1499 rawHost, port = splitport(host) 1500 host = [rawHost] 1501 try: 1502 addr = socket.gethostbyname(rawHost) 1503 if addr != rawHost: 1504 host.append(addr) 1505 except socket.error: 1506 pass 1507 try: 1508 fqdn = socket.getfqdn(rawHost) 1509 if fqdn != rawHost: 1510 host.append(fqdn) 1511 except socket.error: 1512 pass 1513 # make a check value list from the registry entry: replace the 1514 # '<local>' string by the localhost entry and the corresponding 1515 # canonical entry. 1516 proxyOverride = proxyOverride.split(';') 1517 i = 0 1518 while i < len(proxyOverride): 1519 if proxyOverride[i] == '<local>': 1520 proxyOverride[i:i+1] = ['localhost', 1521 '127.0.0.1', 1522 socket.gethostname(), 1523 socket.gethostbyname( 1524 socket.gethostname())] 1525 i += 1 1526 # print proxyOverride 1527 # now check if we match one of the registry values. 1528 for test in proxyOverride: 1529 test = test.replace(".", r"\.") # mask dots 1530 test = test.replace("*", r".*") # change glob sequence 1531 test = test.replace("?", r".") # change glob char 1532 for val in host: 1533 # print "%s <--> %s" %( test, val ) 1534 if re.match(test, val, re.I): 1535 return 1 1536 return 0 1537 1538 def proxy_bypass(host): 1539 """Return a dictionary of scheme -> proxy server URL mappings. 1540 1541 Returns settings gathered from the environment, if specified, 1542 or the registry. 
1543 1544 """ 1545 if getproxies_environment(): 1546 return proxy_bypass_environment(host) 1547 else: 1548 return proxy_bypass_registry(host) 1549 1550else: 1551 # By default use environment variables 1552 getproxies = getproxies_environment 1553 proxy_bypass = proxy_bypass_environment 1554 1555# Test and time quote() and unquote() 1556def test1(): 1557 s = '' 1558 for i in range(256): s = s + chr(i) 1559 s = s*4 1560 t0 = time.time() 1561 qs = quote(s) 1562 uqs = unquote(qs) 1563 t1 = time.time() 1564 if uqs != s: 1565 print 'Wrong!' 1566 print repr(s) 1567 print repr(qs) 1568 print repr(uqs) 1569 print round(t1 - t0, 3), 'sec' 1570 1571 1572def reporthook(blocknum, blocksize, totalsize): 1573 # Report during remote transfers 1574 print "Block number: %d, Block size: %d, Total size: %d" % ( 1575 blocknum, blocksize, totalsize) 1576 1577# Test program 1578def test(args=[]): 1579 if not args: 1580 args = [ 1581 '/etc/passwd', 1582 'file:/etc/passwd', 1583 'file://localhost/etc/passwd', 1584 'ftp://ftp.gnu.org/pub/README', 1585 'http://www.python.org/index.html', 1586 ] 1587 if hasattr(URLopener, "open_https"): 1588 args.append('https://synergy.as.cmu.edu/~geek/') 1589 try: 1590 for url in args: 1591 print '-'*10, url, '-'*10 1592 fn, h = urlretrieve(url, None, reporthook) 1593 print fn 1594 if h: 1595 print '======' 1596 for k in h.keys(): print k + ':', h[k] 1597 print '======' 1598 fp = open(fn, 'rb') 1599 data = fp.read() 1600 del fp 1601 if '\r' in data: 1602 table = string.maketrans("", "") 1603 data = data.translate(table, "\r") 1604 print data 1605 fn, h = None, None 1606 print '-'*40 1607 finally: 1608 urlcleanup() 1609 1610def main(): 1611 import getopt, sys 1612 try: 1613 opts, args = getopt.getopt(sys.argv[1:], "th") 1614 except getopt.error, msg: 1615 print msg 1616 print "Use -h for help" 1617 return 1618 t = 0 1619 for o, a in opts: 1620 if o == '-t': 1621 t = t + 1 1622 if o == '-h': 1623 print "Usage: python urllib.py [-t] [url ...]" 1624 print "-t 
runs self-test;", 1625 print "otherwise, contents of urls are printed" 1626 return 1627 if t: 1628 if t > 1: 1629 test1() 1630 test(args) 1631 else: 1632 if not args: 1633 print "Use -h for help" 1634 for url in args: 1635 print urlopen(url).read(), 1636 1637# Run test program when run as a script 1638if __name__ == '__main__': 1639 main() 1640