urllib.py revision 7a4e8379437017ada5d0e8e74c752cc6c4b5030f
1"""Open an arbitrary URL. 2 3See the following document for more info on URLs: 4"Names and Addresses, URIs, URLs, URNs, URCs", at 5http://www.w3.org/pub/WWW/Addressing/Overview.html 6 7See also the HTTP spec (from which the error codes are derived): 8"HTTP - Hypertext Transfer Protocol", at 9http://www.w3.org/pub/WWW/Protocols/ 10 11Related standards and specs: 12- RFC1808: the "relative URL" spec. (authoritative status) 13- RFC1738 - the "URL standard". (authoritative status) 14- RFC1630 - the "URI spec". (informational status) 15 16The object returned by URLopener().open(file) will differ per 17protocol. All you know is that is has methods read(), readline(), 18readlines(), fileno(), close() and info(). The read*(), fileno() 19and close() methods work like those of open files. 20The info() method returns a mimetools.Message object which can be 21used to query various info about the object, if available. 22(mimetools.Message objects are queried with the getheader() method.) 23""" 24 25import string 26import socket 27import os 28import time 29import sys 30from urlparse import urljoin as basejoin 31 32__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve", 33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus", 34 "urlencode", "url2pathname", "pathname2url", "splittag", 35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap", 36 "splittype", "splithost", "splituser", "splitpasswd", "splitport", 37 "splitnport", "splitquery", "splitattr", "splitvalue", 38 "getproxies"] 39 40__version__ = '1.17' # XXX This version is not always updated :-( 41 42MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 43 44# Helper for non-unix systems 45if os.name == 'mac': 46 from macurl2path import url2pathname, pathname2url 47elif os.name == 'nt': 48 from nturl2path import url2pathname, pathname2url 49elif os.name == 'riscos': 50 from rourl2path import url2pathname, pathname2url 51else: 52 def url2pathname(pathname): 53 """OS-specific conversion from a relative URL of the 'file' scheme 54 to a file system path; not recommended for general use.""" 55 return unquote(pathname) 56 57 def pathname2url(pathname): 58 """OS-specific conversion from a file system path to a relative URL 59 of the 'file' scheme; not recommended for general use.""" 60 return quote(pathname) 61 62# This really consists of two pieces: 63# (1) a class which handles opening of all sorts of URLs 64# (plus assorted utilities etc.) 65# (2) a set of functions for parsing URLs 66# XXX Should these be separated out into different modules? 67 68 69# Shortcut for basic usage 70_urlopener = None 71def urlopen(url, data=None, proxies=None): 72 """Create a file-like object for the specified URL to read from.""" 73 from warnings import warnpy3k 74 warnpy3k("urllib.urlopen() has been removed in Python 3.0 in " 75 "favor of urllib2.urlopen()", stacklevel=2) 76 77 global _urlopener 78 if proxies is not None: 79 opener = FancyURLopener(proxies=proxies) 80 elif not _urlopener: 81 opener = FancyURLopener() 82 _urlopener = opener 83 else: 84 opener = _urlopener 85 if data is None: 86 return opener.open(url) 87 else: 88 return opener.open(url, data) 89def urlretrieve(url, filename=None, reporthook=None, data=None): 90 global _urlopener 91 if not _urlopener: 92 _urlopener = FancyURLopener() 93 return _urlopener.retrieve(url, filename, reporthook, data) 94def urlcleanup(): 95 if _urlopener: 96 _urlopener.cleanup() 97 98# check for SSL 99try: 100 import ssl 101except: 102 _have_ssl = False 103else: 104 _have_ssl = True 105 106# exception raised when downloaded size does not match content-length 107class ContentTooShortError(IOError): 108 def __init__(self, message, content): 109 IOError.__init__(self, message) 110 self.content = content 111 112ftpcache = {} 113class URLopener: 114 """Class to open URLs. 115 This is a class rather than just a subroutine because we may need 116 more than one set of global protocol-specific options. 117 Note -- this is a base class for those who don't want the 118 automatic handling of errors type 302 (relocated) and 401 119 (authorization needed).""" 120 121 __tempfiles = None 122 123 version = "Python-urllib/%s" % __version__ 124 125 # Constructor 126 def __init__(self, proxies=None, **x509): 127 if proxies is None: 128 proxies = getproxies() 129 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 130 self.proxies = proxies 131 self.key_file = x509.get('key_file') 132 self.cert_file = x509.get('cert_file') 133 self.addheaders = [('User-Agent', self.version)] 134 self.__tempfiles = [] 135 self.__unlink = os.unlink # See cleanup() 136 self.tempcache = None 137 # Undocumented feature: if you assign {} to tempcache, 138 # it is used to cache files retrieved with 139 # self.retrieve(). This is not enabled by default 140 # since it does not work for changing documents (and I 141 # haven't got the logic to check expiration headers 142 # yet). 143 self.ftpcache = ftpcache 144 # Undocumented feature: you can use a different 145 # ftp cache by assigning to the .ftpcache member; 146 # in case you want logically independent URL openers 147 # XXX This is not threadsafe. Bah. 148 149 def __del__(self): 150 self.close() 151 152 def close(self): 153 self.cleanup() 154 155 def cleanup(self): 156 # This code sometimes runs when the rest of this module 157 # has already been deleted, so it can't use any globals 158 # or import anything. 159 if self.__tempfiles: 160 for file in self.__tempfiles: 161 try: 162 self.__unlink(file) 163 except OSError: 164 pass 165 del self.__tempfiles[:] 166 if self.tempcache: 167 self.tempcache.clear() 168 169 def addheader(self, *args): 170 """Add a header to be used by the HTTP interface only 171 e.g. u.addheader('Accept', 'sound/basic')""" 172 self.addheaders.append(args) 173 174 # External interface 175 def open(self, fullurl, data=None): 176 """Use URLopener().open(file) instead of open(file, 'r').""" 177 fullurl = unwrap(toBytes(fullurl)) 178 # percent encode url. fixing lame server errors like space within url 179 # parts 180 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 181 if self.tempcache and fullurl in self.tempcache: 182 filename, headers = self.tempcache[fullurl] 183 fp = open(filename, 'rb') 184 return addinfourl(fp, headers, fullurl) 185 urltype, url = splittype(fullurl) 186 if not urltype: 187 urltype = 'file' 188 if urltype in self.proxies: 189 proxy = self.proxies[urltype] 190 urltype, proxyhost = splittype(proxy) 191 host, selector = splithost(proxyhost) 192 url = (host, fullurl) # Signal special case to open_*() 193 else: 194 proxy = None 195 name = 'open_' + urltype 196 self.type = urltype 197 name = name.replace('-', '_') 198 if not hasattr(self, name): 199 if proxy: 200 return self.open_unknown_proxy(proxy, fullurl, data) 201 else: 202 return self.open_unknown(fullurl, data) 203 try: 204 if data is None: 205 return getattr(self, name)(url) 206 else: 207 return getattr(self, name)(url, data) 208 except socket.error, msg: 209 raise IOError, ('socket error', msg), sys.exc_info()[2] 210 211 def open_unknown(self, fullurl, data=None): 212 """Overridable interface to open unknown URL type.""" 213 type, url = splittype(fullurl) 214 raise IOError, ('url error', 'unknown url type', type) 215 216 def open_unknown_proxy(self, proxy, fullurl, data=None): 217 """Overridable interface to open unknown URL type.""" 218 type, url = splittype(fullurl) 219 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy) 220 221 # External interface 222 def retrieve(self, url, filename=None, reporthook=None, data=None): 223 """retrieve(url) returns (filename, headers) for a local object 224 or (tempfilename, headers) for a remote object.""" 225 url = unwrap(toBytes(url)) 226 if self.tempcache and url in self.tempcache: 227 return self.tempcache[url] 228 type, url1 = splittype(url) 229 if filename is None and (not type or type == 'file'): 230 try: 231 fp = self.open_local_file(url1) 232 hdrs = fp.info() 233 del fp 234 return url2pathname(splithost(url1)[1]), hdrs 235 except IOError, msg: 236 pass 237 fp = self.open(url, data) 238 try: 239 headers = fp.info() 240 if filename: 241 tfp = open(filename, 'wb') 242 else: 243 import tempfile 244 garbage, path = splittype(url) 245 garbage, path = splithost(path or "") 246 path, garbage = splitquery(path or "") 247 path, garbage = splitattr(path or "") 248 suffix = os.path.splitext(path)[1] 249 (fd, filename) = tempfile.mkstemp(suffix) 250 self.__tempfiles.append(filename) 251 tfp = os.fdopen(fd, 'wb') 252 try: 253 result = filename, headers 254 if self.tempcache is not None: 255 self.tempcache[url] = result 256 bs = 1024*8 257 size = -1 258 read = 0 259 blocknum = 0 260 if reporthook: 261 if "content-length" in headers: 262 size = int(headers["Content-Length"]) 263 reporthook(blocknum, bs, size) 264 while 1: 265 block = fp.read(bs) 266 if block == "": 267 break 268 read += len(block) 269 tfp.write(block) 270 blocknum += 1 271 if reporthook: 272 reporthook(blocknum, bs, size) 273 finally: 274 tfp.close() 275 finally: 276 fp.close() 277 del fp 278 del tfp 279 280 # raise exception if actual size does not match content-length header 281 if size >= 0 and read < size: 282 raise ContentTooShortError("retrieval incomplete: got only %i out " 283 "of %i bytes" % (read, size), result) 284 285 return result 286 287 # Each method named open_<type> knows how to open that type of URL 288 289 def open_http(self, url, data=None): 290 """Use HTTP protocol.""" 291 import httplib 292 user_passwd = None 293 proxy_passwd= None 294 if isinstance(url, str): 295 host, selector = splithost(url) 296 if host: 297 user_passwd, host = splituser(host) 298 host = unquote(host) 299 realhost = host 300 else: 301 host, selector = url 302 # check whether the proxy contains authorization information 303 proxy_passwd, host = splituser(host) 304 # now we proceed with the url we want to obtain 305 urltype, rest = splittype(selector) 306 url = rest 307 user_passwd = None 308 if urltype.lower() != 'http': 309 realhost = None 310 else: 311 realhost, rest = splithost(rest) 312 if realhost: 313 user_passwd, realhost = splituser(realhost) 314 if user_passwd: 315 selector = "%s://%s%s" % (urltype, realhost, rest) 316 if proxy_bypass(realhost): 317 host = realhost 318 319 #print "proxy via http:", host, selector 320 if not host: raise IOError, ('http error', 'no host given') 321 322 if proxy_passwd: 323 import base64 324 proxy_auth = base64.b64encode(proxy_passwd).strip() 325 else: 326 proxy_auth = None 327 328 if user_passwd: 329 import base64 330 auth = base64.b64encode(user_passwd).strip() 331 else: 332 auth = None 333 h = httplib.HTTP(host) 334 if data is not None: 335 h.putrequest('POST', selector) 336 h.putheader('Content-Type', 'application/x-www-form-urlencoded') 337 h.putheader('Content-Length', '%d' % len(data)) 338 else: 339 h.putrequest('GET', selector) 340 if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth) 341 if auth: h.putheader('Authorization', 'Basic %s' % auth) 342 if realhost: h.putheader('Host', realhost) 343 for args in self.addheaders: h.putheader(*args) 344 h.endheaders() 345 if data is not None: 346 h.send(data) 347 errcode, errmsg, headers = h.getreply() 348 fp = h.getfile() 349 if errcode == -1: 350 if fp: fp.close() 351 # something went wrong with the HTTP status line 352 raise IOError, ('http protocol error', 0, 353 'got a bad status line', None) 354 # According to RFC 2616, "2xx" code indicates that the client's 355 # request was successfully received, understood, and accepted. 356 if (200 <= errcode < 300): 357 return addinfourl(fp, headers, "http:" + url, errcode) 358 else: 359 if data is None: 360 return self.http_error(url, fp, errcode, errmsg, headers) 361 else: 362 return self.http_error(url, fp, errcode, errmsg, headers, data) 363 364 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 365 """Handle http errors. 366 Derived class can override this, or provide specific handlers 367 named http_error_DDD where DDD is the 3-digit error code.""" 368 # First check if there's a specific handler for this error 369 name = 'http_error_%d' % errcode 370 if hasattr(self, name): 371 method = getattr(self, name) 372 if data is None: 373 result = method(url, fp, errcode, errmsg, headers) 374 else: 375 result = method(url, fp, errcode, errmsg, headers, data) 376 if result: return result 377 return self.http_error_default(url, fp, errcode, errmsg, headers) 378 379 def http_error_default(self, url, fp, errcode, errmsg, headers): 380 """Default error handler: close the connection and raise IOError.""" 381 void = fp.read() 382 fp.close() 383 raise IOError, ('http error', errcode, errmsg, headers) 384 385 if _have_ssl: 386 def open_https(self, url, data=None): 387 """Use HTTPS protocol.""" 388 389 import httplib 390 user_passwd = None 391 proxy_passwd = None 392 if isinstance(url, str): 393 host, selector = splithost(url) 394 if host: 395 user_passwd, host = splituser(host) 396 host = unquote(host) 397 realhost = host 398 else: 399 host, selector = url 400 # here, we determine, whether the proxy contains authorization information 401 proxy_passwd, host = splituser(host) 402 urltype, rest = splittype(selector) 403 url = rest 404 user_passwd = None 405 if urltype.lower() != 'https': 406 realhost = None 407 else: 408 realhost, rest = splithost(rest) 409 if realhost: 410 user_passwd, realhost = splituser(realhost) 411 if user_passwd: 412 selector = "%s://%s%s" % (urltype, realhost, rest) 413 #print "proxy via https:", host, selector 414 if not host: raise IOError, ('https error', 'no host given') 415 if proxy_passwd: 416 import base64 417 proxy_auth = base64.b64encode(proxy_passwd).strip() 418 else: 419 proxy_auth = None 420 if user_passwd: 421 import base64 422 auth = base64.b64encode(user_passwd).strip() 423 else: 424 auth = None 425 h = httplib.HTTPS(host, 0, 426 key_file=self.key_file, 427 cert_file=self.cert_file) 428 if data is not None: 429 h.putrequest('POST', selector) 430 h.putheader('Content-Type', 431 'application/x-www-form-urlencoded') 432 h.putheader('Content-Length', '%d' % len(data)) 433 else: 434 h.putrequest('GET', selector) 435 if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth) 436 if auth: h.putheader('Authorization', 'Basic %s' % auth) 437 if realhost: h.putheader('Host', realhost) 438 for args in self.addheaders: h.putheader(*args) 439 h.endheaders() 440 if data is not None: 441 h.send(data) 442 errcode, errmsg, headers = h.getreply() 443 fp = h.getfile() 444 if errcode == -1: 445 if fp: fp.close() 446 # something went wrong with the HTTP status line 447 raise IOError, ('http protocol error', 0, 448 'got a bad status line', None) 449 # According to RFC 2616, "2xx" code indicates that the client's 450 # request was successfully received, understood, and accepted. 451 if (200 <= errcode < 300): 452 return addinfourl(fp, headers, "https:" + url, errcode) 453 else: 454 if data is None: 455 return self.http_error(url, fp, errcode, errmsg, headers) 456 else: 457 return self.http_error(url, fp, errcode, errmsg, headers, 458 data) 459 460 def open_file(self, url): 461 """Use local file or FTP depending on form of URL.""" 462 if not isinstance(url, str): 463 raise IOError, ('file error', 'proxy support for file protocol currently not implemented') 464 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': 465 return self.open_ftp(url) 466 else: 467 return self.open_local_file(url) 468 469 def open_local_file(self, url): 470 """Use local file.""" 471 import mimetypes, mimetools, email.utils 472 try: 473 from cStringIO import StringIO 474 except ImportError: 475 from StringIO import StringIO 476 host, file = splithost(url) 477 localname = url2pathname(file) 478 try: 479 stats = os.stat(localname) 480 except OSError, e: 481 raise IOError(e.errno, e.strerror, e.filename) 482 size = stats.st_size 483 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 484 mtype = mimetypes.guess_type(url)[0] 485 headers = mimetools.Message(StringIO( 486 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 487 (mtype or 'text/plain', size, modified))) 488 if not host: 489 urlfile = file 490 if file[:1] == '/': 491 urlfile = 'file://' + file 492 return addinfourl(open(localname, 'rb'), 493 headers, urlfile) 494 host, port = splitport(host) 495 if not port \ 496 and socket.gethostbyname(host) in (localhost(), thishost()): 497 urlfile = file 498 if file[:1] == '/': 499 urlfile = 'file://' + file 500 return addinfourl(open(localname, 'rb'), 501 headers, urlfile) 502 raise IOError, ('local file error', 'not on local host') 503 504 def open_ftp(self, url): 505 """Use FTP protocol.""" 506 if not isinstance(url, str): 507 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented') 508 import mimetypes, mimetools 509 try: 510 from cStringIO import StringIO 511 except ImportError: 512 from StringIO import StringIO 513 host, path = splithost(url) 514 if not host: raise IOError, ('ftp error', 'no host given') 515 host, port = splitport(host) 516 user, host = splituser(host) 517 if user: user, passwd = splitpasswd(user) 518 else: passwd = None 519 host = unquote(host) 520 user = unquote(user or '') 521 passwd = unquote(passwd or '') 522 host = socket.gethostbyname(host) 523 if not port: 524 import ftplib 525 port = ftplib.FTP_PORT 526 else: 527 port = int(port) 528 path, attrs = splitattr(path) 529 path = unquote(path) 530 dirs = path.split('/') 531 dirs, file = dirs[:-1], dirs[-1] 532 if dirs and not dirs[0]: dirs = dirs[1:] 533 if dirs and not dirs[0]: dirs[0] = '/' 534 key = user, host, port, '/'.join(dirs) 535 # XXX thread unsafe! 536 if len(self.ftpcache) > MAXFTPCACHE: 537 # Prune the cache, rather arbitrarily 538 for k in self.ftpcache.keys(): 539 if k != key: 540 v = self.ftpcache[k] 541 del self.ftpcache[k] 542 v.close() 543 try: 544 if not key in self.ftpcache: 545 self.ftpcache[key] = \ 546 ftpwrapper(user, passwd, host, port, dirs) 547 if not file: type = 'D' 548 else: type = 'I' 549 for attr in attrs: 550 attr, value = splitvalue(attr) 551 if attr.lower() == 'type' and \ 552 value in ('a', 'A', 'i', 'I', 'd', 'D'): 553 type = value.upper() 554 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 555 mtype = mimetypes.guess_type("ftp:" + url)[0] 556 headers = "" 557 if mtype: 558 headers += "Content-Type: %s\n" % mtype 559 if retrlen is not None and retrlen >= 0: 560 headers += "Content-Length: %d\n" % retrlen 561 headers = mimetools.Message(StringIO(headers)) 562 return addinfourl(fp, headers, "ftp:" + url) 563 except ftperrors(), msg: 564 raise IOError, ('ftp error', msg), sys.exc_info()[2] 565 566 def open_data(self, url, data=None): 567 """Use "data" URL.""" 568 if not isinstance(url, str): 569 raise IOError, ('data error', 'proxy support for data protocol currently not implemented') 570 # ignore POSTed data 571 # 572 # syntax of data URLs: 573 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 574 # mediatype := [ type "/" subtype ] *( ";" parameter ) 575 # data := *urlchar 576 # parameter := attribute "=" value 577 import mimetools 578 try: 579 from cStringIO import StringIO 580 except ImportError: 581 from StringIO import StringIO 582 try: 583 [type, data] = url.split(',', 1) 584 except ValueError: 585 raise IOError, ('data error', 'bad data URL') 586 if not type: 587 type = 'text/plain;charset=US-ASCII' 588 semi = type.rfind(';') 589 if semi >= 0 and '=' not in type[semi:]: 590 encoding = type[semi+1:] 591 type = type[:semi] 592 else: 593 encoding = '' 594 msg = [] 595 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', 596 time.gmtime(time.time()))) 597 msg.append('Content-type: %s' % type) 598 if encoding == 'base64': 599 import base64 600 data = base64.decodestring(data) 601 else: 602 data = unquote(data) 603 msg.append('Content-Length: %d' % len(data)) 604 msg.append('') 605 msg.append(data) 606 msg = '\n'.join(msg) 607 f = StringIO(msg) 608 headers = mimetools.Message(f, 0) 609 #f.fileno = None # needed for addinfourl 610 return addinfourl(f, headers, url) 611 612 613class FancyURLopener(URLopener): 614 """Derived class with handlers for errors we can handle (perhaps).""" 615 616 def __init__(self, *args, **kwargs): 617 URLopener.__init__(self, *args, **kwargs) 618 self.auth_cache = {} 619 self.tries = 0 620 self.maxtries = 10 621 622 def http_error_default(self, url, fp, errcode, errmsg, headers): 623 """Default error handling -- don't raise an exception.""" 624 return addinfourl(fp, headers, "http:" + url, errcode) 625 626 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 627 """Error 302 -- relocated (temporarily).""" 628 self.tries += 1 629 if self.maxtries and self.tries >= self.maxtries: 630 if hasattr(self, "http_error_500"): 631 meth = self.http_error_500 632 else: 633 meth = self.http_error_default 634 self.tries = 0 635 return meth(url, fp, 500, 636 "Internal Server Error: Redirect Recursion", headers) 637 result = self.redirect_internal(url, fp, errcode, errmsg, headers, 638 data) 639 self.tries = 0 640 return result 641 642 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 643 if 'location' in headers: 644 newurl = headers['location'] 645 elif 'uri' in headers: 646 newurl = headers['uri'] 647 else: 648 return 649 void = fp.read() 650 fp.close() 651 # In case the server sent a relative URL, join with original: 652 newurl = basejoin(self.type + ":" + url, newurl) 653 return self.open(newurl) 654 655 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 656 """Error 301 -- also relocated (permanently).""" 657 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 658 659 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): 660 """Error 303 -- also relocated (essentially identical to 302).""" 661 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 662 663 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): 664 """Error 307 -- relocated, but turn POST into error.""" 665 if data is None: 666 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 667 else: 668 return self.http_error_default(url, fp, errcode, errmsg, headers) 669 670 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): 671 """Error 401 -- authentication required. 672 This function supports Basic authentication only.""" 673 if not 'www-authenticate' in headers: 674 URLopener.http_error_default(self, url, fp, 675 errcode, errmsg, headers) 676 stuff = headers['www-authenticate'] 677 import re 678 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 679 if not match: 680 URLopener.http_error_default(self, url, fp, 681 errcode, errmsg, headers) 682 scheme, realm = match.groups() 683 if scheme.lower() != 'basic': 684 URLopener.http_error_default(self, url, fp, 685 errcode, errmsg, headers) 686 name = 'retry_' + self.type + '_basic_auth' 687 if data is None: 688 return getattr(self,name)(url, realm) 689 else: 690 return getattr(self,name)(url, realm, data) 691 692 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None): 693 """Error 407 -- proxy authentication required. 694 This function supports Basic authentication only.""" 695 if not 'proxy-authenticate' in headers: 696 URLopener.http_error_default(self, url, fp, 697 errcode, errmsg, headers) 698 stuff = headers['proxy-authenticate'] 699 import re 700 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 701 if not match: 702 URLopener.http_error_default(self, url, fp, 703 errcode, errmsg, headers) 704 scheme, realm = match.groups() 705 if scheme.lower() != 'basic': 706 URLopener.http_error_default(self, url, fp, 707 errcode, errmsg, headers) 708 name = 'retry_proxy_' + self.type + '_basic_auth' 709 if data is None: 710 return getattr(self,name)(url, realm) 711 else: 712 return getattr(self,name)(url, realm, data) 713 714 def retry_proxy_http_basic_auth(self, url, realm, data=None): 715 host, selector = splithost(url) 716 newurl = 'http://' + host + selector 717 proxy = self.proxies['http'] 718 urltype, proxyhost = splittype(proxy) 719 proxyhost, proxyselector = splithost(proxyhost) 720 i = proxyhost.find('@') + 1 721 proxyhost = proxyhost[i:] 722 user, passwd = self.get_user_passwd(proxyhost, realm, i) 723 if not (user or passwd): return None 724 proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost 725 self.proxies['http'] = 'http://' + proxyhost + proxyselector 726 if data is None: 727 return self.open(newurl) 728 else: 729 return self.open(newurl, data) 730 731 def retry_proxy_https_basic_auth(self, url, realm, data=None): 732 host, selector = splithost(url) 733 newurl = 'https://' + host + selector 734 proxy = self.proxies['https'] 735 urltype, proxyhost = splittype(proxy) 736 proxyhost, proxyselector = splithost(proxyhost) 737 i = proxyhost.find('@') + 1 738 proxyhost = proxyhost[i:] 739 user, passwd = self.get_user_passwd(proxyhost, realm, i) 740 if not (user or passwd): return None 741 proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost 742 self.proxies['https'] = 'https://' + proxyhost + proxyselector 743 if data is None: 744 return self.open(newurl) 745 else: 746 return self.open(newurl, data) 747 748 def retry_http_basic_auth(self, url, realm, data=None): 749 host, selector = splithost(url) 750 i = host.find('@') + 1 751 host = host[i:] 752 user, passwd = self.get_user_passwd(host, realm, i) 753 if not (user or passwd): return None 754 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 755 newurl = 'http://' + host + selector 756 if data is None: 757 return self.open(newurl) 758 else: 759 return self.open(newurl, data) 760 761 def retry_https_basic_auth(self, url, realm, data=None): 762 host, selector = splithost(url) 763 i = host.find('@') + 1 764 host = host[i:] 765 user, passwd = self.get_user_passwd(host, realm, i) 766 if not (user or passwd): return None 767 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host 768 newurl = 'https://' + host + selector 769 if data is None: 770 return self.open(newurl) 771 else: 772 return self.open(newurl, data) 773 774 def get_user_passwd(self, host, realm, clear_cache = 0): 775 key = realm + '@' + host.lower() 776 if key in self.auth_cache: 777 if clear_cache: 778 del self.auth_cache[key] 779 else: 780 return self.auth_cache[key] 781 user, passwd = self.prompt_user_passwd(host, realm) 782 if user or passwd: self.auth_cache[key] = (user, passwd) 783 return user, passwd 784 785 def prompt_user_passwd(self, host, realm): 786 """Override this in a GUI environment!""" 787 import getpass 788 try: 789 user = raw_input("Enter username for %s at %s: " % (realm, 790 host)) 791 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 792 (user, realm, host)) 793 return user, passwd 794 except KeyboardInterrupt: 795 print 796 return None, None 797 798 799# Utility functions 800 801_localhost = None 802def localhost(): 803 """Return the IP address of the magic hostname 'localhost'.""" 804 global _localhost 805 if _localhost is None: 806 _localhost = socket.gethostbyname('localhost') 807 return _localhost 808 809_thishost = None 810def thishost(): 811 """Return the IP address of the current host.""" 812 global _thishost 813 if _thishost is None: 814 _thishost = socket.gethostbyname(socket.gethostname()) 815 return _thishost 816 817_ftperrors = None 818def ftperrors(): 819 """Return the set of errors raised by the FTP class.""" 820 global _ftperrors 821 if _ftperrors is None: 822 import ftplib 823 _ftperrors = ftplib.all_errors 824 return _ftperrors 825 826_noheaders = None 827def noheaders(): 828 """Return an empty mimetools.Message object.""" 829 global _noheaders 830 if _noheaders is None: 831 import mimetools 832 try: 833 from cStringIO import StringIO 834 except ImportError: 835 from StringIO import StringIO 836 _noheaders = mimetools.Message(StringIO(), 0) 837 _noheaders.fp.close() # Recycle file descriptor 838 return _noheaders 839 840 841# Utility classes 842 843class ftpwrapper: 844 """Class used by open_ftp() for cache of open FTP connections.""" 845 846 def __init__(self, user, passwd, host, port, dirs, 847 timeout=socket._GLOBAL_DEFAULT_TIMEOUT): 848 self.user = user 849 self.passwd = passwd 850 self.host = host 851 self.port = port 852 self.dirs = dirs 853 self.timeout = timeout 854 self.init() 855 856 def init(self): 857 import ftplib 858 self.busy = 0 859 self.ftp = ftplib.FTP() 860 self.ftp.connect(self.host, self.port, self.timeout) 861 self.ftp.login(self.user, self.passwd) 862 for dir in self.dirs: 863 self.ftp.cwd(dir) 864 865 def retrfile(self, file, type): 866 import ftplib 867 self.endtransfer() 868 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 869 else: cmd = 'TYPE ' + type; isdir = 0 870 try: 871 self.ftp.voidcmd(cmd) 872 except ftplib.all_errors: 873 self.init() 874 self.ftp.voidcmd(cmd) 875 conn = None 876 if file and not isdir: 877 # Try to retrieve as a file 878 try: 879 cmd = 'RETR ' + file 880 conn = self.ftp.ntransfercmd(cmd) 881 except ftplib.error_perm, reason: 882 if str(reason)[:3] != '550': 883 raise IOError, ('ftp error', reason), sys.exc_info()[2] 884 if not conn: 885 # Set transfer mode to ASCII! 886 self.ftp.voidcmd('TYPE A') 887 # Try a directory listing. Verify that directory exists. 888 if file: 889 pwd = self.ftp.pwd() 890 try: 891 try: 892 self.ftp.cwd(file) 893 except ftplib.error_perm, reason: 894 raise IOError, ('ftp error', reason), sys.exc_info()[2] 895 finally: 896 self.ftp.cwd(pwd) 897 cmd = 'LIST ' + file 898 else: 899 cmd = 'LIST' 900 conn = self.ftp.ntransfercmd(cmd) 901 self.busy = 1 902 # Pass back both a suitably decorated object and a retrieval length 903 return (addclosehook(conn[0].makefile('rb'), 904 self.endtransfer), conn[1]) 905 def endtransfer(self): 906 if not self.busy: 907 return 908 self.busy = 0 909 try: 910 self.ftp.voidresp() 911 except ftperrors(): 912 pass 913 914 def close(self): 915 self.endtransfer() 916 try: 917 self.ftp.close() 918 except ftperrors(): 919 pass 920 921class addbase: 922 """Base class for addinfo and addclosehook.""" 923 924 def __init__(self, fp): 925 self.fp = fp 926 self.read = self.fp.read 927 self.readline = self.fp.readline 928 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines 929 if hasattr(self.fp, "fileno"): 930 self.fileno = self.fp.fileno 931 else: 932 self.fileno = lambda: None 933 if hasattr(self.fp, "__iter__"): 934 self.__iter__ = self.fp.__iter__ 935 if hasattr(self.fp, "next"): 936 self.next = self.fp.next 937 938 def __repr__(self): 939 return '<%s at %r whose fp = %r>' % (self.__class__.__name__, 940 id(self), self.fp) 941 942 def close(self): 943 self.read = None 944 self.readline = None 945 self.readlines = None 946 self.fileno = None 947 if self.fp: self.fp.close() 948 self.fp = None 949 950class addclosehook(addbase): 951 """Class to add a close hook to an open file.""" 952 953 def __init__(self, fp, closehook, *hookargs): 954 addbase.__init__(self, fp) 955 self.closehook = closehook 956 self.hookargs = hookargs 957 958 def close(self): 959 addbase.close(self) 960 if self.closehook: 961 self.closehook(*self.hookargs) 962 self.closehook = None 963 self.hookargs = None 964 965class addinfo(addbase): 966 """class to add an info() method to an open file.""" 967 968 def __init__(self, fp, headers): 969 addbase.__init__(self, fp) 970 self.headers = headers 971 972 def info(self): 973 return self.headers 974 975class addinfourl(addbase): 976 """class to add info() and geturl() methods to an open file.""" 977 978 def __init__(self, fp, headers, url, code=None): 979 addbase.__init__(self, fp) 980 self.headers = headers 981 self.url = url 982 self.code = code 983 984 def info(self): 985 return self.headers 986 987 def getcode(self): 988 return self.code 989 990 def geturl(self): 991 return self.url 992 993 994# Utilities to parse URLs (most of these return None for missing parts): 995# unwrap('<URL:type://host/path>') --> 'type://host/path' 996# splittype('type:opaquestring') --> 'type', 'opaquestring' 997# splithost('//host[:port]/path') --> 'host[:port]', '/path' 998# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' 999# splitpasswd('user:passwd') -> 'user', 'passwd' 1000# splitport('host:port') --> 'host', 'port' 1001# splitquery('/path?query') --> '/path', 'query' 1002# splittag('/path#tag') --> '/path', 'tag' 1003# splitattr('/path;attr1=value1;attr2=value2;...') -> 1004# '/path', ['attr1=value1', 'attr2=value2', ...] 1005# splitvalue('attr=value') --> 'attr', 'value' 1006# unquote('abc%20def') -> 'abc def' 1007# quote('abc def') -> 'abc%20def') 1008 1009try: 1010 unicode 1011except NameError: 1012 def _is_unicode(x): 1013 return 0 1014else: 1015 def _is_unicode(x): 1016 return isinstance(x, unicode) 1017 1018def toBytes(url): 1019 """toBytes(u"URL") --> 'URL'.""" 1020 # Most URL schemes require ASCII. If that changes, the conversion 1021 # can be relaxed 1022 if _is_unicode(url): 1023 try: 1024 url = url.encode("ASCII") 1025 except UnicodeError: 1026 raise UnicodeError("URL " + repr(url) + 1027 " contains non-ASCII characters") 1028 return url 1029 1030def unwrap(url): 1031 """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" 1032 url = url.strip() 1033 if url[:1] == '<' and url[-1:] == '>': 1034 url = url[1:-1].strip() 1035 if url[:4] == 'URL:': url = url[4:].strip() 1036 return url 1037 1038_typeprog = None 1039def splittype(url): 1040 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 1041 global _typeprog 1042 if _typeprog is None: 1043 import re 1044 _typeprog = re.compile('^([^/:]+):') 1045 1046 match = _typeprog.match(url) 1047 if match: 1048 scheme = match.group(1) 1049 return scheme.lower(), url[len(scheme) + 1:] 1050 return None, url 1051 1052_hostprog = None 1053def splithost(url): 1054 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 1055 global _hostprog 1056 if _hostprog is None: 1057 import re 1058 _hostprog = re.compile('^//([^/?]*)(.*)$') 1059 1060 match = _hostprog.match(url) 1061 if match: return match.group(1, 2) 1062 return None, url 1063 1064_userprog = None 1065def splituser(host): 1066 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 1067 global _userprog 1068 if _userprog is None: 1069 import re 1070 _userprog = re.compile('^(.*)@(.*)$') 1071 1072 match = _userprog.match(host) 1073 if match: return map(unquote, match.group(1, 2)) 1074 return None, host 1075 1076_passwdprog = None 1077def splitpasswd(user): 1078 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 1079 global _passwdprog 1080 if _passwdprog is None: 1081 import re 1082 _passwdprog = re.compile('^([^:]*):(.*)$') 1083 1084 match = _passwdprog.match(user) 1085 if match: return match.group(1, 2) 1086 return user, None 1087 1088# splittag('/path#tag') --> '/path', 'tag' 1089_portprog = None 1090def splitport(host): 1091 """splitport('host:port') --> 'host', 'port'.""" 1092 global _portprog 1093 if _portprog is None: 1094 import re 1095 _portprog = re.compile('^(.*):([0-9]+)$') 1096 1097 match = _portprog.match(host) 1098 if match: return match.group(1, 2) 1099 return host, None 1100 1101_nportprog = None 1102def splitnport(host, defport=-1): 1103 """Split host and port, returning numeric port. 1104 Return given default port if no ':' found; defaults to -1. 1105 Return numerical port if a valid number are found after ':'. 1106 Return None if ':' but not a valid number.""" 1107 global _nportprog 1108 if _nportprog is None: 1109 import re 1110 _nportprog = re.compile('^(.*):(.*)$') 1111 1112 match = _nportprog.match(host) 1113 if match: 1114 host, port = match.group(1, 2) 1115 try: 1116 if not port: raise ValueError, "no digits" 1117 nport = int(port) 1118 except ValueError: 1119 nport = None 1120 return host, nport 1121 return host, defport 1122 1123_queryprog = None 1124def splitquery(url): 1125 """splitquery('/path?query') --> '/path', 'query'.""" 1126 global _queryprog 1127 if _queryprog is None: 1128 import re 1129 _queryprog = re.compile('^(.*)\?([^?]*)$') 1130 1131 match = _queryprog.match(url) 1132 if match: return match.group(1, 2) 1133 return url, None 1134 1135_tagprog = None 1136def splittag(url): 1137 """splittag('/path#tag') --> '/path', 'tag'.""" 1138 global _tagprog 1139 if _tagprog is None: 1140 import re 1141 _tagprog = re.compile('^(.*)#([^#]*)$') 1142 1143 match = _tagprog.match(url) 1144 if match: return match.group(1, 2) 1145 return url, None 1146 1147def splitattr(url): 1148 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1149 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1150 words = url.split(';') 1151 return words[0], words[1:] 1152 1153_valueprog = None 1154def splitvalue(attr): 1155 """splitvalue('attr=value') --> 'attr', 'value'.""" 1156 global _valueprog 1157 if _valueprog is None: 1158 import re 1159 _valueprog = re.compile('^([^=]*)=(.*)$') 1160 1161 match = _valueprog.match(attr) 1162 if match: return match.group(1, 2) 1163 return attr, None 1164 1165_hexdig = '0123456789ABCDEFabcdef' 1166_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig) 1167 1168def unquote(s): 1169 """unquote('abc%20def') -> 'abc def'.""" 1170 res = s.split('%') 1171 for i in xrange(1, len(res)): 1172 item = res[i] 1173 try: 1174 res[i] = _hextochr[item[:2]] + item[2:] 1175 except KeyError: 1176 res[i] = '%' + item 1177 except UnicodeDecodeError: 1178 res[i] = unichr(int(item[:2], 16)) + item[2:] 1179 return "".join(res) 1180 1181def unquote_plus(s): 1182 """unquote('%7e/abc+def') -> '~/abc def'""" 1183 s = s.replace('+', ' ') 1184 return unquote(s) 1185 1186always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 1187 'abcdefghijklmnopqrstuvwxyz' 1188 '0123456789' '_.-') 1189_safemaps = {} 1190 1191def quote(s, safe = '/'): 1192 """quote('abc def') -> 'abc%20def' 1193 1194 Each part of a URL, e.g. the path info, the query, etc., has a 1195 different set of reserved characters that must be quoted. 1196 1197 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists 1198 the following reserved characters. 1199 1200 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 1201 "$" | "," 1202 1203 Each of these characters is reserved in some component of a URL, 1204 but not necessarily in all of them. 1205 1206 By default, the quote function is intended for quoting the path 1207 section of a URL. Thus, it will not encode '/'. This character 1208 is reserved, but in typical usage the quote function is being 1209 called on a path where the existing slash characters are used as 1210 reserved characters. 1211 """ 1212 cachekey = (safe, always_safe) 1213 try: 1214 safe_map = _safemaps[cachekey] 1215 except KeyError: 1216 safe += always_safe 1217 safe_map = {} 1218 for i in range(256): 1219 c = chr(i) 1220 safe_map[c] = (c in safe) and c or ('%%%02X' % i) 1221 _safemaps[cachekey] = safe_map 1222 res = map(safe_map.__getitem__, s) 1223 return ''.join(res) 1224 1225def quote_plus(s, safe = ''): 1226 """Quote the query fragment of a URL; replacing ' ' with '+'""" 1227 if ' ' in s: 1228 s = quote(s, safe + ' ') 1229 return s.replace(' ', '+') 1230 return quote(s, safe) 1231 1232def urlencode(query,doseq=0): 1233 """Encode a sequence of two-element tuples or dictionary into a URL query string. 1234 1235 If any values in the query arg are sequences and doseq is true, each 1236 sequence element is converted to a separate parameter. 1237 1238 If the query arg is a sequence of two-element tuples, the order of the 1239 parameters in the output will match the order of parameters in the 1240 input. 1241 """ 1242 1243 if hasattr(query,"items"): 1244 # mapping objects 1245 query = query.items() 1246 else: 1247 # it's a bother at times that strings and string-like objects are 1248 # sequences... 1249 try: 1250 # non-sequence items should not work with len() 1251 # non-empty strings will fail this 1252 if len(query) and not isinstance(query[0], tuple): 1253 raise TypeError 1254 # zero-length sequences of all types will get here and succeed, 1255 # but that's a minor nit - since the original implementation 1256 # allowed empty dicts that type of behavior probably should be 1257 # preserved for consistency 1258 except TypeError: 1259 ty,va,tb = sys.exc_info() 1260 raise TypeError, "not a valid non-string sequence or mapping object", tb 1261 1262 l = [] 1263 if not doseq: 1264 # preserve old behavior 1265 for k, v in query: 1266 k = quote_plus(str(k)) 1267 v = quote_plus(str(v)) 1268 l.append(k + '=' + v) 1269 else: 1270 for k, v in query: 1271 k = quote_plus(str(k)) 1272 if isinstance(v, str): 1273 v = quote_plus(v) 1274 l.append(k + '=' + v) 1275 elif _is_unicode(v): 1276 # is there a reasonable way to convert to ASCII? 1277 # encode generates a string, but "replace" or "ignore" 1278 # lose information and "strict" can raise UnicodeError 1279 v = quote_plus(v.encode("ASCII","replace")) 1280 l.append(k + '=' + v) 1281 else: 1282 try: 1283 # is this a sufficient test for sequence-ness? 1284 x = len(v) 1285 except TypeError: 1286 # not a sequence 1287 v = quote_plus(str(v)) 1288 l.append(k + '=' + v) 1289 else: 1290 # loop over the sequence 1291 for elt in v: 1292 l.append(k + '=' + quote_plus(str(elt))) 1293 return '&'.join(l) 1294 1295# Proxy handling 1296def getproxies_environment(): 1297 """Return a dictionary of scheme -> proxy server URL mappings. 1298 1299 Scan the environment for variables named <scheme>_proxy; 1300 this seems to be the standard convention. If you need a 1301 different way, you can pass a proxies dictionary to the 1302 [Fancy]URLopener constructor. 1303 1304 """ 1305 proxies = {} 1306 for name, value in os.environ.items(): 1307 name = name.lower() 1308 if value and name[-6:] == '_proxy': 1309 proxies[name[:-6]] = value 1310 return proxies 1311 1312def proxy_bypass_environment(host): 1313 """Test if proxies should not be used for a particular host. 1314 1315 Checks the environment for a variable named no_proxy, which should 1316 be a list of DNS suffixes separated by commas, or '*' for all hosts. 1317 """ 1318 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '') 1319 # '*' is special case for always bypass 1320 if no_proxy == '*': 1321 return 1 1322 # strip port off host 1323 hostonly, port = splitport(host) 1324 # check if the host ends with any of the DNS suffixes 1325 for name in no_proxy.split(','): 1326 if name and (hostonly.endswith(name) or host.endswith(name)): 1327 return 1 1328 # otherwise, don't bypass 1329 return 0 1330 1331 1332if sys.platform == 'darwin': 1333 from _scproxy import _get_proxy_settings, _get_proxies 1334 1335 def proxy_bypass_macosx_sysconf(host): 1336 """ 1337 Return True iff this host shouldn't be accessed using a proxy 1338 1339 This function uses the MacOSX framework SystemConfiguration 1340 to fetch the proxy information. 1341 """ 1342 import re 1343 import socket 1344 from fnmatch import fnmatch 1345 1346 hostonly, port = splitport(host) 1347 1348 def ip2num(ipAddr): 1349 parts = ipAddr.split('.') 1350 parts = map(int, parts) 1351 if len(parts) != 4: 1352 parts = (parts + [0, 0, 0, 0])[:4] 1353 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] 1354 1355 proxy_settings = _get_proxy_settings() 1356 1357 # Check for simple host names: 1358 if '.' not in host: 1359 if proxy_settings['exclude_simple']: 1360 return True 1361 1362 hostIP = None 1363 1364 for value in proxy_settings.get('exceptions', ()): 1365 # Items in the list are strings like these: *.local, 169.254/16 1366 if not value: continue 1367 1368 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) 1369 if m is not None: 1370 if hostIP is None: 1371 try: 1372 hostIP = socket.gethostbyname(hostonly) 1373 hostIP = ip2num(hostIP) 1374 except socket.error: 1375 continue 1376 1377 base = ip2num(m.group(1)) 1378 mask = m.group(2) 1379 if mask is None: 1380 mask = 8 * (m.group(1).count('.') + 1) 1381 1382 else: 1383 mask = int(mask[1:]) 1384 mask = 32 - mask 1385 1386 if (hostIP >> mask) == (base >> mask): 1387 return True 1388 1389 elif fnmatch(host, value): 1390 return True 1391 1392 return False 1393 1394 1395 def getproxies_macosx_sysconf(): 1396 """Return a dictionary of scheme -> proxy server URL mappings. 1397 1398 This function uses the MacOSX framework SystemConfiguration 1399 to fetch the proxy information. 1400 """ 1401 return _get_proxies() 1402 1403 1404 1405 def proxy_bypass(host): 1406 if getproxies_environment(): 1407 return proxy_bypass_environment(host) 1408 else: 1409 return proxy_bypass_macosx_sysconf(host) 1410 1411 def getproxies(): 1412 return getproxies_environment() or getproxies_macosx_sysconf() 1413 1414elif os.name == 'nt': 1415 def getproxies_registry(): 1416 """Return a dictionary of scheme -> proxy server URL mappings. 1417 1418 Win32 uses the registry to store proxies. 1419 1420 """ 1421 proxies = {} 1422 try: 1423 import _winreg 1424 except ImportError: 1425 # Std module, so should be around - but you never know! 1426 return proxies 1427 try: 1428 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1429 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1430 proxyEnable = _winreg.QueryValueEx(internetSettings, 1431 'ProxyEnable')[0] 1432 if proxyEnable: 1433 # Returned as Unicode but problems if not converted to ASCII 1434 proxyServer = str(_winreg.QueryValueEx(internetSettings, 1435 'ProxyServer')[0]) 1436 if '=' in proxyServer: 1437 # Per-protocol settings 1438 for p in proxyServer.split(';'): 1439 protocol, address = p.split('=', 1) 1440 # See if address has a type:// prefix 1441 import re 1442 if not re.match('^([^/:]+)://', address): 1443 address = '%s://%s' % (protocol, address) 1444 proxies[protocol] = address 1445 else: 1446 # Use one setting for all protocols 1447 if proxyServer[:5] == 'http:': 1448 proxies['http'] = proxyServer 1449 else: 1450 proxies['http'] = 'http://%s' % proxyServer 1451 proxies['https'] = 'http://%s' % proxyServer 1452 proxies['ftp'] = 'ftp://%s' % proxyServer 1453 internetSettings.Close() 1454 except (WindowsError, ValueError, TypeError): 1455 # Either registry key not found etc, or the value in an 1456 # unexpected format. 1457 # proxies already set up to be empty so nothing to do 1458 pass 1459 return proxies 1460 1461 def getproxies(): 1462 """Return a dictionary of scheme -> proxy server URL mappings. 1463 1464 Returns settings gathered from the environment, if specified, 1465 or the registry. 1466 1467 """ 1468 return getproxies_environment() or getproxies_registry() 1469 1470 def proxy_bypass_registry(host): 1471 try: 1472 import _winreg 1473 import re 1474 except ImportError: 1475 # Std modules, so should be around - but you never know! 1476 return 0 1477 try: 1478 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 1479 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 1480 proxyEnable = _winreg.QueryValueEx(internetSettings, 1481 'ProxyEnable')[0] 1482 proxyOverride = str(_winreg.QueryValueEx(internetSettings, 1483 'ProxyOverride')[0]) 1484 # ^^^^ Returned as Unicode but problems if not converted to ASCII 1485 except WindowsError: 1486 return 0 1487 if not proxyEnable or not proxyOverride: 1488 return 0 1489 # try to make a host list from name and IP address. 1490 rawHost, port = splitport(host) 1491 host = [rawHost] 1492 try: 1493 addr = socket.gethostbyname(rawHost) 1494 if addr != rawHost: 1495 host.append(addr) 1496 except socket.error: 1497 pass 1498 try: 1499 fqdn = socket.getfqdn(rawHost) 1500 if fqdn != rawHost: 1501 host.append(fqdn) 1502 except socket.error: 1503 pass 1504 # make a check value list from the registry entry: replace the 1505 # '<local>' string by the localhost entry and the corresponding 1506 # canonical entry. 1507 proxyOverride = proxyOverride.split(';') 1508 i = 0 1509 while i < len(proxyOverride): 1510 if proxyOverride[i] == '<local>': 1511 proxyOverride[i:i+1] = ['localhost', 1512 '127.0.0.1', 1513 socket.gethostname(), 1514 socket.gethostbyname( 1515 socket.gethostname())] 1516 i += 1 1517 # print proxyOverride 1518 # now check if we match one of the registry values. 1519 for test in proxyOverride: 1520 test = test.replace(".", r"\.") # mask dots 1521 test = test.replace("*", r".*") # change glob sequence 1522 test = test.replace("?", r".") # change glob char 1523 for val in host: 1524 # print "%s <--> %s" %( test, val ) 1525 if re.match(test, val, re.I): 1526 return 1 1527 return 0 1528 1529 def proxy_bypass(host): 1530 """Return a dictionary of scheme -> proxy server URL mappings. 1531 1532 Returns settings gathered from the environment, if specified, 1533 or the registry. 1534 1535 """ 1536 if getproxies_environment(): 1537 return proxy_bypass_environment(host) 1538 else: 1539 return proxy_bypass_registry(host) 1540 1541else: 1542 # By default use environment variables 1543 getproxies = getproxies_environment 1544 proxy_bypass = proxy_bypass_environment 1545 1546# Test and time quote() and unquote() 1547def test1(): 1548 s = '' 1549 for i in range(256): s = s + chr(i) 1550 s = s*4 1551 t0 = time.time() 1552 qs = quote(s) 1553 uqs = unquote(qs) 1554 t1 = time.time() 1555 if uqs != s: 1556 print 'Wrong!' 1557 print repr(s) 1558 print repr(qs) 1559 print repr(uqs) 1560 print round(t1 - t0, 3), 'sec' 1561 1562 1563def reporthook(blocknum, blocksize, totalsize): 1564 # Report during remote transfers 1565 print "Block number: %d, Block size: %d, Total size: %d" % ( 1566 blocknum, blocksize, totalsize) 1567 1568# Test program 1569def test(args=[]): 1570 if not args: 1571 args = [ 1572 '/etc/passwd', 1573 'file:/etc/passwd', 1574 'file://localhost/etc/passwd', 1575 'ftp://ftp.gnu.org/pub/README', 1576 'http://www.python.org/index.html', 1577 ] 1578 if hasattr(URLopener, "open_https"): 1579 args.append('https://synergy.as.cmu.edu/~geek/') 1580 try: 1581 for url in args: 1582 print '-'*10, url, '-'*10 1583 fn, h = urlretrieve(url, None, reporthook) 1584 print fn 1585 if h: 1586 print '======' 1587 for k in h.keys(): print k + ':', h[k] 1588 print '======' 1589 fp = open(fn, 'rb') 1590 data = fp.read() 1591 del fp 1592 if '\r' in data: 1593 table = string.maketrans("", "") 1594 data = data.translate(table, "\r") 1595 print data 1596 fn, h = None, None 1597 print '-'*40 1598 finally: 1599 urlcleanup() 1600 1601def main(): 1602 import getopt, sys 1603 try: 1604 opts, args = getopt.getopt(sys.argv[1:], "th") 1605 except getopt.error, msg: 1606 print msg 1607 print "Use -h for help" 1608 return 1609 t = 0 1610 for o, a in opts: 1611 if o == '-t': 1612 t = t + 1 1613 if o == '-h': 1614 print "Usage: python urllib.py [-t] [url ...]" 1615 print "-t runs self-test;", 1616 print "otherwise, contents of urls are printed" 1617 return 1618 if t: 1619 if t > 1: 1620 test1() 1621 test(args) 1622 else: 1623 if not args: 1624 print "Use -h for help" 1625 for url in args: 1626 print urlopen(url).read(), 1627 1628# Run test program when run as a script 1629if __name__ == '__main__': 1630 main() 1631