urllib.py revision 332e14437c6e7461e9756f75e4fac3f9d2043023
1# Open an arbitrary URL
2#
3# See the following document for more info on URLs:
4# "Names and Addresses, URIs, URLs, URNs, URCs", at
5# http://www.w3.org/pub/WWW/Addressing/Overview.html
6#
7# See also the HTTP spec (from which the error codes are derived):
8# "HTTP - Hypertext Transfer Protocol", at
9# http://www.w3.org/pub/WWW/Protocols/
10#
11# Related standards and specs:
12# - RFC1808: the "relative URL" spec. (authoritative status)
13# - RFC1738 - the "URL standard". (authoritative status)
14# - RFC1630 - the "URI spec". (informational status)
15#
16# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
18# readlines(), fileno(), close() and info().  The read*(), fileno()
19# and close() methods work like those of open files.
20# The info() method returns a mimetools.Message object which can be
21# used to query various info about the object, if available.
22# (mimetools.Message objects are queried with the getheader() method.)
23
24import string
25import socket
26import os
27import sys
28
29
# Module version; URLopener embeds it in its default User-agent header.
__version__ = '1.8'

# URLopener.open_ftp() prunes the FTP connection cache once it holds
# more than this many entries.
MAXFTPCACHE = 10		# Trim the ftp cache beyond this size
33
# Conversion between local path names and the path part of a URL.
# Mac and NT get platform-specific implementations from their own
# modules; on every other platform both conversions are the identity.
if os.name not in ('mac', 'nt'):
	def url2pathname(pathname):
		return pathname
	def pathname2url(pathname):
		return pathname
elif os.name == 'mac':
	from macurl2path import url2pathname, pathname2url
else:
	from nturl2path import url2pathname, pathname2url
44
45# This really consists of two pieces:
46# (1) a class which handles opening of all sorts of URLs
47#     (plus assorted utilities etc.)
48# (2) a set of functions for parsing URLs
49# XXX Should these be separated out into different modules?
50
51
# Shortcut for basic usage: open a URL through a module-wide
# FancyURLopener that is created on first use.  The data argument is
# only forwarded to the opener when the caller supplied one.
_urlopener = None
def urlopen(url, data=None):
	global _urlopener
	if not _urlopener:
		_urlopener = FancyURLopener()
	opener = _urlopener
	if data is not None:
		return opener.open(url, data)
	return opener.open(url)
# Shortcut: retrieve a URL into a file through the shared opener and
# return whatever the opener's retrieve() returns.  A false filename
# (None or '') lets the opener choose where the data goes.
def urlretrieve(url, filename=None):
	global _urlopener
	if not _urlopener:
		_urlopener = FancyURLopener()
	opener = _urlopener
	if not filename:
		return opener.retrieve(url)
	return opener.retrieve(url, filename)
# Shortcut: let the shared opener clean up after itself.  Does nothing
# when no shared opener has been created yet.
def urlcleanup():
	opener = _urlopener
	if opener:
		opener.cleanup()
73
74
75# Class to open URLs.
76# This is a class rather than just a subroutine because we may need
77# more than one set of global protocol-specific options.
78# Note -- this is a base class for those who don't want the
79# automatic handling of errors type 302 (relocated) and 401
80# (authorization needed).
81ftpcache = {}
82class URLopener:
83
84	__tempfiles = []
85
86	# Constructor
87	def __init__(self, proxies=None):
88		if proxies is None:
89			proxies = getproxies()
90		self.proxies = proxies
91		server_version = "Python-urllib/%s" % __version__
92		self.addheaders = [('User-agent', server_version)]
93		self.__tempfiles = []
94		self.tempcache = None
95		# Undocumented feature: if you assign {} to tempcache,
96		# it is used to cache files retrieved with
97		# self.retrieve().  This is not enabled by default
98		# since it does not work for changing documents (and I
99		# haven't got the logic to check expiration headers
100		# yet).
101		self.ftpcache = ftpcache
102		# Undocumented feature: you can use a different
103		# ftp cache by assigning to the .ftpcache member;
104		# in case you want logically independent URL openers
105
106	def __del__(self):
107		self.close()
108
109	def close(self):
110		self.cleanup()
111
112	def cleanup(self):
113		if self.__tempfiles:
114			import os
115			for file in self.__tempfiles:
116				try:
117					os.unlink(file)
118				except os.error:
119					pass
120		URLopener.__tempfiles = []
121		self.tempcache = None
122
123	# Add a header to be used by the HTTP interface only
124	# e.g. u.addheader('Accept', 'sound/basic')
125	def addheader(self, *args):
126		self.addheaders.append(args)
127
128	# External interface
129	# Use URLopener().open(file) instead of open(file, 'r')
130	def open(self, fullurl, data=None):
131		fullurl = unwrap(fullurl)
132		type, url = splittype(fullurl)
133 		if not type: type = 'file'
134		self.openedurl = '%s:%s' % (type, url)
135		if self.proxies.has_key(type):
136			proxy = self.proxies[type]
137			type, proxy = splittype(proxy)
138			host, selector = splithost(proxy)
139			url = (host, fullurl) # Signal special case to open_*()
140		name = 'open_' + type
141		if '-' in name:
142		        # replace - with _
143			name = string.join(string.split(name, '-'), '_')
144		if not hasattr(self, name):
145			if data is None:
146				return self.open_unknown(fullurl)
147			else:
148				return self.open_unknown(fullurl, data)
149		try:
150			if data is None:
151				return getattr(self, name)(url)
152			else:
153				return getattr(self, name)(url, data)
154		except socket.error, msg:
155			raise IOError, ('socket error', msg), sys.exc_info()[2]
156
157	# Overridable interface to open unknown URL type
158	def open_unknown(self, fullurl, data=None):
159		type, url = splittype(fullurl)
160		raise IOError, ('url error', 'unknown url type', type)
161
162	# External interface
163	# retrieve(url) returns (filename, None) for a local object
164	# or (tempfilename, headers) for a remote object
165	def retrieve(self, url, filename=None):
166		if self.tempcache and self.tempcache.has_key(url):
167			return self.tempcache[url]
168		url1 = unwrap(url)
169		self.openedurl = url1
170		if self.tempcache and self.tempcache.has_key(url1):
171			self.tempcache[url] = self.tempcache[url1]
172			return self.tempcache[url1]
173		type, url1 = splittype(url1)
174		if not filename and (not type or type == 'file'):
175			try:
176				fp = self.open_local_file(url1)
177				del fp
178				return url2pathname(splithost(url1)[1]), None
179			except IOError, msg:
180				pass
181		fp = self.open(url)
182		headers = fp.info()
183		if not filename:
184		    import tempfile
185		    filename = tempfile.mktemp()
186		    self.__tempfiles.append(filename)
187		result = filename, headers
188		if self.tempcache is not None:
189			self.tempcache[url] = result
190		tfp = open(filename, 'wb')
191		bs = 1024*8
192		block = fp.read(bs)
193		while block:
194			tfp.write(block)
195			block = fp.read(bs)
196		fp.close()
197		tfp.close()
198		del fp
199		del tfp
200		return result
201
202	# Each method named open_<type> knows how to open that type of URL
203
204	# Use HTTP protocol
205	def open_http(self, url, data=None):
206		import httplib
207		if type(url) is type(""):
208			host, selector = splithost(url)
209			user_passwd, host = splituser(host)
210			realhost = host
211		else:
212			host, selector = url
213			urltype, rest = splittype(selector)
214			user_passwd = None
215			if string.lower(urltype) != 'http':
216			    realhost = None
217			else:
218			    realhost, rest = splithost(rest)
219			    user_passwd, realhost = splituser(realhost)
220			    if user_passwd:
221				selector = "%s://%s%s" % (urltype,
222							  realhost, rest)
223			#print "proxy via http:", host, selector
224		if not host: raise IOError, ('http error', 'no host given')
225		if user_passwd:
226			import base64
227			auth = string.strip(base64.encodestring(user_passwd))
228		else:
229			auth = None
230		h = httplib.HTTP(host)
231		if data is not None:
232			h.putrequest('POST', selector)
233			h.putheader('Content-type',
234				    'application/x-www-form-urlencoded')
235			h.putheader('Content-length', '%d' % len(data))
236		else:
237			h.putrequest('GET', selector)
238		if auth: h.putheader('Authorization', 'Basic %s' % auth)
239		if realhost: h.putheader('Host', realhost)
240		for args in self.addheaders: apply(h.putheader, args)
241		h.endheaders()
242		if data is not None:
243			h.send(data + '\r\n')
244		errcode, errmsg, headers = h.getreply()
245		fp = h.getfile()
246		if errcode == 200:
247			return addinfourl(fp, headers, self.openedurl)
248		else:
249			return self.http_error(url,
250					       fp, errcode, errmsg, headers)
251
252	# Handle http errors.
253	# Derived class can override this, or provide specific handlers
254	# named http_error_DDD where DDD is the 3-digit error code
255	def http_error(self, url, fp, errcode, errmsg, headers):
256		# First check if there's a specific handler for this error
257		name = 'http_error_%d' % errcode
258		if hasattr(self, name):
259			method = getattr(self, name)
260			result = method(url, fp, errcode, errmsg, headers)
261			if result: return result
262		return self.http_error_default(
263			url, fp, errcode, errmsg, headers)
264
265	# Default http error handler: close the connection and raises IOError
266	def http_error_default(self, url, fp, errcode, errmsg, headers):
267		void = fp.read()
268		fp.close()
269		raise IOError, ('http error', errcode, errmsg, headers)
270
271	# Use Gopher protocol
272	def open_gopher(self, url):
273		import gopherlib
274		host, selector = splithost(url)
275		if not host: raise IOError, ('gopher error', 'no host given')
276		type, selector = splitgophertype(selector)
277		selector, query = splitquery(selector)
278		selector = unquote(selector)
279		if query:
280			query = unquote(query)
281			fp = gopherlib.send_query(selector, query, host)
282		else:
283			fp = gopherlib.send_selector(selector, host)
284		return addinfourl(fp, noheaders(), self.openedurl)
285
286	# Use local file or FTP depending on form of URL
287	def open_file(self, url):
288	    if url[:2] == '//' and url[2:3] != '/':
289		return self.open_ftp(url)
290	    else:
291		return self.open_local_file(url)
292
293	# Use local file
294	def open_local_file(self, url):
295		host, file = splithost(url)
296		if not host:
297			return addinfourl(
298				open(url2pathname(file), 'rb'),
299				noheaders(), 'file:'+file)
300		host, port = splitport(host)
301		if not port and socket.gethostbyname(host) in (
302			  localhost(), thishost()):
303			file = unquote(file)
304			return addinfourl(
305				open(url2pathname(file), 'rb'),
306				noheaders(), 'file:'+file)
307		raise IOError, ('local file error', 'not on local host')
308
309	# Use FTP protocol
310	def open_ftp(self, url):
311		host, path = splithost(url)
312		if not host: raise IOError, ('ftp error', 'no host given')
313		host, port = splitport(host)
314		user, host = splituser(host)
315		if user: user, passwd = splitpasswd(user)
316		else: passwd = None
317		host = socket.gethostbyname(host)
318		if not port:
319			import ftplib
320			port = ftplib.FTP_PORT
321		path, attrs = splitattr(path)
322		dirs = string.splitfields(path, '/')
323		dirs, file = dirs[:-1], dirs[-1]
324		if dirs and not dirs[0]: dirs = dirs[1:]
325		key = (user, host, port, string.joinfields(dirs, '/'))
326		if len(self.ftpcache) > MAXFTPCACHE:
327			# Prune the cache, rather arbitrarily
328			for k in self.ftpcache.keys():
329				if k != key:
330					v = self.ftpcache[k]
331					del self.ftpcache[k]
332					v.close()
333		try:
334			if not self.ftpcache.has_key(key):
335				self.ftpcache[key] = \
336						   ftpwrapper(user, passwd,
337							      host, port, dirs)
338			if not file: type = 'D'
339			else: type = 'I'
340			for attr in attrs:
341				attr, value = splitvalue(attr)
342				if string.lower(attr) == 'type' and \
343				   value in ('a', 'A', 'i', 'I', 'd', 'D'):
344					type = string.upper(value)
345			return addinfourl(
346				self.ftpcache[key].retrfile(file, type),
347				noheaders(), self.openedurl)
348		except ftperrors(), msg:
349			raise IOError, ('ftp error', msg), sys.exc_info()[2]
350
351
# Derived class with handlers for errors we can handle (perhaps)
# Redirects (301, 302) are followed, 401 replies using the basic
# scheme are retried with a user name and password, and every other
# error reply is returned to the caller as an ordinary document.
class FancyURLopener(URLopener):

	def __init__(self, *args):
		apply(URLopener.__init__, (self,) + args)
		# Maps 'realm@host' to the (user, passwd) pair entered
		# for it.
		self.auth_cache = {}

	# Default error handling -- don't raise an exception
	def http_error_default(self, url, fp, errcode, errmsg, headers):
		return addinfourl(fp, headers, self.openedurl)

	# Error 302 -- relocated (temporarily)
	# Opens the URL from the Location (or URI) header.  Returns None
	# when neither header is present, so that http_error() falls back
	# to http_error_default().
	def http_error_302(self, url, fp, errcode, errmsg, headers):
		# XXX The server can force infinite recursion here!
		if headers.has_key('location'):
			newurl = headers['location']
		elif headers.has_key('uri'):
			newurl = headers['uri']
		else:
			return
		fp.read()	# read and discard the rest of the reply
		fp.close()
		return self.open(newurl)

	# Error 301 -- also relocated (permanently)
	http_error_301 = http_error_302

	# Error 401 -- authentication required
	# See this URL for a description of the basic authentication scheme:
	# http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt
	# Returns None (leaving the reply to http_error_default()) unless
	# the server asks for the basic scheme.
	def http_error_401(self, url, fp, errcode, errmsg, headers):
		if headers.has_key('www-authenticate'):
			stuff = headers['www-authenticate']
			import re
			match = re.match(
			    '[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
			if match:
				# groups() gives the two captured fields;
				# group() would give the whole matched
				# string and could not be unpacked.
				scheme, realm = match.groups()
				if string.lower(scheme) == 'basic':
					return self.retry_http_basic_auth(
						url, realm)

	# Repeat the request with 'user:passwd@' put in front of the host.
	# Returns None when no user name or password is available.
	def retry_http_basic_auth(self, url, realm):
		host, selector = splithost(url)
		# i is nonzero when the failed request already carried a
		# user:passwd part; that makes get_user_passwd() drop the
		# cached entry and ask again.
		i = string.find(host, '@') + 1
		host = host[i:]
		user, passwd = self.get_user_passwd(host, realm, i)
		if not (user or passwd): return None
		host = user + ':' + passwd + '@' + host
		newurl = '//' + host + selector
		return self.open_http(newurl)

	# Return (user, passwd) for the realm on the host, from the cache
	# when possible and otherwise from prompt_user_passwd().  A true
	# clear_cache discards the cached entry first.
	def get_user_passwd(self, host, realm, clear_cache = 0):
		key = realm + '@' + string.lower(host)
		if self.auth_cache.has_key(key):
			if clear_cache:
				del self.auth_cache[key]
			else:
				return self.auth_cache[key]
		user, passwd = self.prompt_user_passwd(host, realm)
		if user or passwd: self.auth_cache[key] = (user, passwd)
		return user, passwd

	# Ask for user name and password on the terminal, with echo
	# switched off for the password.  Returns (None, None) when the
	# user interrupts the prompt.
	def prompt_user_passwd(self, host, realm):
		# Override this in a GUI environment!
		try:
			user = raw_input("Enter username for %s at %s: " %
					 (realm, host))
			self.echo_off()
			try:
				passwd = raw_input(
				  "Enter password for %s in %s at %s: " %
				  (user, realm, host))
			finally:
				self.echo_on()
			return user, passwd
		except KeyboardInterrupt:
			return None, None

	def echo_off(self):
		import os
		os.system("stty -echo")

	def echo_on(self):
		import os
		print
		os.system("stty echo")
439
440
441# Utility functions
442
# Return the IP address of the magic hostname 'localhost'.
# The lookup is done on the first call and the answer is reused.
_localhost = None
def localhost():
	global _localhost
	if _localhost:
		return _localhost
	_localhost = socket.gethostbyname('localhost')
	return _localhost
450
# Return the IP address of the current host.
# The lookup is done on the first call and the answer is reused.
_thishost = None
def thishost():
	global _thishost
	if _thishost:
		return _thishost
	name = socket.gethostname()
	_thishost = socket.gethostbyname(name)
	return _thishost
458
# Return the set of errors raised by the FTP class.
# ftplib is only imported on the first call.
_ftperrors = None
def ftperrors():
	global _ftperrors
	if _ftperrors:
		return _ftperrors
	import ftplib
	_ftperrors = ftplib.all_errors
	return _ftperrors
467
468# Return an empty mimetools.Message object
469_noheaders = None
470def noheaders():
471	global _noheaders
472	if not _noheaders:
473		import mimetools
474		import StringIO
475		_noheaders = mimetools.Message(StringIO.StringIO(), 0)
476		_noheaders.fp.close()	# Recycle file descriptor
477	return _noheaders
478
479
480# Utility classes
481
482# Class used by open_ftp() for cache of open FTP connections
483class ftpwrapper:
484	def __init__(self, user, passwd, host, port, dirs):
485		self.user = unquote(user or '')
486		self.passwd = unquote(passwd or '')
487		self.host = host
488		self.port = port
489		self.dirs = []
490		for dir in dirs:
491			self.dirs.append(unquote(dir))
492		self.init()
493	def init(self):
494		import ftplib
495		self.ftp = ftplib.FTP()
496		self.ftp.connect(self.host, self.port)
497		self.ftp.login(self.user, self.passwd)
498		for dir in self.dirs:
499			self.ftp.cwd(dir)
500	def retrfile(self, file, type):
501		import ftplib
502		if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
503		else: cmd = 'TYPE ' + type; isdir = 0
504		try:
505			self.ftp.voidcmd(cmd)
506		except ftplib.all_errors:
507			self.init()
508			self.ftp.voidcmd(cmd)
509		conn = None
510		if file and not isdir:
511			try:
512				cmd = 'RETR ' + file
513				conn = self.ftp.transfercmd(cmd)
514			except ftplib.error_perm, reason:
515				if reason[:3] != '550':
516					raise IOError, ('ftp error', reason), \
517					      sys.exc_info()[2]
518		if not conn:
519			# Try a directory listing
520			if file: cmd = 'LIST ' + file
521			else: cmd = 'LIST'
522			conn = self.ftp.transfercmd(cmd)
523		return addclosehook(conn.makefile('rb'), self.endtransfer)
524	def endtransfer(self):
525		try:
526			self.ftp.voidresp()
527		except ftperrors():
528			pass
529	def close(self):
530		try:
531			self.ftp.close()
532		except ftperrors():
533			pass
534
# Base class for addinfo and addclosehook
# Wraps an open file and exposes its read(), readline(), readlines()
# and fileno() methods directly as attributes of the wrapper.
class addbase:
	def __init__(self, fp):
		self.fp = fp
		self.read = fp.read
		self.readline = fp.readline
		self.readlines = fp.readlines
		self.fileno = fp.fileno
	def __repr__(self):
		name = self.__class__.__name__
		return '<%s at %s whose fp = %s>' % (
			  name, repr(id(self)), repr(self.fp))
	# Drop the borrowed methods, then close and forget the file.
	def close(self):
		self.read = self.readline = None
		self.readlines = self.fileno = None
		if self.fp: self.fp.close()
		self.fp = None
553
# Class to add a close hook to an open file
# closehook(*hookargs) is called once, on the first close(), before
# the underlying file is closed.
class addclosehook(addbase):
	def __init__(self, fp, closehook, *hookargs):
		addbase.__init__(self, fp)
		self.closehook = closehook
		self.hookargs = hookargs
	def close(self):
		hook = self.closehook
		if hook:
			apply(hook, self.hookargs)
			self.closehook = None
			self.hookargs = None
		addbase.close(self)
566
# Class to add an info() method to an open file
# info() returns the headers object given to the constructor.
class addinfo(addbase):
	def __init__(self, fp, headers):
		addbase.__init__(self, fp)
		self.headers = headers

	def info(self):
		headers = self.headers
		return headers
574
# Class to add info() and geturl() methods to an open file
# info() returns the headers object and geturl() the URL that were
# given to the constructor.
class addinfourl(addbase):
	def __init__(self, fp, headers, url):
		addbase.__init__(self, fp)
		self.headers, self.url = headers, url

	def info(self):
		headers = self.headers
		return headers

	def geturl(self):
		url = self.url
		return url
585
586
587# Utility to combine a URL with a base URL to form a new URL
588
def basejoin(base, url):
	# Combine url with base and return the resulting URL string.
	# - A url that has its own type is returned unchanged.
	# - A url that names a host only inherits the type of base.
	# - Otherwise type and host both come from base.  A url path
	#   that starts with '/' is used as is; any other path is joined
	#   to the path of base, whose tag and query are discarded.
	type, path = splittype(url)
	if type:
		# if url is complete (i.e., it contains a type), return it
		return url
	host, path = splithost(path)
	type, basepath = splittype(base) # inherit type from base
	if host:
		# if url contains host, just inherit type
		if type: return type + '://' + host + path
		else:
			# no type inherited, so url must have started with //
			# just return it
			return url
	host, basepath = splithost(basepath) # inherit host
	basepath, basetag = splittag(basepath) # remove extraneous cruft
	basepath, basequery = splitquery(basepath) # idem
	if path[:1] != '/':
		# non-absolute path name
		if path[:1] in ('#', '?'):
			# path is just a tag or query, attach to basepath
			i = len(basepath)
		else:
			# else replace last component
			i = string.rfind(basepath, '/')
		if i < 0:
			# basepath not absolute
			if host:
				# host present, make absolute
				basepath = '/'
			else:
				# else keep non-absolute
				basepath = ''
		else:
			# remove last file component
			basepath = basepath[:i+1]
		# Interpret ../ (important because of symlinks)
		# Each leading '../' of path removes one trailing
		# directory from basepath; the loop ends when basepath is
		# empty or has been cut back to '/'.
		while basepath and path[:3] == '../':
			path = path[3:]
			i = string.rfind(basepath[:-1], '/')
			if i > 0:
				basepath = basepath[:i+1]
			elif i == 0:
				basepath = '/'
				break
			else:
				basepath = ''

		path = basepath + path
	# Put the result together from whichever parts are present.
	if type and host: return type + '://' + host + path
	elif type: return type + ':' + path
	elif host: return '//' + host + path # don't know what this means
	else: return path
642
643
644# Utilities to parse URLs (most of these return None for missing parts):
645# unwrap('<URL:type://host/path>') --> 'type://host/path'
646# splittype('type:opaquestring') --> 'type', 'opaquestring'
647# splithost('//host[:port]/path') --> 'host[:port]', '/path'
648# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
649# splitpasswd('user:passwd') -> 'user', 'passwd'
650# splitport('host:port') --> 'host', 'port'
651# splitquery('/path?query') --> '/path', 'query'
652# splittag('/path#tag') --> '/path', 'tag'
653# splitattr('/path;attr1=value1;attr2=value2;...') ->
654#   '/path', ['attr1=value1', 'attr2=value2', ...]
655# splitvalue('attr=value') --> 'attr', 'value'
656# splitgophertype('/Xselector') --> 'X', 'selector'
657# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
659
660def unwrap(url):
661	url = string.strip(url)
662	if url[:1] == '<' and url[-1:] == '>':
663		url = string.strip(url[1:-1])
664	if url[:4] == 'URL:': url = string.strip(url[4:])
665	return url
666
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# The type is whatever precedes the first ':' provided it contains no
# '/'.  Returns (None, url) when there is no such prefix.
# The pattern is compiled on first use and kept in _typeprog.
_typeprog = None
def splittype(url):
	global _typeprog
	if _typeprog is None:
		import re
		_typeprog = re.compile('^([^/:]+):')

	match = _typeprog.match(url)
	if match:
		scheme = match.group(1)
		return scheme, url[len(scheme) + 1:]
	return None, url
679
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# Returns (None, url) when url does not start with '//' followed by
# at least one character other than '/'.
# The pattern is compiled on first use and kept in _hostprog.
_hostprog = None
def splithost(url):
	global _hostprog
	if _hostprog is None:
		import re
		_hostprog = re.compile('^//([^/]+)(.*)$')

	match = _hostprog.match(url)
	if match: return match.group(1, 2)
	return None, url
690
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# The split is made at the first '@'.  Returns (None, host) when
# there is none.
# The pattern is compiled on first use and kept in _userprog.
_userprog = None
def splituser(host):
	global _userprog
	if _userprog is None:
		import re
		_userprog = re.compile('^([^@]*)@(.*)$')

	match = _userprog.match(host)
	if match: return match.group(1, 2)
	return None, host
701
# splitpasswd('user:passwd') -> 'user', 'passwd'
# The split is made at the first ':'.  Returns (user, None) when
# there is none.
# The pattern is compiled on first use and kept in _passwdprog.
_passwdprog = None
def splitpasswd(user):
	global _passwdprog
	if _passwdprog is None:
		import re
		_passwdprog = re.compile('^([^:]*):(.*)$')

	# Match the argument itself (the name 'host' does not exist here).
	match = _passwdprog.match(user)
	if match: return match.group(1, 2)
	return user, None
712
# splitport('host:port') --> 'host', 'port'
# The port is only split off when it consists of digits; it is
# returned as a string.  Returns (host, None) otherwise.
# The pattern is compiled on first use and kept in _portprog.
_portprog = None
def splitport(host):
	global _portprog
	if _portprog is None:
		import re
		_portprog = re.compile('^(.*):([0-9]+)$')

	match = _portprog.match(host)
	if match: return match.group(1, 2)
	return host, None
723
# Split host and port, returning numeric port.
# Return given default port if no ':' found; defaults to -1.
# Return numerical port if a valid number is found after ':'.
# Return None if ':' but not a valid number.
# The pattern is compiled on first use and kept in _nportprog.
_nportprog = None
def splitnport(host, defport=-1):
	global _nportprog
	if _nportprog is None:
		import re
		_nportprog = re.compile('^(.*):(.*)$')

	match = _nportprog.match(host)
	if not match:
		return host, defport
	host, port = match.group(1, 2)
	nport = None
	if port:
		# int() on a string raises ValueError for anything that
		# is not a decimal number.
		try:
			nport = int(port)
		except ValueError:
			nport = None
	return host, nport
745
# splitquery('/path?query') --> '/path', 'query'
# The split is made at the last '?'.  Returns (url, None) when there
# is none.
# The pattern is compiled on first use and kept in _queryprog.
_queryprog = None
def splitquery(url):
	global _queryprog
	if _queryprog is None:
		import re
		_queryprog = re.compile(r'^(.*)\?([^?]*)$')

	match = _queryprog.match(url)
	if match: return match.group(1, 2)
	return url, None
756
# splittag('/path#tag') --> '/path', 'tag'
# The split is made at the last '#'.  Returns (url, None) when there
# is none.
# The pattern is compiled on first use and kept in _tagprog.
_tagprog = None
def splittag(url):
	global _tagprog
	if _tagprog is None:
		import re
		_tagprog = re.compile('^(.*)#([^#]*)$')

	match = _tagprog.match(url)
	if match: return match.group(1, 2)
	return url, None
767
768def splitattr(url):
769	words = string.splitfields(url, ';')
770	return words[0], words[1:]
771
# splitvalue('attr=value') --> 'attr', 'value'
# The split is made at the first '='.  Returns (attr, None) when
# there is none.
# The pattern is compiled on first use and kept in _valueprog.
_valueprog = None
def splitvalue(attr):
	global _valueprog
	if _valueprog is None:
		import re
		_valueprog = re.compile('^([^=]*)=(.*)$')

	match = _valueprog.match(attr)
	if match: return match.group(1, 2)
	return attr, None
782
# splitgophertype('/Xselector') --> 'X', 'selector'
# Returns (None, selector) unless selector starts with '/' followed
# by at least one more character.
def splitgophertype(selector):
	if selector[:1] != '/' or not selector[1:2]:
		return None, selector
	gtype = selector[1:2]
	return gtype, selector[2:]
787
788_quoteprog = None
789def unquote(s):
790	global _quoteprog
791	if _quoteprog is None:
792	    import re
793	    _quoteprog = re.compile('%[0-9a-fA-F][0-9a-fA-F]')
794
795	i = 0
796	n = len(s)
797	res = []
798	while 0 <= i < n:
799		match = _quoteprog.search(s, i)
800		if not match:
801			res.append(s[i:])
802			break
803		j = match.start(0)
804		res.append(s[i:j] + chr(string.atoi(s[j+1:j+3], 16)))
805		i = j+3
806	return string.joinfields(res, '')
807
# Like unquote(), but every '+' is turned into a blank first.
def unquote_plus(s):
	if '+' not in s:
		return unquote(s)
	# replace '+' with ' '
	spaced = string.join(string.split(s, '+'), ' ')
	return unquote(spaced)
813
814always_safe = string.letters + string.digits + '_,.-'
815def quote(s, safe = '/'):
816	safe = always_safe + safe
817	res = []
818	for c in s:
819		if c in safe:
820			res.append(c)
821		else:
822			res.append('%%%02x' % ord(c))
823	return string.joinfields(res, '')
824
# Like quote(), but blanks are written as '+'.
# A literal '+' in s is escaped (unless the caller lists it in safe),
# so that unquote_plus() gives back the original string.
def quote_plus(s, safe = '/'):
	if ' ' in s:
		# Quote first with the blank protected, then turn the
		# blanks into '+'.  Doing it the other way round would
		# leave a literal '+' looking like an encoded blank.
		quoted = quote(s, safe + ' ')
		return string.join(string.split(quoted, ' '), '+')
	else:
		return quote(s, safe)
832
833
834# Proxy handling
835def getproxies():
836	"""Return a dictionary of protocol scheme -> proxy server URL mappings.
837
838	Scan the environment for variables named <scheme>_proxy;
839	this seems to be the standard convention.  If you need a
840	different way, you can pass a proxies dictionary to the
841	[Fancy]URLopener constructor.
842
843	"""
844	proxies = {}
845	for name, value in os.environ.items():
846		name = string.lower(name)
847		if value and name[-6:] == '_proxy':
848			proxies[name[:-6]] = value
849	return proxies
850
851
852# Test and time quote() and unquote()
853def test1():
854	import time
855	s = ''
856	for i in range(256): s = s + chr(i)
857	s = s*4
858	t0 = time.time()
859	qs = quote(s)
860	uqs = unquote(qs)
861	t1 = time.time()
862	if uqs != s:
863		print 'Wrong!'
864	print `s`
865	print `qs`
866	print `uqs`
867	print round(t1 - t0, 3), 'sec'
868
869
870# Test program
871def test():
872	import sys
873	args = sys.argv[1:]
874	if not args:
875		args = [
876			'/etc/passwd',
877			'file:/etc/passwd',
878			'file://localhost/etc/passwd',
879			'ftp://ftp.python.org/etc/passwd',
880			'gopher://gopher.micro.umn.edu/1/',
881			'http://www.python.org/index.html',
882			]
883	try:
884		for url in args:
885			print '-'*10, url, '-'*10
886			fn, h = urlretrieve(url)
887			print fn, h
888			if h:
889				print '======'
890				for k in h.keys(): print k + ':', h[k]
891				print '======'
892			fp = open(fn, 'rb')
893			data = fp.read()
894			del fp
895			if '\r' in data:
896			    table = string.maketrans("", "")
897			    data = string.translate(data, table, "\r")
898			print data
899			fn, h = None, None
900		print '-'*40
901	finally:
902		urlcleanup()
903
# Run test program when run as a script: first the quote()/unquote()
# timing test, then the URL retrieval test.
if __name__ == '__main__':
	test1()
	test()
908