1"""Parse (absolute and relative) URLs. 2 3urlparse module is based upon the following RFC specifications. 4 5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 6and L. Masinter, January 2005. 7 8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter 9and L.Masinter, December 1999. 10 11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 12Berners-Lee, R. Fielding, and L. Masinter, August 1998. 13 14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998. 15 16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 171995. 18 19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 20McCahill, December 1994 21 22RFC 3986 is considered the current standard and any future changes to 23urlparse module should conform with it. The urlparse module is 24currently not entirely compliant with this RFC due to defacto 25scenarios for parsing, and for backward compatibility purposes, some 26parsing quirks from older RFCs are retained. The testcases in 27test_urlparse.py provides a good indicator of parsing behavior. 28 29""" 30 31import re 32 33__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 34 "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"] 35 36# A classification of schemes ('' means apply by default) 37uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', 38 'wais', 'file', 'https', 'shttp', 'mms', 39 'prospero', 'rtsp', 'rtspu', '', 'sftp', 40 'svn', 'svn+ssh'] 41uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 42 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 43 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', 44 'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh'] 45uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', 46 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', 47 'mms', '', 'sftp', 'tel'] 48 49# These are not actually used anymore, but should stay for backwards 50# compatibility. 
(They are undocumented, but have a public-looking name.) 51non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 52 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] 53uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', 54 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] 55uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 56 'nntp', 'wais', 'https', 'shttp', 'snews', 57 'file', 'prospero', ''] 58 59# Characters valid in scheme names 60scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 61 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 62 '0123456789' 63 '+-.') 64 65MAX_CACHE_SIZE = 20 66_parse_cache = {} 67 68def clear_cache(): 69 """Clear the parse cache.""" 70 _parse_cache.clear() 71 72 73class ResultMixin(object): 74 """Shared methods for the parsed result objects.""" 75 76 @property 77 def username(self): 78 netloc = self.netloc 79 if "@" in netloc: 80 userinfo = netloc.rsplit("@", 1)[0] 81 if ":" in userinfo: 82 userinfo = userinfo.split(":", 1)[0] 83 return userinfo 84 return None 85 86 @property 87 def password(self): 88 netloc = self.netloc 89 if "@" in netloc: 90 userinfo = netloc.rsplit("@", 1)[0] 91 if ":" in userinfo: 92 return userinfo.split(":", 1)[1] 93 return None 94 95 @property 96 def hostname(self): 97 netloc = self.netloc.split('@')[-1] 98 if '[' in netloc and ']' in netloc: 99 return netloc.split(']')[0][1:].lower() 100 elif ':' in netloc: 101 return netloc.split(':')[0].lower() 102 elif netloc == '': 103 return None 104 else: 105 return netloc.lower() 106 107 @property 108 def port(self): 109 netloc = self.netloc.split('@')[-1].split(']')[-1] 110 if ':' in netloc: 111 port = netloc.split(':')[1] 112 port = int(port, 10) 113 # verify legal port 114 if (0 <= port <= 65535): 115 return port 116 return None 117 118from collections import namedtuple 119 120class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin): 121 122 __slots__ = () 123 124 def geturl(self): 125 return urlunsplit(self) 126 127 128class 
ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin): 129 130 __slots__ = () 131 132 def geturl(self): 133 return urlunparse(self) 134 135 136def urlparse(url, scheme='', allow_fragments=True): 137 """Parse a URL into 6 components: 138 <scheme>://<netloc>/<path>;<params>?<query>#<fragment> 139 Return a 6-tuple: (scheme, netloc, path, params, query, fragment). 140 Note that we don't break the components up in smaller bits 141 (e.g. netloc is a single string) and we don't expand % escapes.""" 142 tuple = urlsplit(url, scheme, allow_fragments) 143 scheme, netloc, url, query, fragment = tuple 144 if scheme in uses_params and ';' in url: 145 url, params = _splitparams(url) 146 else: 147 params = '' 148 return ParseResult(scheme, netloc, url, params, query, fragment) 149 150def _splitparams(url): 151 if '/' in url: 152 i = url.find(';', url.rfind('/')) 153 if i < 0: 154 return url, '' 155 else: 156 i = url.find(';') 157 return url[:i], url[i+1:] 158 159def _splitnetloc(url, start=0): 160 delim = len(url) # position of end of domain part of url, default is end 161 for c in '/?#': # look for delimiters; the order is NOT important 162 wdelim = url.find(c, start) # find first of this delim 163 if wdelim >= 0: # if found 164 delim = min(delim, wdelim) # use earliest delim position 165 return url[start:delim], url[delim:] # return (domain, rest) 166 167def urlsplit(url, scheme='', allow_fragments=True): 168 """Parse a URL into 5 components: 169 <scheme>://<netloc>/<path>?<query>#<fragment> 170 Return a 5-tuple: (scheme, netloc, path, query, fragment). 171 Note that we don't break the components up in smaller bits 172 (e.g. 
netloc is a single string) and we don't expand % escapes.""" 173 allow_fragments = bool(allow_fragments) 174 key = url, scheme, allow_fragments, type(url), type(scheme) 175 cached = _parse_cache.get(key, None) 176 if cached: 177 return cached 178 if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth 179 clear_cache() 180 netloc = query = fragment = '' 181 i = url.find(':') 182 if i > 0: 183 if url[:i] == 'http': # optimize the common case 184 scheme = url[:i].lower() 185 url = url[i+1:] 186 if url[:2] == '//': 187 netloc, url = _splitnetloc(url, 2) 188 if (('[' in netloc and ']' not in netloc) or 189 (']' in netloc and '[' not in netloc)): 190 raise ValueError("Invalid IPv6 URL") 191 if allow_fragments and '#' in url: 192 url, fragment = url.split('#', 1) 193 if '?' in url: 194 url, query = url.split('?', 1) 195 v = SplitResult(scheme, netloc, url, query, fragment) 196 _parse_cache[key] = v 197 return v 198 for c in url[:i]: 199 if c not in scheme_chars: 200 break 201 else: 202 # make sure "url" is not actually a port number (in which case 203 # "scheme" is really part of the path) 204 rest = url[i+1:] 205 if not rest or any(c not in '0123456789' for c in rest): 206 # not a port number 207 scheme, url = url[:i].lower(), rest 208 209 if url[:2] == '//': 210 netloc, url = _splitnetloc(url, 2) 211 if (('[' in netloc and ']' not in netloc) or 212 (']' in netloc and '[' not in netloc)): 213 raise ValueError("Invalid IPv6 URL") 214 if allow_fragments and '#' in url: 215 url, fragment = url.split('#', 1) 216 if '?' in url: 217 url, query = url.split('?', 1) 218 v = SplitResult(scheme, netloc, url, query, fragment) 219 _parse_cache[key] = v 220 return v 221 222def urlunparse(data): 223 """Put a parsed URL back together again. This may result in a 224 slightly different, but equivalent URL, if the URL that was parsed 225 originally had redundant delimiters, e.g. a ? 
with an empty query 226 (the draft states that these are equivalent).""" 227 scheme, netloc, url, params, query, fragment = data 228 if params: 229 url = "%s;%s" % (url, params) 230 return urlunsplit((scheme, netloc, url, query, fragment)) 231 232def urlunsplit(data): 233 """Combine the elements of a tuple as returned by urlsplit() into a 234 complete URL as a string. The data argument can be any five-item iterable. 235 This may result in a slightly different, but equivalent URL, if the URL that 236 was parsed originally had unnecessary delimiters (for example, a ? with an 237 empty query; the RFC states that these are equivalent).""" 238 scheme, netloc, url, query, fragment = data 239 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): 240 if url and url[:1] != '/': url = '/' + url 241 url = '//' + (netloc or '') + url 242 if scheme: 243 url = scheme + ':' + url 244 if query: 245 url = url + '?' + query 246 if fragment: 247 url = url + '#' + fragment 248 return url 249 250def urljoin(base, url, allow_fragments=True): 251 """Join a base URL and a possibly relative URL to form an absolute 252 interpretation of the latter.""" 253 if not base: 254 return url 255 if not url: 256 return base 257 bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ 258 urlparse(base, '', allow_fragments) 259 scheme, netloc, path, params, query, fragment = \ 260 urlparse(url, bscheme, allow_fragments) 261 if scheme != bscheme or scheme not in uses_relative: 262 return url 263 if scheme in uses_netloc: 264 if netloc: 265 return urlunparse((scheme, netloc, path, 266 params, query, fragment)) 267 netloc = bnetloc 268 if path[:1] == '/': 269 return urlunparse((scheme, netloc, path, 270 params, query, fragment)) 271 if not path and not params: 272 path = bpath 273 params = bparams 274 if not query: 275 query = bquery 276 return urlunparse((scheme, netloc, path, 277 params, query, fragment)) 278 segments = bpath.split('/')[:-1] + path.split('/') 279 # XXX The stuff below is 
bogus in various ways... 280 if segments[-1] == '.': 281 segments[-1] = '' 282 while '.' in segments: 283 segments.remove('.') 284 while 1: 285 i = 1 286 n = len(segments) - 1 287 while i < n: 288 if (segments[i] == '..' 289 and segments[i-1] not in ('', '..')): 290 del segments[i-1:i+1] 291 break 292 i = i+1 293 else: 294 break 295 if segments == ['', '..']: 296 segments[-1] = '' 297 elif len(segments) >= 2 and segments[-1] == '..': 298 segments[-2:] = [''] 299 return urlunparse((scheme, netloc, '/'.join(segments), 300 params, query, fragment)) 301 302def urldefrag(url): 303 """Removes any existing fragment from URL. 304 305 Returns a tuple of the defragmented URL and the fragment. If 306 the URL contained no fragments, the second element is the 307 empty string. 308 """ 309 if '#' in url: 310 s, n, p, a, q, frag = urlparse(url) 311 defrag = urlunparse((s, n, p, a, q, '')) 312 return defrag, frag 313 else: 314 return url, '' 315 316try: 317 unicode 318except NameError: 319 def _is_unicode(x): 320 return 0 321else: 322 def _is_unicode(x): 323 return isinstance(x, unicode) 324 325# unquote method for parse_qs and parse_qsl 326# Cannot use directly from urllib as it would create a circular reference 327# because urllib uses urlparse methods (urljoin). If you update this function, 328# update it also in urllib. This code duplication does not existin in Python3. 

_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-hex-digit string (any letter case) to its character.
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)
# Captures runs of ASCII characters; used to split unicode input.
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left unchanged.
    """
    if _is_unicode(s):
        if '%' not in s:
            return s
        # Because of the capturing group, odd indexes hold the ASCII
        # runs and even indexes hold the non-ASCII text between them.
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i])).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            append('%')
            append(item)
    return ''.join(res)


def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dictionary mapping each name to the list of its values,
        in order of appearance.
    """
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in result:
            result[name].append(value)
        else:
            result[name] = [value]
    return result


def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space; replace it before percent-decoding.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r