1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9and L.Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any future changes to
23urlparse module should conform with it.  The urlparse module is
24currently not entirely compliant with this RFC due to de facto
25scenarios for parsing, and for backward compatibility purposes, some
26parsing quirks from older RFCs are retained. The test cases in
27test_urlparse.py provide a good indicator of parsing behavior.
28
29"""
30
31import re
32
33# Public API exported by "from urlparse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

36# A classification of schemes ('' means apply by default)
37# Schemes that support relative references (consulted by urljoin).
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes whose URLs carry a //netloc authority component.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
# Schemes that may carry ;parameters in the last path segment
# (consulted by urlparse when splitting params off the path).
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

49# These are not actually used anymore, but should stay for backwards
50# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

59# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache; the size cap bounds
# memory use (the whole cache is discarded once the cap is reached).
MAX_CACHE_SIZE = 20
_parse_cache = {}
67
68def clear_cache():
    """Clear the parse cache."""
    # Empties the shared module-level dict in place (urlsplit's memo).
    _parse_cache.clear()
71
72
73class ResultMixin(object):
74    """Shared methods for the parsed result objects."""
75
76    @property
77    def username(self):
78        netloc = self.netloc
79        if "@" in netloc:
80            userinfo = netloc.rsplit("@", 1)[0]
81            if ":" in userinfo:
82                userinfo = userinfo.split(":", 1)[0]
83            return userinfo
84        return None
85
86    @property
87    def password(self):
88        netloc = self.netloc
89        if "@" in netloc:
90            userinfo = netloc.rsplit("@", 1)[0]
91            if ":" in userinfo:
92                return userinfo.split(":", 1)[1]
93        return None
94
95    @property
96    def hostname(self):
97        netloc = self.netloc.split('@')[-1]
98        if '[' in netloc and ']' in netloc:
99            return netloc.split(']')[0][1:].lower()
100        elif ':' in netloc:
101            return netloc.split(':')[0].lower()
102        elif netloc == '':
103            return None
104        else:
105            return netloc.lower()
106
107    @property
108    def port(self):
109        netloc = self.netloc.split('@')[-1].split(']')[-1]
110        if ':' in netloc:
111            port = netloc.split(':')[1]
112            port = int(port, 10)
113            # verify legal port
114            if (0 <= port <= 65535):
115                return port
116        return None
117
118from collections import namedtuple
119
120class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-tuple result of urlsplit(): (scheme, netloc, path, query, fragment).

    ResultMixin supplies the derived username/password/hostname/port
    properties on top of the raw fields.
    """

    __slots__ = ()

    def geturl(self):
        # Reassemble an equivalent URL string from the 5 components.
        return urlunsplit(self)
126
127
128class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-tuple result of urlparse(): adds 'params' to SplitResult's fields.

    ResultMixin supplies the derived username/password/hostname/port
    properties on top of the raw fields.
    """

    __slots__ = ()

    def geturl(self):
        # Reassemble an equivalent URL string from the 6 components.
        return urlunparse(self)
134
135
136def urlparse(url, scheme='', allow_fragments=True):
137    """Parse a URL into 6 components:
138    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
139    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
140    Note that we don't break the components up in smaller bits
141    (e.g. netloc is a single string) and we don't expand % escapes."""
142    tuple = urlsplit(url, scheme, allow_fragments)
143    scheme, netloc, url, query, fragment = tuple
144    if scheme in uses_params and ';' in url:
145        url, params = _splitparams(url)
146    else:
147        params = ''
148    return ParseResult(scheme, netloc, url, params, query, fragment)
149
150def _splitparams(url):
151    if '/'  in url:
152        i = url.find(';', url.rfind('/'))
153        if i < 0:
154            return url, ''
155    else:
156        i = url.find(';')
157    return url[:i], url[i+1:]
158
159def _splitnetloc(url, start=0):
160    delim = len(url)   # position of end of domain part of url, default is end
161    for c in '/?#':    # look for delimiters; the order is NOT important
162        wdelim = url.find(c, start)        # find first of this delim
163        if wdelim >= 0:                    # if found
164            delim = min(delim, wdelim)     # use earliest delim position
165    return url[start:delim], url[delim:]   # return (domain, rest)
166
167def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Normalize so equivalent truthy values share one cache entry.
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the cache key: str and unicode
    # inputs must not share results, since the result echoes the input
    # type.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # An unmatched bracket cannot be a valid RFC 2732
                # bracketed IPv6 literal.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # Generic scheme: accept it only if every character before the
        # ':' is a valid scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    # General (non-http-fast-path) handling of netloc/fragment/query.
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
221
222def urlunparse(data):
223    """Put a parsed URL back together again.  This may result in a
224    slightly different, but equivalent URL, if the URL that was parsed
225    originally had redundant delimiters, e.g. a ? with an empty query
226    (the draft states that these are equivalent)."""
227    scheme, netloc, url, params, query, fragment = data
228    if params:
229        url = "%s;%s" % (url, params)
230    return urlunsplit((scheme, netloc, url, query, fragment))
231
232def urlunsplit(data):
233    """Combine the elements of a tuple as returned by urlsplit() into a
234    complete URL as a string. The data argument can be any five-item iterable.
235    This may result in a slightly different, but equivalent URL, if the URL that
236    was parsed originally had unnecessary delimiters (for example, a ? with an
237    empty query; the RFC states that these are equivalent)."""
238    scheme, netloc, url, query, fragment = data
239    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
240        if url and url[:1] != '/': url = '/' + url
241        url = '//' + (netloc or '') + url
242    if scheme:
243        url = scheme + ':' + url
244    if query:
245        url = url + '?' + query
246    if fragment:
247        url = url + '#' + fragment
248    return url
249
250def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # Different scheme, or a scheme that can't be relative: url stands
    # alone.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        # No netloc in url: inherit the base's.
        netloc = bnetloc
    # Absolute path: keep everything from url except the inherited
    # netloc.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Empty path: inherit base's path/params (and query if url has
    # none).
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: graft onto the base's directory, then resolve the
    # '.' and '..' segments below.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the leftmost resolvable "name/.." pair until
    # a full pass finds none.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A trailing '..' that could not be collapsed leaves a trailing
    # slash instead.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
301
302def urldefrag(url):
303    """Removes any existing fragment from URL.
304
305    Returns a tuple of the defragmented URL and the fragment.  If
306    the URL contained no fragments, the second element is the
307    empty string.
308    """
309    if '#' in url:
310        s, n, p, a, q, frag = urlparse(url)
311        defrag = urlunparse((s, n, p, a, q, ''))
312        return defrag, frag
313    else:
314        return url, ''
315
316# Probe once at import time for the Python-2-only 'unicode' builtin so
# unquote() can cheaply test for unicode input.
try:
    unicode
except NameError:
    # No 'unicode' builtin (Python 3): nothing can be "unicode" here.
    def _is_unicode(x):
        return 0
else:
    # Python 2: genuine isinstance check against the unicode type.
    def _is_unicode(x):
        return isinstance(x, unicode)
324
325# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.

_hexdig = '0123456789ABCDEFabcdef'
# Precomputed map from every two-character hex pair (any case mix) to
# its character, e.g. '2F' -> '/'; lets unquote() decode by dict lookup.
_hextochr = dict((a+b, chr(int(a+b,16)))
                 for a in _hexdig for b in _hexdig)
# Matches maximal runs of ASCII characters -- the only places a %xx
# escape can occur in a unicode string.
_asciire = re.compile('([\x00-\x7f]+)')
334
335def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    if _is_unicode(s):
        # Python 2 unicode input: split into alternating non-ASCII /
        # ASCII runs; only the ASCII runs can contain %xx escapes.
        # Each ASCII run is unquoted as a byte string, then re-decoded
        # via latin-1 (which maps bytes 0-255 one-to-one onto the
        # first 256 code points).
        if '%' not in s:
            return s
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        # bits alternates: non-ASCII at even indices, ASCII at odd.
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i])).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            # The two chars after '%' should form a hex pair.
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid escape: keep the '%' literally.
            append('%')
            append(item)
    return ''.join(res)
362
363def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
364    """Parse a query given as a string argument.
365
366        Arguments:
367
368        qs: percent-encoded query string to be parsed
369
370        keep_blank_values: flag indicating whether blank values in
371            percent-encoded queries should be treated as blank strings.
372            A true value indicates that blanks should be retained as
373            blank strings.  The default false value indicates that
374            blank values are to be ignored and treated as if they were
375            not included.
376
377        strict_parsing: flag indicating what to do with parsing errors.
378            If false (the default), errors are silently ignored.
379            If true, errors raise a ValueError exception.
380    """
381    dict = {}
382    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
383        if name in dict:
384            dict[name].append(value)
385        else:
386            dict[name] = [value]
387    return dict
388
389def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
390    """Parse a query given as a string argument.
391
392    Arguments:
393
394    qs: percent-encoded query string to be parsed
395
396    keep_blank_values: flag indicating whether blank values in
397        percent-encoded queries should be treated as blank strings.  A
398        true value indicates that blanks should be retained as blank
399        strings.  The default false value indicates that blank values
400        are to be ignored and treated as if they were  not included.
401
402    strict_parsing: flag indicating what to do with parsing errors. If
403        false (the default), errors are silently ignored. If true,
404        errors raise a ValueError exception.
405
406    Returns a list, as G-d intended.
407    """
408    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
409    r = []
410    for name_value in pairs:
411        if not name_value and not strict_parsing:
412            continue
413        nv = name_value.split('=', 1)
414        if len(nv) != 2:
415            if strict_parsing:
416                raise ValueError, "bad query field: %r" % (name_value,)
417            # Handle case of a control-name with no equal sign
418            if keep_blank_values:
419                nv.append('')
420            else:
421                continue
422        if len(nv[1]) or keep_blank_values:
423            name = unquote(nv[0].replace('+', ' '))
424            value = unquote(nv[1].replace('+', ' '))
425            r.append((name, value))
426
427    return r
428