1# markdown is released under the BSD license
2# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4# Copyright 2004 Manfred Stienstra (the original version)
5#
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions are met:
10#
11# *   Redistributions of source code must retain the above copyright
12#     notice, this list of conditions and the following disclaimer.
13# *   Redistributions in binary form must reproduce the above copyright
14#     notice, this list of conditions and the following disclaimer in the
15#     documentation and/or other materials provided with the distribution.
16# *   Neither the name of the <organization> nor the
17#     names of its contributors may be used to endorse or promote products
18#     derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30# POSSIBILITY OF SUCH DAMAGE.
31
32
33"""
34INLINE PATTERNS
35=============================================================================
36
Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern.  Pattern objects must be instances of classes
that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:
41
42    pattern.getCompiledRegExp() # returns a regular expression
43
44    pattern.handleMatch(m) # takes a match object and returns
45                           # an ElementTree element or just plain text
46
47All of python markdown's built-in patterns subclass from Pattern,
48but you can add additional patterns that don't.
49
Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*?)$".
54
55Finally, the order in which regular expressions are applied is very
56important - e.g. if we first replace http://.../ links with <a> tags
57and _then_ try to replace inline html, we would end up with a mess.
58So, we apply the expressions in the following order:
59
60* escape and backticks have to go before everything else, so
61  that we can preempt any markdown patterns by escaping them.
62
63* then we handle auto-links (must be done before inline html)
64
65* then we handle inline HTML.  At this point we will simply
66  replace all inline HTML strings with a placeholder and add
67  the actual HTML to a hash.
68
69* then inline images (must be done before links)
70
71* then bracketed links, first regular then reference-style
72
73* finally we apply strong and emphasis
74"""
75
76from __future__ import absolute_import
77from __future__ import unicode_literals
78from . import util
79from . import odict
80import re
81try:
82    from urllib.parse import urlparse, urlunparse
83except ImportError:
84    from urlparse import urlparse, urlunparse
85try:
86    from html import entities
87except ImportError:
88    import htmlentitydefs as entities
89
90
def build_inlinepatterns(md_instance, **kwargs):
    """
    Assemble the default inline patterns for a Markdown instance.

    Keyword arguments:

    * md_instance: the Markdown object handed to the patterns that need
      it.  Its `safeMode` and `smart_emphasis` attributes are read here.
    * kwargs: accepted but not used by this function.

    Returns an `odict.OrderedDict` mapping pattern names to pattern
    objects, filled in the order listed below.

    """
    entries = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md_instance)),
        ("reference", ReferencePattern(REFERENCE_RE, md_instance)),
        ("link", LinkPattern(LINK_RE, md_instance)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md_instance)),
        ("image_reference",
         ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md_instance)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md_instance)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md_instance)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ]

    # The raw-html pattern is left out when safe mode is 'escape'.
    if md_instance.safeMode != 'escape':
        entries.append(("html", HtmlPattern(HTML_RE, md_instance)))

    entries.extend([
        ("entity", HtmlPattern(ENTITY_RE, md_instance)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'strong,em')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ])

    # Underscore emphasis comes in a "smart" and a plain flavour.
    if md_instance.smart_emphasis:
        underscore_re = SMART_EMPHASIS_RE
    else:
        underscore_re = EMPHASIS_2_RE
    entries.append(("emphasis2", SimpleTagPattern(underscore_re, 'em')))

    inlinePatterns = odict.OrderedDict()
    for name, pattern in entries:
        inlinePatterns[name] = pattern
    return inlinePatterns
118
119"""
120The actual regular expressions for patterns
121-----------------------------------------------------------------------------
122"""
123
124NOBRACKET = r'[^\]\[]*'
125BRK = ( r'\[('
126        + (NOBRACKET + r'(\[')*6
127        + (NOBRACKET+ r'\])*')*6
128        + NOBRACKET + r')\]' )
129NOIMG = r'(?<!\!)'
130
131BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
132ESCAPE_RE = r'\\(.)'                             # \<
133EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
134STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
135STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
136SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
137EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_
138LINK_RE = NOIMG + BRK + \
139r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
140# [text](url) or [text](<url>) or [text](url "title")
141
142IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
143# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
144REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]'           # [Google][3]
145SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                   # [Google]
146IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]
147NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
148AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>
149AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me@example.com>
150
151HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
152ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
153LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
154
155
def dequote(string):
    """
    Strip one pair of matching quotes from the ends of `string`.

    If `string` both starts and ends with a double quote, or both starts
    and ends with a single quote, the first and last characters are
    dropped.  Any other string is returned unchanged.

    """
    for quote in ('"', "'"):
        if string.startswith(quote) and string.endswith(quote):
            return string[1:-1]
    return string
163
# Raw string: "\{" and "\}" are regex escapes, not string escapes.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}

def handleAttributes(text, parent):
    """
    Set values of an element based on attribute definitions ({@id=123}).

    Keyword arguments:

    * text: the string to scan for `{@name=value}` definitions.
    * parent: an object with a `set(name, value)` method; it receives
      one call per definition found.  Newlines in a value are replaced
      with spaces before it is set.

    Returns `text` with every attribute definition removed.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Explicitly replace the definition with nothing rather than
        # relying on how re.sub treats a callback that returns None.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
171
172
173"""
174The pattern classes
175-----------------------------------------------------------------------------
176"""
177
class Pattern(object):
    """Common base class for the inline pattern objects."""

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: regular expression source for the inline construct.
          It is compiled as "^(.*?)<pattern>(.*?)$" with DOTALL and
          UNICODE, so the pattern's own groups are numbered from 2.
        * markdown_instance: optional Markdown object.  It is stored as
          `self.markdown` only when it is truthy.

        """
        self.pattern = pattern
        wrapped = "^(.*?)%s(.*?)$" % pattern
        self.compiled_re = re.compile(wrapped, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Give back the regular expression compiled in `__init__`. """
        return self.compiled_re

    def handleMatch(self, m):
        """Build the result for a single match of the pattern.

        The base implementation does nothing and returns None;
        subclasses are expected to override it.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Name of the concrete class, used to identify the pattern. """
        return self.__class__.__name__

    def unescape(self, text):
        """
        Replace inline placeholders in `text` with their plain text.

        The stash is read from the 'inline' treeprocessor of
        `self.markdown`; if that lookup raises KeyError, `text` is
        returned untouched.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def walk_text(node):
            ' Stand-in for Element.itertext on older python versions '
            tag = node.tag
            if tag is not None and not isinstance(tag, util.string_type):
                return
            if node.text:
                yield node.text
            for child in node:
                for piece in walk_text(child):
                    yield piece
                if child.tail:
                    yield child.tail

        def restore(match):
            key = match.group(1)
            if key not in stash:
                # Nothing stashed under this id.
                return None
            stashed = stash.get(key)
            if isinstance(stashed, util.string_type):
                return stashed
            # Otherwise an etree Element - keep only its text content.
            return ''.join(walk_text(stashed))

        return util.INLINE_PLACEHOLDER_RE.sub(restore, text)
247
248
class SimpleTextPattern(Pattern):
    """ Hand back the text captured in group(2) of a Pattern. """
    def handleMatch(self, m):
        captured = m.group(2)
        # A bare placeholder prefix yields None instead of text.
        if captured != util.INLINE_PLACEHOLDER_PREFIX:
            return captured
        return None
256
257
class EscapePattern(Pattern):
    """ Handle a backslash-escaped character. """

    def handleMatch(self, m):
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            # Not an escapable character: give back the backslash too.
            return '\\%s' % escaped
        # Encode the character's code point between util.STX and util.ETX.
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)
267
268
class SimpleTagPattern(Pattern):
    """
    Wrap group(3) of a Pattern in a new element of type `tag`.

    The element is created fresh for every match and its text is set to
    the captured group.

    """
    def __init__ (self, pattern, tag):
        # No markdown instance is passed on to the base class.
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        element = util.etree.Element(self.tag)
        element.text = m.group(3)
        return element
283
284
class SubstituteTagPattern(SimpleTagPattern):
    """ Produce an empty element of type `tag` for every match. """
    def handleMatch (self, m):
        # The match content is ignored; only the configured tag matters.
        substitute = util.etree.Element(self.tag)
        return substitute
289
290
class BacktickPattern(Pattern):
    """ Turn a backtick span into a `<code>` element. """
    def __init__ (self, pattern):
        # No markdown instance is passed on to the base class.
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        code = util.etree.Element(self.tag)
        # group(3) is the text between the backticks; surrounding
        # whitespace is stripped before wrapping it in an AtomicString.
        stripped = m.group(3).strip()
        code.text = util.AtomicString(stripped)
        return code
301
302
class DoubleTagPattern(SimpleTagPattern):
    """Build two nested elements from a comma-separated `tag` pair.

    `self.tag` must look like "outer,inner".  The inner element carries
    group(3) as its text; the outer element is what gets returned.
    Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        return outer
315
316
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch (self, m):
        """
        Stash the matched html and return the placeholder for it.

        group(2) is unescaped first, then stored through
        `self.markdown.htmlStash.store`, whose return value is returned.

        """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """
        Return unescaped text given text with an inline placeholder.

        Unlike the base class, stashed values are passed through
        `self.markdown.serializer`.  If the 'inline' treeprocessor lookup
        raises KeyError, `text` is returned unchanged.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text
        def get_stash(m):
            id = m.group(1)
            value = stash.get(id)
            if value is not None:
                try:
                    return self.markdown.serializer(value)
                except Exception:
                    # Serializing failed: fall back to a backslash
                    # followed by the stashed value.  Only Exception is
                    # caught, so KeyboardInterrupt and SystemExit still
                    # propagate.
                    return '\\%s' % value
            # Nothing usable stashed under this id: drop the placeholder.
            return ''

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
340
341
class LinkPattern(Pattern):
    """ Build an `<a>` element from an inline link match. """
    def handleMatch(self, m):
        link = util.etree.Element("a")
        link.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if not href:
            link.set("href", "")
        else:
            # Drop the first and last characters of a "<...>" target.
            if href[0] == "<":
                href = href[1:-1]
            target = self.unescape(href.strip())
            link.set("href", self.sanitize_url(target))

        if title:
            link.set("title", dequote(self.unescape(title)))
        return link

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Spaces are always replaced with "%20".  Outside of safe mode
        nothing else is done.  In safe mode the url is accepted only if
        it fits a whitelist of known-safe shapes, instead of trying to
        blacklist `javascript:alert("XSS")` and all of its aliases (see
        <http://ha.ckers.org/xss.html>):

        * the scheme must be empty or one of mailto, news, http, https,
          ftp, ftps;
        * a network location is required unless the scheme is empty,
          mailto or news, because script urls do not contain one;
        * "path", "parameters", "query" and "fragment" must not contain
          a literal colon.  Some `javascript:` aliases appear to
          `urlparse()` to have no scheme, and relative links (i.e.:
          "foo/bar.html") have none either, so the colon would show up
          in one of those parts.  "scheme" is not checked because it
          *should* never contain one, and "netloc" must allow the form
          `username:password@host:port`.

        Returns the url (rebuilt with `urlunparse` in safe mode), or an
        empty string when it is rejected or cannot be parsed.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Not in safe mode: skip parsing altogether.
            return url

        try:
            parsed = urlparse(url)
            scheme, netloc, path, params, query, fragment = parsed
        except ValueError:
            # The url is malformed beyond what urlparse can handle.
            return ''

        no_location_schemes = ('', 'mailto', 'news')
        permitted_schemes = no_location_schemes + (
            'http', 'https', 'ftp', 'ftps')
        if scheme not in permitted_schemes:
            # Unknown scheme, so not considered safe.
            return ''

        if netloc == '' and scheme not in no_location_schemes:
            # Should not occur for these schemes; treat it as suspect.
            return ''

        # A colon in "path", "parameters", "query" or "fragment" is suspect.
        if any(":" in part for part in parsed[2:]):
            return ''

        # Every check passed: hand back the reassembled url.
        return urlunparse(parsed)
407
class ImagePattern(LinkPattern):
    """ Build an `<img>` element from an inline image match. """
    def handleMatch(self, m):
        img = util.etree.Element("img")
        # group(9) is the text inside the parentheses: the source first,
        # then an optional title, separated by whitespace.
        parts = m.group(9).split()
        if not parts:
            img.set('src', "")
        else:
            src = parts[0]
            if src[0] == "<" and src[-1] == ">":
                src = src[1:-1]
            img.set('src', self.sanitize_url(self.unescape(src)))

        title_words = parts[1:]
        if title_words:
            title = dequote(self.unescape(" ".join(title_words)))
            img.set('title', title)

        alt = m.group(2)
        if self.markdown.enable_attributes:
            # Apply {@name=value} definitions to the element and remove
            # them from the alt text.
            alt = handleAttributes(alt, img)

        img.set('alt', self.unescape(alt))
        return img
430
class ReferencePattern(LinkPattern):
    """ Resolve a reference-style link against the stored references. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        # group(9) does not exist for every expression used with this
        # class; a missing group is treated like an empty id.
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            ref_id = None
        if not ref_id:
            # For something like "[Google][]" or "[Google]" the link
            # text itself ("google") serves as the id.
            ref_id = m.group(2).lower()

        # Collapse line breaks inside the id into single spaces.
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        references = self.markdown.references
        if ref_id not in references:
            # Undefined references are ignored.
            return None
        href, title = references[ref_id]

        return self.makeTag(href, title, m.group(2))

    def makeTag(self, href, title, text):
        """ Create the `<a>` element for a resolved reference. """
        anchor = util.etree.Element('a')

        anchor.set('href', self.sanitize_url(href))
        if title:
            anchor.set('title', title)

        anchor.text = text
        return anchor
464
465
class ImageReferencePattern(ReferencePattern):
    """ Resolve a reference-style image and return an img element. """
    def makeTag(self, href, title, text):
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)

        alt = text
        if self.markdown.enable_attributes:
            # Apply {@name=value} definitions to the element and remove
            # them from the alt text.
            alt = handleAttributes(alt, img)

        img.set("alt", self.unescape(alt))
        return img
479
480
class AutolinkPattern(Pattern):
    """ Build a link Element from an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        url = m.group(2)
        link = util.etree.Element("a")
        # The href is unescaped; the visible text is the raw match
        # wrapped in an AtomicString.
        link.set('href', self.unescape(url))
        link.text = util.AtomicString(url)
        return link
488
class AutomailPattern(Pattern):
    """
    Build a mailto link Element from an automail link
    (`<foo@example.com>`).

    Every character of both the link text and the href is written as a
    character entity, prefixed with `util.AMP_SUBSTITUTE`.
    """
    def handleMatch(self, m):
        link = util.etree.Element('a')
        address = self.unescape(m.group(2))
        scheme = "mailto:"
        if address.startswith(scheme):
            address = address[len(scheme):]

        def as_entity(code):
            """Named entity for `code` if one exists, else numeric."""
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "%s%s;" % (util.AMP_SUBSTITUTE, name)

        encoded_text = ''.join(as_entity(ord(char)) for char in address)
        link.text = util.AtomicString(encoded_text)

        # The href always uses numeric entities, scheme included.
        target = "mailto:" + address
        encoded_href = "".join(
            util.AMP_SUBSTITUTE + '#%d;' % ord(char) for char in target)
        link.set('href', encoded_href)
        return link
515
516