1"""
2INLINE PATTERNS
3=============================================================================
4
5Inline patterns such as *emphasis* are handled by means of auxiliary
6objects, one per pattern.  Pattern objects must be instances of classes
7that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:
9
10    pattern.getCompiledRegExp() # returns a regular expression
11
12    pattern.handleMatch(m) # takes a match object and returns
13                           # an ElementTree element or just plain text
14
15All of python markdown's built-in patterns subclass from Pattern,
16but you can add additional patterns that don't.
17
Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*?)$".
22
23Finally, the order in which regular expressions are applied is very
24important - e.g. if we first replace http://.../ links with <a> tags
25and _then_ try to replace inline html, we would end up with a mess.
26So, we apply the expressions in the following order:
27
28* escape and backticks have to go before everything else, so
29  that we can preempt any markdown patterns by escaping them.
30
31* then we handle auto-links (must be done before inline html)
32
33* then we handle inline HTML.  At this point we will simply
34  replace all inline HTML strings with a placeholder and add
35  the actual HTML to a hash.
36
37* then inline images (must be done before links)
38
39* then bracketed links, first regular then reference-style
40
41* finally we apply strong and emphasis
42"""
43
44import markdown
import re
import sys

# The URL helpers and the HTML entity table live in different modules on
# Python 2 and Python 3, so pick the location this interpreter provides.
# sys.version_info is compared numerically; comparing the sys.version string
# would be a lexicographic comparison.
if sys.version_info[0] >= 3:
    from urllib.parse import urlparse, urlunparse
    from html import entities as htmlentitydefs
else:
    from urlparse import urlparse, urlunparse
    import htmlentitydefs
52
53"""
54The actual regular expressions for patterns
55-----------------------------------------------------------------------------
56"""
57
# Building blocks shared by the link and image expressions.
NOBRACKET = r'[^\]\[]*'     # any run of characters other than [ and ]

# Bracketed text whose content may itself contain balanced [ ] pairs,
# nested up to six levels deep.  Only the outermost content is of interest;
# the six inner groups exist purely to allow the nesting.
BRK = (
    r'\[('
    + (NOBRACKET + r'(\[') * 6
    + (NOBRACKET + r'\])*') * 6
    + NOBRACKET
    + r')\]'
)

NOIMG = r'(?<!\!)'          # lookbehind: not directly preceded by "!"

# In the expressions below the back-reference \2 points at the first group
# written here, because Pattern prepends its own capture group ("^(.*?)")
# before compiling.
BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'  # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                            # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                 # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'              # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'           # ***strong***
70
# Underscore emphasis.  With markdown.SMART_EMPHASIS enabled the underscores
# must not be adjacent to a word character on the outside and the emphasised
# text must start with a non-whitespace character; otherwise any pair of
# underscores delimits emphasis.
EMPHASIS_2_RE = (
    r'(?<!\w)(_)(\S.+?)\2(?!\w)'        # _emphasis_
    if markdown.SMART_EMPHASIS
    else r'(_)(.+?)\2'                 # _emphasis_
)
75
# Group numbers used by the pattern classes count the leading "^(.*?)" group
# that Pattern prepends as group 1; BRK then occupies groups 2-8, so the
# first group written after BRK is group 9.  In LINK_RE the quoted title is
# group 11 and \12 is a back-reference to its opening quote character.
LINK_RE = NOIMG + BRK + \
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12)?\)'''
# [text](url) or [text](<url>)

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]'           # [Google][3]
# Raw string literal: the value is unchanged, but "\s", "\[" and "\]" are no
# longer invalid escape sequences in a plain string.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s*\[([^\]]*)\]' # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>'        # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
LINE_BREAK_2_RE = r'  $'                    # two spaces at end of text
92
93
def dequote(string):
    """
    Remove one pair of matching quotes from around a string.

    The string is returned unchanged unless it is at least two characters
    long and both starts and ends with the same quote character (either
    `"` or `'`).  The length check keeps a lone quote character from being
    treated as its own opening and closing quote.

    """
    if (len(string) >= 2
            and string[0] == string[-1]
            and string[0] in ('"', "'")):
        return string[1:-1]
    return string
101
# Raw string literal so that "\{" and "\}" reach the regex engine without
# being invalid string escape sequences; the pattern itself is unchanged.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}

def handleAttributes(text, parent):
    """
    Set values of an element based on attribute definitions ({@id=123}).

    Keyword arguments:

    * text: The text to scan for `{@name=value}` definitions.
    * parent: The element on which each definition is set via
      `parent.set(name, value)`.  Newlines in a value become spaces.

    Returns `text` with every attribute definition removed.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # The definition itself is dropped from the text.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
109
110
111"""
112The pattern classes
113-----------------------------------------------------------------------------
114"""
115
class Pattern:
    """
    Base class that inline patterns subclass.

    A pattern owns one regular expression and converts a match of it into
    output through `handleMatch`.

    """

    def __init__ (self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional object stored as `self.markdown`
          when it is truthy; otherwise the attribute is left unset.

        """
        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

        self.pattern = pattern
        # Surround the expression with two extra groups so a match always
        # spans the whole text: group(1) is what precedes the pattern and
        # the last group is what follows it.  DOTALL lets "." cross newlines.
        whole_text_re = "^(.*?)%s(.*?)$" % pattern
        self.compiled_re = re.compile(whole_text_re, re.DOTALL)

    def getCompiledRegExp (self):
        """ Return the compiled, whole-text form of the expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        The base implementation does nothing and returns None;
        subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        return None

    def type(self):
        """ Return class name, to define pattern type """
        cls = self.__class__
        return cls.__name__

BasePattern = Pattern # for backward compatibility
157
class SimpleTextPattern (Pattern):
    """ Return group(2) of a match as plain text. """
    def handleMatch(self, m):
        """
        Return the text of group(2), or None when that text is exactly
        `markdown.INLINE_PLACEHOLDER_PREFIX`.

        """
        matched = m.group(2)
        return None if matched == markdown.INLINE_PLACEHOLDER_PREFIX else matched
165
class SimpleTagPattern (Pattern):
    """
    Wrap group(3) of a match in an element of type `tag`.

    """
    def __init__ (self, pattern, tag):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * tag: Name of the element that `handleMatch` creates

        """
        self.tag = tag
        Pattern.__init__(self, pattern)

    def handleMatch(self, m):
        """ Return a new `tag` element whose text is group(3) of `m`. """
        element = markdown.etree.Element(self.tag)
        element.text = m.group(3)
        return element
180
181
class SubstituteTagPattern (SimpleTagPattern):
    """ Replace a match with an empty element of type `tag`. """
    def handleMatch (self, m):
        """ Return a new `tag` element; nothing from `m` is used. """
        empty_element = markdown.etree.Element(self.tag)
        return empty_element
186
187
class BacktickPattern (Pattern):
    """ Turn backtick-delimited text into a `<code>` element. """
    def __init__ (self, pattern):
        """ Compile `pattern` and fix the output tag to "code". """
        self.tag = "code"
        Pattern.__init__(self, pattern)

    def handleMatch(self, m):
        """
        Return a `code` element holding group(3) of `m`, stripped of
        surrounding whitespace and wrapped in `markdown.AtomicString`.

        """
        code_text = m.group(3).strip()
        code_el = markdown.etree.Element(self.tag)
        code_el.text = markdown.AtomicString(code_text)
        return code_el
198
199
class DoubleTagPattern (SimpleTagPattern):
    """Return a ElementTree element nested in tag2 nested in tag1.

    `self.tag` must hold exactly two tag names separated by a comma
    ("tag1,tag2").  Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        """ Return the outer element; group(3) of `m` is the inner text. """
        outer_tag, inner_tag = self.tag.split(",")
        outer = markdown.etree.Element(outer_tag)
        inner = markdown.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        return outer
212
213
class HtmlPattern (Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch (self, m):
        """
        Stash group(2) of `m` in `self.markdown.htmlStash` and return
        whatever placeholder the stash's `store` call hands back.

        """
        rawhtml = m.group(2)
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder
221
222
class LinkPattern (Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        """
        Build an `a` element from a match of an inline link.

        Group 2 of `m` is the link text, group 9 the url (optionally
        wrapped in angle brackets) and group 11 the quoted title.

        """
        el = markdown.etree.Element("a")
        el.text = m.group(2)
        title = m.group(11)
        href = m.group(9)

        if href:
            # Only unwrap when both angle brackets are present; a url that
            # merely starts with "<" must not lose its last character.
            if href[0] == "<" and href[-1] == ">":
                href = href[1:-1]
            el.set("href", self.sanitize_url(href.strip()))
        else:
            el.set("href", "")

        if title:
            el.set("title", dequote(title))
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Returns the url as reassembled by `urlunparse()`, or an empty string
        when `self.markdown.safeMode` is on and the url is not considered
        safe.  A url that `urlparse()` cannot parse at all is treated as
        unsafe: it yields an empty string in safe mode and is returned
        unchanged otherwise.

        """
        locless_schemes = ['', 'mailto', 'news']
        try:
            parts = urlparse(url)
        except ValueError:
            # urlparse() rejects some malformed urls (e.g. unmatched square
            # brackets in the network location).  Don't let one bad link
            # abort the whole conversion.
            if self.markdown.safeMode:
                return ''
            return url

        scheme, netloc = parts[0], parts[1]
        safe_url = netloc != '' or scheme in locless_schemes

        # A colon in path, params, query or fragment marks the url unsafe.
        if any(":" in part for part in parts[2:]):
            safe_url = False

        if self.markdown.safeMode and not safe_url:
            return ''
        return urlunparse(parts)
274
class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """
    def handleMatch(self, m):
        """
        Build an `img` element from a match of an inline image.

        Group 9 of `m` is split on whitespace: the first piece is the
        source (optionally wrapped in angle brackets), any remaining
        pieces are joined into the title.  Group 2 is the alt text.

        """
        img = markdown.etree.Element("img")
        pieces = m.group(9).split()

        if not pieces:
            img.set('src', "")
        else:
            source = pieces[0]
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            img.set('src', self.sanitize_url(source))
            if len(pieces) > 1:
                img.set('title', dequote(" ".join(pieces[1:])))

        alt_text = m.group(2)
        if markdown.ENABLE_ATTRIBUTES:
            # Attribute definitions found in the alt text are applied to the
            # element and removed from the text.
            alt_text = handleAttributes(alt_text, img)

        img.set('alt', alt_text)
        return img
297
class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """
    def handleMatch(self, m):
        """
        Look the reference up in `self.markdown.references` and return the
        element built by `makeTag`, or None when the reference is undefined.

        """
        link_text = m.group(2)
        # An empty id, as in "[Google][]", falls back to the link text,
        # so "google" becomes the id.
        ref_id = (m.group(9) or link_text).lower()

        references = self.markdown.references
        if ref_id not in references: # ignore undefined refs
            return None

        href, title = references[ref_id]
        return self.makeTag(href, title, link_text)

    def makeTag(self, href, title, text):
        """ Return an `a` element for the resolved reference. """
        link = markdown.etree.Element('a')
        link.text = text

        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)
        return link
324
325
class ImageReferencePattern (ReferencePattern):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """
        Return an `img` element: `href` becomes the sanitized `src`,
        `title` is set only when truthy, and `text` becomes `alt`.

        """
        image = markdown.etree.Element("img")
        image.set("src", self.sanitize_url(href))
        if title:
            image.set("title", title)
        image.set("alt", text)
        return image
335
336
class AutolinkPattern (Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        """
        Return an `a` element whose href and text are both group(2) of
        `m`; the text is wrapped in `markdown.AtomicString`.

        """
        url = m.group(2)
        link = markdown.etree.Element("a")
        link.set('href', url)
        link.text = markdown.AtomicString(url)
        return link
344
class AutomailPattern (Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        """
        Return an `a` element for the address in group(2) of `m`.

        Every character of the link text is written as a named or numeric
        character reference, and every character of the "mailto:" href as
        a numeric one, each introduced by `markdown.AMP_SUBSTITUTE`.

        """
        address = m.group(2)
        if address.startswith("mailto:"):
            address = address[len("mailto:"):]

        def entity_for(code):
            """Return entity definition by code, or the code if not defined."""
            name = htmlentitydefs.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code)
            return "%s%s;" % (markdown.AMP_SUBSTITUTE, name)

        # Link text: named entities where one exists, numeric otherwise.
        text_refs = []
        for char in address:
            text_refs.append(entity_for(ord(char)))

        # href: numeric references only.
        href_refs = []
        for char in "mailto:" + address:
            href_refs.append(markdown.AMP_SUBSTITUTE + '#%d;' % ord(char))

        link = markdown.etree.Element('a')
        link.text = markdown.AtomicString(''.join(text_refs))
        link.set('href', "".join(href_refs))
        return link
371
372