1# markdown is released under the BSD license
2# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4# Copyright 2004 Manfred Stienstra (the original version)
5#
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions are met:
10#
11# *   Redistributions of source code must retain the above copyright
12#     notice, this list of conditions and the following disclaimer.
13# *   Redistributions in binary form must reproduce the above copyright
14#     notice, this list of conditions and the following disclaimer in the
15#     documentation and/or other materials provided with the distribution.
16# *   Neither the name of the <organization> nor the
17#     names of its contributors may be used to endorse or promote products
18#     derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30# POSSIBILITY OF SUCH DAMAGE.
31
32
33"""
34PRE-PROCESSORS
35=============================================================================
36
37Preprocessors work on source text before we start doing anything too
38complicated.
39"""
40
41from __future__ import absolute_import
42from __future__ import unicode_literals
43from . import util
44from . import odict
45import re
46
47
48def build_preprocessors(md_instance, **kwargs):
49    """ Build the default set of preprocessors used by Markdown. """
50    preprocessors = odict.OrderedDict()
51    preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
52    if md_instance.safeMode != 'escape':
53        preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
54    preprocessors["reference"] = ReferencePreprocessor(md_instance)
55    return preprocessors
56
57
58class Preprocessor(util.Processor):
59    """
60    Preprocessors are run after the text is broken into lines.
61
62    Each preprocessor implements a "run" method that takes a pointer to a
63    list of lines of the document, modifies it as necessary and returns
64    either the same pointer or a pointer to a new list.
65
66    Preprocessors must extend markdown.Preprocessor.
67
68    """
69    def run(self, lines):
70        """
71        Each subclass of Preprocessor should override the `run` method, which
72        takes the document as a list of strings split by newlines and returns
73        the (possibly modified) list of lines.
74
75        """
76        pass
77
78
79class NormalizeWhitespace(Preprocessor):
80    """ Normalize whitespace for consistant parsing. """
81
82    def run(self, lines):
83        source = '\n'.join(lines)
84        source = source.replace(util.STX, "").replace(util.ETX, "")
85        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
86        source = source.expandtabs(self.markdown.tab_length)
87        source = re.sub(r'(?<=\n) +\n', '\n', source)
88        return source.split('\n')
89
90
91class HtmlBlockPreprocessor(Preprocessor):
92    """Remove html blocks from the text and store them for later retrieval."""
93
94    right_tag_patterns = ["</%s>", "%s>"]
95    attrs_pattern = r"""
96        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
97        |                                                         # OR
98        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
99        |                                                         # OR
100        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
101        """
102    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
103    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
104    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
105    markdown_in_raw = False
106
107    def _get_left_tag(self, block):
108        m = self.left_tag_re.match(block)
109        if m:
110            tag = m.group('tag')
111            raw_attrs = m.group('attrs')
112            attrs = {}
113            if raw_attrs:
114                for ma in self.attrs_re.finditer(raw_attrs):
115                    if ma.group('attr'):
116                        if ma.group('value'):
117                            attrs[ma.group('attr').strip()] = ma.group('value')
118                        else:
119                            attrs[ma.group('attr').strip()] = ""
120                    elif ma.group('attr1'):
121                        if ma.group('value1'):
122                            attrs[ma.group('attr1').strip()] = ma.group('value1')
123                        else:
124                            attrs[ma.group('attr1').strip()] = ""
125                    elif ma.group('attr2'):
126                        attrs[ma.group('attr2').strip()] = ""
127            return tag, len(m.group(0)), attrs
128        else:
129            tag = block[1:].split(">", 1)[0].lower()
130            return tag, len(tag)+2, {}
131
132    def _recursive_tagfind(self, ltag, rtag, start_index, block):
133        while 1:
134            i = block.find(rtag, start_index)
135            if i == -1:
136                return -1
137            j = block.find(ltag, start_index)
138            # if no ltag, or rtag found before another ltag, return index
139            if (j > i or j == -1):
140                return i + len(rtag)
141            # another ltag found before rtag, use end of ltag as starting
142            # point and search again
143            j = block.find('>', j)
144            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
145            if start_index == -1:
146                # HTML potentially malformed- ltag has no corresponding
147                # rtag
148                return -1
149
150    def _get_right_tag(self, left_tag, left_index, block):
151        for p in self.right_tag_patterns:
152            tag = p % left_tag
153            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
154            if i > 2:
155                return tag.lstrip("<").rstrip(">"), i
156        return block.rstrip()[-left_index:-1].lower(), len(block)
157
158    def _equal_tags(self, left_tag, right_tag):
159        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
160            return True
161        if ("/" + left_tag) == right_tag:
162            return True
163        if (right_tag == "--" and left_tag == "--"):
164            return True
165        elif left_tag == right_tag[1:] \
166            and right_tag[0] == "/":
167            return True
168        else:
169            return False
170
171    def _is_oneliner(self, tag):
172        return (tag in ['hr', 'hr/'])
173
174    def run(self, lines):
175        text = "\n".join(lines)
176        new_blocks = []
177        text = text.rsplit("\n\n")
178        items = []
179        left_tag = ''
180        right_tag = ''
181        in_tag = False # flag
182
183        while text:
184            block = text[0]
185            if block.startswith("\n"):
186                block = block[1:]
187            text = text[1:]
188
189            if block.startswith("\n"):
190                block = block[1:]
191
192            if not in_tag:
193                if block.startswith("<") and len(block.strip()) > 1:
194
195                    if block[1] == "!":
196                        # is a comment block
197                        left_tag, left_index, attrs  = "--", 2, {}
198                    else:
199                        left_tag, left_index, attrs = self._get_left_tag(block)
200                    right_tag, data_index = self._get_right_tag(left_tag,
201                                                                left_index,
202                                                                block)
203                    # keep checking conditions below and maybe just append
204
205                    if data_index < len(block) \
206                        and (util.isBlockLevel(left_tag)
207                        or left_tag == '--'):
208                        text.insert(0, block[data_index:])
209                        block = block[:data_index]
210
211                    if not (util.isBlockLevel(left_tag) \
212                        or block[1] in ["!", "?", "@", "%"]):
213                        new_blocks.append(block)
214                        continue
215
216                    if self._is_oneliner(left_tag):
217                        new_blocks.append(block.strip())
218                        continue
219
220                    if block.rstrip().endswith(">") \
221                        and self._equal_tags(left_tag, right_tag):
222                        if self.markdown_in_raw and 'markdown' in attrs.keys():
223                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
224                                           '', block[:left_index])
225                            end = block[-len(right_tag)-2:]
226                            block = block[left_index:-len(right_tag)-2]
227                            new_blocks.append(
228                                self.markdown.htmlStash.store(start))
229                            new_blocks.append(block)
230                            new_blocks.append(
231                                self.markdown.htmlStash.store(end))
232                        else:
233                            new_blocks.append(
234                                self.markdown.htmlStash.store(block.strip()))
235                        continue
236                    else:
237                        # if is block level tag and is not complete
238
239                        if util.isBlockLevel(left_tag) or left_tag == "--" \
240                            and not block.rstrip().endswith(">"):
241                            items.append(block.strip())
242                            in_tag = True
243                        else:
244                            new_blocks.append(
245                            self.markdown.htmlStash.store(block.strip()))
246
247                        continue
248
249                new_blocks.append(block)
250
251            else:
252                items.append(block)
253
254                right_tag, data_index = self._get_right_tag(left_tag, 0, block)
255
256                if self._equal_tags(left_tag, right_tag):
257                    # if find closing tag
258
259                    if data_index < len(block):
260                        # we have more text after right_tag
261                        items[-1] = block[:data_index]
262                        text.insert(0, block[data_index:])
263
264                    in_tag = False
265                    if self.markdown_in_raw and 'markdown' in attrs.keys():
266                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
267                                       '', items[0][:left_index])
268                        items[0] = items[0][left_index:]
269                        end = items[-1][-len(right_tag)-2:]
270                        items[-1] = items[-1][:-len(right_tag)-2]
271                        new_blocks.append(
272                            self.markdown.htmlStash.store(start))
273                        new_blocks.extend(items)
274                        new_blocks.append(
275                            self.markdown.htmlStash.store(end))
276                    else:
277                        new_blocks.append(
278                            self.markdown.htmlStash.store('\n\n'.join(items)))
279                    items = []
280
281        if items:
282            if self.markdown_in_raw and 'markdown' in attrs.keys():
283                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
284                               '', items[0][:left_index])
285                items[0] = items[0][left_index:]
286                end = items[-1][-len(right_tag)-2:]
287                items[-1] = items[-1][:-len(right_tag)-2]
288                new_blocks.append(
289                    self.markdown.htmlStash.store(start))
290                new_blocks.extend(items)
291                if end.strip():
292                    new_blocks.append(
293                        self.markdown.htmlStash.store(end))
294            else:
295                new_blocks.append(
296                    self.markdown.htmlStash.store('\n\n'.join(items)))
297            #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
298            new_blocks.append('\n')
299
300        new_text = "\n\n".join(new_blocks)
301        return new_text.split("\n")
302
303
304class ReferencePreprocessor(Preprocessor):
305    """ Remove reference definitions from text and store for later use. """
306
307    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
308    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
309    TITLE_RE = re.compile(r'^%s$' % TITLE)
310
311    def run (self, lines):
312        new_text = [];
313        while lines:
314            line = lines.pop(0)
315            m = self.RE.match(line)
316            if m:
317                id = m.group(1).strip().lower()
318                link = m.group(2).lstrip('<').rstrip('>')
319                t = m.group(5) or m.group(6) or m.group(7)
320                if not t:
321                    # Check next line for title
322                    tm = self.TITLE_RE.match(lines[0])
323                    if tm:
324                        lines.pop(0)
325                        t = tm.group(2) or tm.group(3) or tm.group(4)
326                self.markdown.references[id] = (link, t)
327            else:
328                new_text.append(line)
329
330        return new_text #+ "\n"
331