1
2"""
3PRE-PROCESSORS
4=============================================================================
5
6Preprocessors work on source text before we start doing anything too
7complicated.
8"""
9
10import re
11import markdown
12
# Prefix shared by every raw-HTML placeholder: markdown.STX followed by a
# fixed marker string.
# NOTE(review): STX/ETX are presumably start-/end-of-text sentinel characters
# defined by the markdown package -- confirm there.
HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
# Complete placeholder template.  The "%d" slot is filled with the running
# counter kept by HtmlStash (see HtmlStash.store); markdown.ETX closes it.
HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX
15
class Processor:
    """
    Common base for processors that may be attached to a Markdown instance.
    """

    def __init__(self, markdown_instance=None):
        """
        Remember the Markdown instance this processor belongs to.

        Keyword arguments:

        * markdown_instance: object to expose as `self.markdown`.  When it
          is omitted (or is otherwise falsy) the attribute is not created.

        """
        if not markdown_instance:
            return
        self.markdown = markdown_instance
20
class Preprocessor (Processor):
    """
    Base class for preprocessors, which run after the text has been broken
    into lines.

    A preprocessor provides a "run" method.  It receives the list of lines
    that make up the document, changes it as needed, and hands back either
    that same list or a newly built one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Hook for subclasses: override `run` to transform the document.

        Keyword arguments:

        * lines: the document as a list of strings split by newlines

        Returns : the (possibly modified) list of lines.  This base
        implementation does nothing and returns None.

        """
        return None
40
class HtmlStash:
    """
    Holding area for HTML segments that are pulled out of the document
    early on and stood in for by place-holders.
    """

    def __init__ (self):
        """ Start out with an empty stash. """
        self.rawHtmlBlocks = []
        self.html_counter = 0 # number used for the next stored segment

    def store(self, html, safe=False):
        """
        Keep an HTML segment so that it can be reinserted later.

        Keyword arguments:

        * html: an html segment
        * safe: label an html segment as safe for safemode

        Returns : the placeholder string to put into the document in
        place of the segment.

        """
        index = self.html_counter
        self.rawHtmlBlocks.append((html, safe))
        self.html_counter = index + 1
        return HTML_PLACEHOLDER % index

    def reset(self):
        """ Drop every stored segment and restart the numbering at zero. """
        self.rawHtmlBlocks = []
        self.html_counter = 0
74
75
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates tried, in order, when looking for the text that closes a tag;
    # "%s" is replaced by the opening tag name.
    right_tag_patterns = ["</%s>", "%s>"]

    def _get_left_tag(self, block):
        """
        Return the lower-cased first token that follows the leading "<".

        The first ">" is treated as whitespace so that "<div>" yields "div".
        An empty string is returned when there is no token at all (blocks
        such as "<", "<>" or "< "), instead of raising IndexError.

        """
        tokens = block[1:].replace(">", " ", 1).split()
        if not tokens:
            return ""
        return tokens[0].lower()

    def _get_right_tag(self, left_tag, block):
        """
        Locate the closing tag for `left_tag` inside `block`.

        Returns : a `(right_tag, data_index)` tuple.  When one of
        `right_tag_patterns` is found past index 2, `right_tag` is that
        pattern without its angle brackets and `data_index` is the offset
        just after its last occurrence.  Otherwise `right_tag` is guessed
        from the tail of the block and `data_index` is `len(block)`.

        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = block.rfind(tag)
            if i > 2:
                # len(p) - 2 discounts the "%s" that left_tag replaced.
                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Tell whether `right_tag` counts as the close of `left_tag`.

        Slices are used instead of `[0]` so that empty tag strings are
        compared safely rather than raising IndexError.

        """
        if left_tag == 'div' or left_tag[:1] in ['?', '@', '%']: # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if right_tag == "--" and left_tag == "--":
            return True
        return left_tag == right_tag[1:] and right_tag[:1] != "<"

    def _is_oneliner(self, tag):
        """ Tell whether `tag` is one of the stand-alone 'hr' forms. """
        return (tag in ['hr', 'hr/'])

    def run(self, lines):
        """
        Replace raw html blocks in the document with stash place-holders.

        The document is cut into blocks at blank lines.  Blocks recognised
        as html are handed to `self.markdown.htmlStash.store` and replaced
        by the returned placeholder; an html block whose closing tag has not
        been seen yet keeps absorbing the following blocks until it is.

        Keyword arguments:

        * lines: the document as a list of strings split by newlines

        Returns : the processed document as a list of lines.

        """
        # Blocks still to be processed, kept reversed so that the next block
        # is always the last element: pop()/append() are O(1), whereas
        # slicing the head off a list copies it on every iteration.
        pending = "\n".join(lines).split("\n\n")
        pending.reverse()

        new_blocks = []
        items = []      # pieces of an html block that is still open
        left_tag = ''
        right_tag = ''
        in_tag = False  # True while inside an unclosed html block

        while pending:
            block = pending.pop()
            # A block can start with at most one newline (the text was split
            # on "\n\n"), so a single strip is sufficient.
            if block.startswith("\n"):
                block = block[1:]

            if in_tag:
                items.append(block.strip())

                right_tag, data_index = self._get_right_tag(left_tag, block)

                if self._equal_tags(left_tag, right_tag):
                    # found the closing tag: stash everything collected
                    in_tag = False
                    new_blocks.append(
                        self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []
                continue

            if not block.startswith("<"):
                new_blocks.append(block)
                continue

            left_tag = self._get_left_tag(block)
            if not left_tag:
                # "<", "<>", "< " ...: there is no tag name, so this is not
                # html.  Pass it through untouched (this used to raise
                # IndexError).
                new_blocks.append(block)
                continue

            right_tag, data_index = self._get_right_tag(left_tag, block)

            if block[1] == "!":
                # is a comment block
                left_tag = "--"
                right_tag, data_index = self._get_right_tag(left_tag, block)
                # keep checking conditions below and maybe just append

            is_block_level = markdown.isBlockLevel(left_tag)

            if data_index < len(block) and is_block_level:
                # Text follows the closing tag: queue it as the next block.
                pending.append(block[data_index:])
                block = block[:data_index]

            if not (is_block_level or block[1] in ["!", "?", "@", "%"]):
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            if block.rstrip().endswith(">") \
                and self._equal_tags(left_tag, right_tag):
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))
                continue

            # NOTE(review): the parentheses only spell out how the original
            # un-parenthesised "A or B and C" was already evaluated ("and"
            # binds tighter than "or"); behaviour is unchanged.  Whether
            # "(A or B) and C" was intended is unclear.
            if is_block_level or (left_tag == "--"
                                  and not block.rstrip().endswith(">")):
                # block is not complete yet: start collecting
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))

        if items:
            # The document ended inside an unclosed block: stash what we have.
            new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
189
190
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Groups: 1 = up to three leading spaces, 2 = reference id,
    # 3 = url (run of non-space characters), 4 = rest of line (possible title)
    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

    def run (self, lines):
        """
        Collect reference definitions and drop them from the document.

        Each line matching `RE` whose trailing text is either empty or a
        title wrapped in "...", '...' or (...) is recorded in
        `self.markdown.references` as `id -> (url, title)`, with the id
        stripped and lower-cased, and is left out of the result.  A url
        written inside angle brackets (`<http://example.com/>`) is stored
        without the brackets.  Every other line is kept unchanged.

        Keyword arguments:

        * lines: the document as a list of strings split by newlines

        Returns : the list of lines that are not reference definitions.

        """
        new_text = []
        for line in lines:
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(2).strip().lower()

            url = m.group(3)
            # Markdown allows the url to be enclosed in angle brackets;
            # they are delimiters, not part of the address.
            if len(url) >= 2 and url.startswith("<") and url.endswith(">"):
                url = url[1:-1]

            title = m.group(4).strip()  # potential title
            if not title:
                self.markdown.references[ref_id] = (url, title)
            elif (len(title) >= 2
                  and (title[0] == title[-1] == "\""
                       or title[0] == title[-1] == "\'"
                       or (title[0] == "(" and title[-1] == ")"))):
                self.markdown.references[ref_id] = (url, title[1:-1])
            else:
                # Trailing text is not a properly delimited title, so this
                # is not a reference definition after all.
                new_text.append(line)

        return new_text
216