1# markdown is released under the BSD license
2# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4# Copyright 2004 Manfred Stienstra (the original version)
5#
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions are met:
10#
11# *   Redistributions of source code must retain the above copyright
12#     notice, this list of conditions and the following disclaimer.
13# *   Redistributions in binary form must reproduce the above copyright
14#     notice, this list of conditions and the following disclaimer in the
15#     documentation and/or other materials provided with the distribution.
16# *   Neither the name of the <organization> nor the
17#     names of its contributors may be used to endorse or promote products
18#     derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30# POSSIBILITY OF SUCH DAMAGE.
31
32
33"""
34Python Markdown
35===============
36
37Python Markdown converts Markdown to HTML and can be used as a library or
38called from the command line.
39
40## Basic usage as a module:
41
42    import markdown
43    html = markdown.markdown(your_text_string)
44
45See <http://packages.python.org/Markdown/> for more
46information and instructions on how to extend the functionality of
47Python Markdown.  Read that before you try modifying this file.
48
49## Authors and License
50
Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
54
55Contact: markdown@freewisdom.org
56
57Copyright 2007-2013 The Python Markdown Project (v. 1.7 and later)
58Copyright 200? Django Software Foundation (OrderedDict implementation)
59Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
60Copyright 2004 Manfred Stienstra (the original version)
61
62License: BSD (see LICENSE for details).
63"""
64
65from __future__ import absolute_import
66from __future__ import unicode_literals
67from .__version__ import version, version_info
68import re
69import codecs
70import sys
71import logging
72from . import util
73from .preprocessors import build_preprocessors
74from .blockprocessors import build_block_parser
75from .treeprocessors import build_treeprocessors
76from .inlinepatterns import build_inlinepatterns
77from .postprocessors import build_postprocessors
78from .extensions import Extension
79from .serializers import to_html_string, to_xhtml_string
80
81__all__ = ['Markdown', 'markdown', 'markdownFromFile']
82
83logger = logging.getLogger('MARKDOWN')
84
85
class Markdown(object):
    """Convert Markdown to HTML.

    An instance owns the processing pipeline (preprocessors, block parser,
    inline patterns, treeprocessors and postprocessors), the registered
    extensions and the per-document state (`references`, `htmlStash`).
    Call `convert()` for string-to-string conversion or `convertFile()`
    for file/stream conversion; call `reset()` between documents.
    """

    doc_tag = "div"     # Element used to wrap document - later removed

    # Options that may be passed as keyword arguments to `__init__`; each
    # becomes an instance attribute of the same name.
    option_defaults = {
        'html_replacement_text' : '[HTML_REMOVED]',
        'tab_length'            : 4,
        'enable_attributes'     : True,
        'smart_emphasis'        : True,
        'lazy_ol'               : True,
    }

    # Maps each accepted `output_format` name to its serializer function.
    output_formats = {
        'html'  : to_html_string,
        'html4' : to_html_string,
        'html5' : to_html_string,
        'xhtml' : to_xhtml_string,
        'xhtml1': to_xhtml_string,
        'xhtml5': to_xhtml_string,
    }

    ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
                    '(', ')', '>', '#', '+', '-', '.', '!']

    def __init__(self, *args, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
           If they are of type string, the module mdx_name.py will be loaded.
           If they are a subclass of markdown.Extension, they will be used
           as-is.
        * extension_configs: Configuration settings for extensions.
        * output_format: Format of output. Supported formats are:
            * "xhtml1": Outputs XHTML 1.x. Default.
            * "xhtml5": Outputs XHTML style tags of HTML 5
            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
            * "html4": Outputs HTML 4
            * "html5": Outputs HTML style tags of HTML 5
            * "html": Outputs latest supported version of HTML (currently HTML 4).
            Note that it is suggested that the more specific formats ("xhtml1"
            and "html4") be used as "xhtml" or "html" may change in the future
            if it makes sense at that time.
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
        * html_replacement_text: Text used when safe_mode is set to "replace".
        * tab_length: Length of tabs in the source. Default: 4
        * enable_attributes: Enable the conversion of attributes. Default: True
        * smart_emphasis: Treat `_connected_words_` intelligently Default: True
        * lazy_ol: Ignore number of first item of ordered lists. Default: True

        Positional arguments are accepted for backward compatibility only and
        map, in order, to: extensions, extension_configs, safe_mode,
        output_format. A keyword argument of the same name takes precedence
        and any additional positional arguments are ignored.

        """

        # For backward compatibility, map old positional args onto their
        # keyword names. `zip` stops at the shorter sequence, so surplus
        # positional args are ignored; `setdefault` lets keywords win.
        pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format']
        for name, arg in zip(pos, args):
            kwargs.setdefault(name, arg)

        # Loop through kwargs and assign defaults
        for option, default in self.option_defaults.items():
            setattr(self, option, kwargs.get(option, default))

        self.safeMode = kwargs.get('safe_mode', False)
        if self.safeMode and 'enable_attributes' not in kwargs:
            # Disable attributes in safeMode when not explicitly set
            self.enable_attributes = False

        self.registeredExtensions = []
        self.docType = ""
        self.stripTopLevelTags = True

        self.build_parser()

        self.references = {}
        self.htmlStash = util.HtmlStash()
        self.set_output_format(kwargs.get('output_format', 'xhtml1'))
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.reset()

    def build_parser(self):
        """ Build the parser from the various parts. Returns `self`. """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.  See the docstring on Markdown.
        * configs: A dictionary mapping module names to config options.

        Entries that are `None` are skipped. Raises `TypeError` for any
        other entry that is not (or does not build into) an `Extension`.
        Returns `self`.

        """
        for ext in extensions:
            if isinstance(ext, util.string_type):
                ext = self.build_extension(ext, configs.get(ext, []))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self, globals())
            elif ext is not None:
                raise TypeError(
                    'Extension "%s.%s" must be of type: "markdown.Extension"'
                    % (ext.__class__.__module__, ext.__class__.__name__))

        return self

    def build_extension(self, ext_name, configs=None):
        """Build extension by name, then return the extension instance.

        The extension name may contain arguments as part of the string in the
        following format: "extname(key1=value1,key2=value2)"

        Keyword arguments:

        * ext_name: Extension name, optionally followed by inline arguments.
        * configs: Config options as a dict or an iterable of (key, value)
           pairs. Inline arguments override entries with the same key.

        Raises `ValueError` if an inline argument has no "=", `ImportError`
        if the module cannot be found under either naming scheme, and
        `AttributeError` if calling the module's `makeExtension()` fails
        with one (including when the module does not define it).

        """

        # Parse extensions config params (ignore the order). Copy so the
        # caller's object is never modified.
        configs = dict(configs or [])
        full_name = ext_name
        pos = ext_name.find("(") # find the first "("
        if pos > 0:
            ext_args = ext_name[pos+1:-1]
            ext_name = ext_name[:pos]
            for pair in ext_args.split(","):
                if not pair.strip():
                    # Tolerate "ext()" and stray/trailing commas.
                    continue
                # Split on the first "=" only so values may contain "=".
                key, sep, value = pair.partition("=")
                if not sep:
                    raise ValueError(
                        'Malformed argument "%s" in extension "%s": '
                        'expected "key=value"' % (pair.strip(), full_name))
                configs[key.strip()] = value.strip()

        # Setup the module name
        module_name = ext_name
        if '.' not in ext_name:
            module_name = '.'.join(['third_party.markdown.extensions', ext_name])

        # Try loading the extension first from one place, then another
        try: # New style (markdown.extensions.<extension>)
            module = __import__(module_name, {}, {}, [module_name.rpartition('.')[0]])
        except ImportError:
            module_name_old_style = '_'.join(['mdx', ext_name])
            try: # Old style (mdx_<extension>)
                module = __import__(module_name_old_style)
            except ImportError as e:
                message = "Failed loading extension '%s' from '%s' or '%s'" \
                    % (ext_name, module_name, module_name_old_style)
                e.args = (message,) + e.args[1:]
                raise

        # If the module is loaded successfully, we expect it to define a
        # function called makeExtension()
        try:
            return module.makeExtension(configs.items())
        except AttributeError as e:
            message = e.args[0]
            message = "Failed to initiate extension " \
                      "'%s': %s" % (ext_name, message)
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension):
        """ This gets called by the extension. Returns `self`. """
        self.registeredExtensions.append(extension)
        return self

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.

        Clears the HTML stash and the references, then calls `reset()` on
        every registered extension that defines it. Returns `self`.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format):
        """ Set the output format for the class instance.

        `format` is matched case-insensitively against `output_formats`.
        Raises `KeyError` (with a message listing the valid names) when the
        format is unknown. Returns `self`.
        """
        self.output_format = format.lower()
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats.keys())
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                       % (self.output_format,
                          '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    def convert(self, source):
        """
        Convert markdown to serialized XHTML or HTML.

        Keyword arguments:

        * source: Source text as a Unicode string.

        Markdown processing takes place in five steps:

        1. A bunch of "preprocessors" munge the input text.
        2. BlockParser() parses the high-level structural elements of the
           pre-processed text into an ElementTree.
        3. A bunch of "treeprocessors" are run against the ElementTree. One
           such treeprocessor runs InlinePatterns against the ElementTree,
           detecting inline markup.
        4. Some post-processors are run against the text after the ElementTree
           has been serialized into text.
        5. The output is written to a string.

        Returns the converted text with surrounding whitespace stripped, or
        an empty string when `source` is blank. Raises `ValueError` if the
        wrapping `doc_tag` element cannot be stripped from the serialized
        output.

        """

        # Fixup the source text
        if not source.strip():
            return ''  # a blank unicode string

        try:
            source = util.text_type(source)
        except UnicodeDecodeError as e:
            # Customise error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors.values():
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors.values():
            newRoot = treeprocessor.run(root)
            # Compare against None explicitly: an ElementTree element with
            # no children is falsy, so a plain truth test would discard a
            # legitimately returned (childless) replacement root.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                # "+2" accounts for the "<" and ">" around the tag name.
                start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
                end = output.rindex('</%s>'%self.doc_tag)
                output = output[start:end].strip()
            except ValueError:
                if output.strip().endswith('<%s />'%self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level tags. Document=%r' % output.strip())

        # Run the text post-processors
        for pp in self.postprocessors.values():
            output = pp.run(output)

        return output.strip()

    def convertFile(self, input=None, output=None, encoding=None):
        """Converts a markdown file and writes the HTML to a file or stream.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The 'xmlcharrefreplace' error handler is
        used when encoding the output.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: File object or path. Reads from stdin if `None`.
        * output: File object or path. Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        The input is closed after reading, including when `input` is a file
        object. An `output` file object is left open so the caller may keep
        writing to it. Returns `self`.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, util.string_type):
                input_file = codecs.open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            try:
                text = input_file.read()
            finally:
                # Release the handle even if reading/decoding fails.
                input_file.close()
        else:
            text = sys.stdin.read()
            if not isinstance(text, util.text_type):
                text = text.decode(encoding)

        text = text.lstrip('\ufeff') # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, util.string_type):
                output_file = codecs.open(output, "w",
                                          encoding=encoding,
                                          errors="xmlcharrefreplace")
                try:
                    output_file.write(html)
                finally:
                    # Release the handle even if writing/encoding fails.
                    output_file.close()
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            html = html.encode(encoding, "xmlcharrefreplace")
            try:
                # Write bytes directly to buffer (Python 3).
                sys.stdout.buffer.write(html)
            except AttributeError:
                # Probably Python 2, which works with bytes by default.
                sys.stdout.write(html)

        return self
418
419
420"""
421EXPORTED FUNCTIONS
422=============================================================================
423
These are the two functions we really mean to export: markdown() and
markdownFromFile().
426"""
427
def markdown(text, *args, **kwargs):
    """Render a Markdown string as HTML in a single call.

    Convenience wrapper for the common case: a fresh `Markdown` instance is
    built from the given arguments (loading any requested extensions) and
    immediately used to convert `text`.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * Any arguments accepted by the Markdown class.

    Returns: whatever `Markdown.convert` returns for `text`.

    """
    return Markdown(*args, **kwargs).convert(text)
445
446
def markdownFromFile(*args, **kwargs):
    """Convert Markdown read from a file and write the result out.

    Convenience wrapper that builds a `Markdown` instance and delegates to
    its `convertFile` method instead of `convert`.

    Keyword arguments:

    * input: a file name or readable object.
    * output: a file name or writable object.
    * encoding: Encoding of input and output.
    * Any arguments accepted by the Markdown class.

    Positional arguments are supported for backward compatibility and map,
    in order, to: input, output, extensions, encoding. A keyword argument of
    the same name wins, and positional arguments beyond the fourth are
    ignored.

    """
    # Fold legacy positional args into kwargs without overriding keywords;
    # zip() naturally drops any positional args past the known names.
    legacy_names = ('input', 'output', 'extensions', 'encoding')
    for name, value in zip(legacy_names, args):
        kwargs.setdefault(name, value)

    converter = Markdown(**kwargs)
    source = kwargs.get('input', None)
    target = kwargs.get('output', None)
    codec = kwargs.get('encoding', None)
    converter.convertFile(source, target, codec)
475
476