1# markdown is released under the BSD license
2# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4# Copyright 2004 Manfred Stienstra (the original version)
5#
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions are met:
10#
11# *   Redistributions of source code must retain the above copyright
12#     notice, this list of conditions and the following disclaimer.
13# *   Redistributions in binary form must reproduce the above copyright
14#     notice, this list of conditions and the following disclaimer in the
15#     documentation and/or other materials provided with the distribution.
16# *   Neither the name of the <organization> nor the
17#     names of its contributors may be used to endorse or promote products
18#     derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30# POSSIBILITY OF SUCH DAMAGE.
31
32
# markdown/serializers.py
34#
# Add x/html serialization to ElementTree
36# Taken from ElementTree 1.3 preview with slight modifications
37#
38# Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
39#
40# fredrik@pythonware.com
41# http://www.pythonware.com
42#
43# --------------------------------------------------------------------
44# The ElementTree toolkit is
45#
46# Copyright (c) 1999-2007 by Fredrik Lundh
47#
48# By obtaining, using, and/or copying this software and/or its
49# associated documentation, you agree that you have read, understood,
50# and will comply with the following terms and conditions:
51#
52# Permission to use, copy, modify, and distribute this software and
53# its associated documentation for any purpose and without fee is
54# hereby granted, provided that the above copyright notice appears in
55# all copies, and that both that copyright notice and this permission
56# notice appear in supporting documentation, and that the name of
57# Secret Labs AB or the author not be used in advertising or publicity
58# pertaining to distribution of the software without specific, written
59# prior permission.
60#
61# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
62# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
63# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
64# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
65# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
66# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
67# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
68# OF THIS SOFTWARE.
69# --------------------------------------------------------------------
70
71
72from __future__ import absolute_import
73from __future__ import unicode_literals
74from . import util
# Module-level aliases for the etree names used by the serializer below.
ElementTree = util.etree.ElementTree
QName = util.etree.QName
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction
# Use ``test_comment`` when the etree module exposes it, else ``Comment``.
Comment = (util.etree.test_comment
           if hasattr(util.etree, 'test_comment')
           else util.etree.Comment)
83
# Names exported by ``from <module> import *``: the two serializer entry points.
__all__ = ['to_html_string', 'to_xhtml_string']
85
# Tags serialized without a closing tag (and as ``<tag />`` in XHTML output).
# Each name must be its own element: adjacent string literals without a comma
# are concatenated by Python, which previously produced a bogus "metaparam"
# entry and left both "meta" and "param" out of the set.
# ``set`` is always a builtin on interpreters that accept this module's
# ``__future__`` imports, so no NameError fallback is needed.
HTML_EMPTY = set((
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
))
93
# Maps namespace URIs to the conventional prefix used when a tree does not
# already associate a prefix with that URI.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
106
107
108def _raise_serialization_error(text):
109    raise TypeError(
110        "cannot serialize %r (type %s)" % (text, type(text).__name__)
111        )
112
113def _encode(text, encoding):
114    try:
115        return text.encode(encoding, "xmlcharrefreplace")
116    except (TypeError, AttributeError):
117        _raise_serialization_error(text)
118
119def _escape_cdata(text):
120    # escape character data
121    try:
122        # it's worth avoiding do-nothing calls for strings that are
123        # shorter than 500 character, or so.  assume that's, by far,
124        # the most common case in most applications.
125        if "&" in text:
126            text = text.replace("&", "&amp;")
127        if "<" in text:
128            text = text.replace("<", "&lt;")
129        if ">" in text:
130            text = text.replace(">", "&gt;")
131        return text
132    except (TypeError, AttributeError):
133        _raise_serialization_error(text)
134
135
136def _escape_attrib(text):
137    # escape attribute value
138    try:
139        if "&" in text:
140            text = text.replace("&", "&amp;")
141        if "<" in text:
142            text = text.replace("<", "&lt;")
143        if ">" in text:
144            text = text.replace(">", "&gt;")
145        if "\"" in text:
146            text = text.replace("\"", "&quot;")
147        if "\n" in text:
148            text = text.replace("\n", "&#10;")
149        return text
150    except (TypeError, AttributeError):
151        _raise_serialization_error(text)
152
153def _escape_attrib_html(text):
154    # escape attribute value
155    try:
156        if "&" in text:
157            text = text.replace("&", "&amp;")
158        if "<" in text:
159            text = text.replace("<", "&lt;")
160        if ">" in text:
161            text = text.replace(">", "&gt;")
162        if "\"" in text:
163            text = text.replace("\"", "&quot;")
164        return text
165    except (TypeError, AttributeError):
166        _raise_serialization_error(text)
167
168
def _serialize_html(write, elem, qnames, namespaces, format):
    """Serialize ``elem`` and its subtree as HTML or XHTML through ``write``.

    Arguments:
        write: callable invoked with each output fragment, in order.
        elem: the element to serialize; children are handled recursively.
        qnames: mapping from tag/attribute names to their serialized form.
            A tag that maps to ``None`` produces no markup of its own, only
            its text and children.
        namespaces: mapping of namespace URI to prefix, written as ``xmlns``
            attributes on this element.  Recursive calls pass ``None``, so
            the declarations appear only on the element this was called on
            with a non-empty mapping.
        format: ``"html"`` or ``"xhtml"``; controls boolean-attribute
            minimization and the ``<tag />`` form for empty elements.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                # sorted() instead of list.sort(): items() may return a dict
                # view, which has no sort() method.
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    if qnames[k] == v and format == 'html':
                        # handle boolean attributes
                        write(" %s" % v)
                    else:
                        write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # namespaces is a plain dict, so its items() is a view on
                    # Python 3; sorted() works for both views and lists.
                    ns_items = sorted(namespaces.items(),
                                      key=lambda x: x[1])  # sort on prefix
                    for v, k in ns_items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
            if format == "xhtml" and tag in HTML_EMPTY:
                write(" />")
            else:
                write(">")
                tag = tag.lower()
                if text:
                    if tag == "script" or tag == "style":
                        # script/style content is written verbatim
                        write(text)
                    else:
                        write(_escape_cdata(text))
                for e in elem:
                    _serialize_html(write, e, qnames, None, format)
                if tag not in HTML_EMPTY:
                    write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
223
def _write_html(root,
                encoding=None,
                default_namespace=None,
                format="html"):
    """Serialize the tree rooted at ``root`` and return the result.

    Arguments:
        root: root element of the tree; must not be ``None``.
        encoding: when ``None`` the joined text is returned as-is;
            otherwise the text is passed to ``_encode`` with this encoding.
        default_namespace: forwarded to ``_namespaces``.
        format: ``"html"`` or ``"xhtml"``, forwarded to ``_serialize_html``.
    """
    assert root is not None
    data = []
    write = data.append
    qnames, namespaces = _namespaces(root, default_namespace)
    _serialize_html(write, root, qnames, namespaces, format)
    if encoding is None:
        return "".join(data)
    else:
        # _encode requires the target encoding; omitting it raised TypeError
        # for every non-None ``encoding``.
        return _encode("".join(data), encoding)
237
238
239# --------------------------------------------------------------------
240# serialization support
241
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Arguments:
        elem: root element of the tree to scan.
        default_namespace: optional namespace URI to serialize without a
            prefix.

    Returns:
        A ``(qnames, namespaces)`` tuple.  ``qnames`` maps each tag,
        attribute name and QName value found in the tree to its serialized
        ``prefix:local`` (or plain) form, and always maps ``None`` to
        ``None``.  ``namespaces`` maps namespace URIs to prefixes.

    Raises:
        ValueError: if ``default_namespace`` is given and the tree contains
            a non-qualified name.
        TypeError: via ``_raise_serialization_error`` for a tag or name that
            cannot be serialized.
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        # the "xml" prefix is never recorded for declaration
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag # default element
            else:
                if default_namespace:
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # The "already registered" test is nested so that a repeated
            # QName tag does not fall through to the error branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, util.string_type):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
304
def to_html_string(element):
    """Return ``element`` and its subtree serialized in the "html" format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
307
def to_xhtml_string(element):
    """Return ``element`` and its subtree serialized in the "xhtml" format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")
310