1from __future__ import absolute_import, division, unicode_literals
2
3import re
4from xml.sax.saxutils import escape, unescape
5from six.moves import urllib_parse as urlparse
6
7from .tokenizer import HTMLTokenizer
8from .constants import tokenTypes
9
10
# Verbose-mode pattern applied to the path component of a ``data:`` URI.
# It captures the media type in the ``content_type`` group, tolerates an
# optional ``;charset=...`` and/or ``;base64`` marker in either order, and
# requires a comma that introduces the payload.
_CONTENT_TYPE_PATTERN = r'''
                               ^
                               # Match a content type <application>/<type>
                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                               # Match any character set and encoding
                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                               # Assume the rest is data
                               ,.*
                               $
                               '''

content_type_rgx = re.compile(_CONTENT_TYPE_PATTERN, re.VERBOSE)
23
24
class HTMLSanitizerMixin(object):
    """Sanitization of XHTML+MathML+SVG and of inline style attributes.

    The class-level ``acceptable_*`` / ``mathml_*`` / ``svg_*`` lists are the
    built-in whitelists.  The ``allowed_*`` attributes derived from them are
    the values the methods actually consult, so subclasses (or instances) can
    tighten or extend the policy by overriding the ``allowed_*`` names.
    """

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
                           'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']

    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
                       'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
                       'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
                       'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
                       'munderover', 'none']

    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
                    'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
                    'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
                    'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
                    'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
                    'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
                             'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
                             'background', 'balance', 'bgcolor', 'bgproperties', 'border',
                             'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
                             'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
                             'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
                             'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
                             'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
                             'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
                             'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
                             'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
                             'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
                             'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
                             'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
                             'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
                             'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
                             'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
                             'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
                             'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
                             'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
                             'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
                             'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
                             'width', 'wrap', 'xml:lang']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
                         'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
                         'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
                         'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
                         'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
                         'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
                         'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
                         'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
                         'xlink:type', 'xmlns', 'xmlns:xlink']

    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
                      'arabic-form', 'ascent', 'attributeName', 'attributeType',
                      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
                      'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
                      'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
                      'fill-opacity', 'fill-rule', 'font-family', 'font-size',
                      'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
                      'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
                      'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
                      'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
                      'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
                      'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
                      'opacity', 'orient', 'origin', 'overline-position',
                      'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
                      'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
                      'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
                      'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
                      'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
                      'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
                      'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
                      'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
                      'transform', 'type', 'u1', 'u2', 'underline-position',
                      'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
                      'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
                      'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
                      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
                      'y1', 'y2', 'zoomAndPan']

    # Attributes whose value is parsed as a URI and checked against
    # allowed_protocols (and allowed_content_types for data: URIs).
    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
                       'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']

    # Attributes in which url(...) references that do not start with '#'
    # are blanked out.
    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
                               'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
                               'mask', 'stroke']

    # Elements on which xlink:href is dropped unless it starts with '#'.
    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
                            'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
                            'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
                            'set', 'use']

    acceptable_css_properties = ['azimuth', 'background-color',
                                 'border-bottom-color', 'border-collapse', 'border-color',
                                 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
                                 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
                                 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
                                 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
                                 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
                                 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
                                 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
                                 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
                                 'white-space', 'width']

    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
                               'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
                               'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
                               'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
                               'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
                               'transparent', 'underline', 'white', 'yellow']

    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
                                 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
                                 'stroke-opacity']

    acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                            'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                            'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
                            'ssh', 'sftp', 'rtsp', 'afs', 'data']

    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    allowed_css_properties = acceptable_css_properties
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols
    allowed_content_types = acceptable_content_types

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Sanitize a single token and return the result.

        Start, end and empty tag tokens are passed to ``allowed_token`` when
        their name is in ``allowed_elements`` and to ``disallowed_token``
        otherwise.  Comment tokens are dropped (``None`` is returned).  Any
        other token is returned unchanged.

        ``token["type"]`` may be either a key of ``tokenTypes`` or one of its
        values; both forms are accepted.
        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in tokenTypes:
            token_type = tokenTypes[token_type]

        tag_types = (tokenTypes["StartTag"], tokenTypes["EndTag"],
                     tokenTypes["EmptyTag"])
        if token_type in tag_types:
            if token["name"] in self.allowed_elements:
                return self.allowed_token(token, token_type)
            return self.disallowed_token(token, token_type)
        if token_type == tokenTypes["Comment"]:
            # Comments are intentionally removed from the output.
            return None
        return token

    def allowed_token(self, token, token_type):
        """Filter the attributes of a tag whose element name is allowed.

        If the token has a ``"data"`` entry (a sequence of name/value pairs)
        it is replaced by a list of ``[name, value]`` pairs containing only
        attributes from ``allowed_attributes``, with unsafe URI values,
        non-local SVG references and unsafe CSS removed.  The token is
        modified in place and returned.  ``token_type`` is not used.
        """
        if "data" in token:
            # Walk the pairs in reverse so that, when an attribute name is
            # repeated, the first occurrence is the one kept in the dict.
            attrs = dict((name, val) for name, val in token["data"][::-1]
                         if name in self.allowed_attributes)
            for attr in self.attr_val_is_uri:
                if attr not in attrs:
                    continue
                # Strip backticks, control characters and whitespace so the
                # scheme cannot be disguised.
                val_unescaped = re.sub(r"[`\x00-\x20\x7f-\xa0\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                uri = urlparse.urlparse(val_unescaped)
                if not uri.scheme:
                    continue
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                    # Already removed: skip the data: check, which would
                    # otherwise try to delete the attribute a second time.
                    continue
                if uri.scheme == 'data':
                    m = content_type_rgx.match(uri.path)
                    if (not m or m.group('content_type') not in
                            self.allowed_content_types):
                        del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    # Blank out url(...) references that are not local
                    # ('#...') fragment references.
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    'xlink:href' in attrs and
                    re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                del attrs['xlink:href']
            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
            token["data"] = [[name, val] for name, val in attrs.items()]
        return token

    def disallowed_token(self, token, token_type):
        """Turn a tag whose element name is not allowed into a text token.

        ``token["data"]`` is replaced by a textual rendering of the tag
        (attribute values passed through ``escape``), ``token["type"]`` is set
        to the Characters type in the same representation (name or value) the
        token used before, and ``token["name"]`` is removed.  The token is
        modified in place and returned.
        """
        if token_type == tokenTypes["EndTag"]:
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
            token["data"] = "<%s%s>" % (token["name"], attrs)
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            # Replace the trailing '>' with '/>'.
            token["data"] = token["data"][:-1] + "/>"

        if token["type"] in tokenTypes:
            token["type"] = "Characters"
        else:
            token["type"] = tokenTypes["Characters"]

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to whitelisted CSS declarations.

        ``url(...)`` values are removed first.  If the remaining text contains
        anything outside a conservative character/shape whitelist, or is not a
        sequence of ``property: value`` declarations, an empty string is
        returned.  Otherwise a declaration is kept when its property is in
        ``allowed_css_properties`` or ``allowed_svg_properties``, or when it is
        a ``background``/``border``/``margin``/``padding`` property whose value
        consists only of ``allowed_css_keywords``, hex/rgb colours or short
        lengths.  Kept declarations are joined with single spaces.
        """
        # disallow urls
        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        shorthand_prefixes = ['background', 'border', 'margin', 'padding']
        value_rgx = r"^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$"

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in shorthand_prefixes:
                for keyword in value.split():
                    # Use the overridable allowed_* list, consistent with the
                    # property checks, so subclass policies are honoured.
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(value_rgx, keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
282
283
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer whose tokens are passed through ``sanitize_token``."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False, parser=None):
        """Forward every argument to ``HTMLTokenizer.__init__``."""
        # Change case matching defaults as we only output lowercase html anyway
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(
            self, stream, encoding, parseMeta, useChardet,
            lowercaseElementName, lowercaseAttrName, parser=parser)

    def __iter__(self):
        """Yield each sanitized token, skipping those sanitized to a falsy value."""
        for raw_token in HTMLTokenizer.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned
297