1from __future__ import absolute_import, division, unicode_literals 2 3import re 4from xml.sax.saxutils import escape, unescape 5from six.moves import urllib_parse as urlparse 6 7from .tokenizer import HTMLTokenizer 8from .constants import tokenTypes 9 10 11content_type_rgx = re.compile(r''' 12 ^ 13 # Match a content type <application>/<type> 14 (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+) 15 # Match any character set and encoding 16 (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?) 17 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?) 18 # Assume the rest is data 19 ,.* 20 $ 21 ''', 22 re.VERBOSE) 23 24 25class HTMLSanitizerMixin(object): 26 """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" 27 28 acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 29 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 30 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 31 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 32 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 33 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1', 34 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 35 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 36 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 37 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 38 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 39 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 40 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'] 41 42 mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi', 43 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', 44 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 45 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 46 'munderover', 'none'] 47 48 svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', 49 'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse', 50 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 51 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 52 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 53 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] 54 55 acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', 56 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', 57 'background', 'balance', 'bgcolor', 'bgproperties', 'border', 58 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', 59 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', 60 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 61 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 62 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 63 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 64 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers', 65 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace', 66 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing', 67 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend', 68 'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method', 69 'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open', 70 'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload', 71 'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min', 72 'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan', 73 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start', 74 'step', 'style', 'summary', 'suppress', 'tabindex', 'target', 75 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap', 76 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml', 77 'width', 'wrap', 'xml:lang'] 78 79 mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', 80 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth', 81 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence', 82 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace', 83 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize', 84 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines', 85 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', 86 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show', 87 'xlink:type', 'xmlns', 'xmlns:xlink'] 88 89 svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', 90 'arabic-form', 'ascent', 'attributeName', 'attributeType', 91 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', 92 'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx', 93 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill', 94 'fill-opacity', 'fill-rule', 'font-family', 'font-size', 95 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from', 96 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging', 97 'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k', 98 'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end', 99 'marker-mid', 'marker-start', 'markerHeight', 'markerUnits', 100 'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset', 101 'opacity', 'orient', 'origin', 'overline-position', 102 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points', 103 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 104 'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart', 105 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color', 106 'stop-opacity', 'strikethrough-position', 'strikethrough-thickness', 107 'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap', 108 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', 109 'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to', 110 'transform', 'type', 'u1', 'u2', 'underline-position', 111 'underline-thickness', 'unicode', 'unicode-range', 'units-per-em', 112 'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x', 113 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', 114 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', 115 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 116 'y1', 'y2', 'zoomAndPan'] 117 118 attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc', 119 'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base'] 120 121 svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', 122 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 123 'mask', 'stroke'] 124 125 svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 126 'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter', 127 'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref', 128 'set', 'use'] 129 130 acceptable_css_properties = ['azimuth', 'background-color', 131 'border-bottom-color', 'border-collapse', 'border-color', 132 'border-left-color', 'border-right-color', 'border-top-color', 'clear', 133 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', 134 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', 135 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', 136 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', 137 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', 138 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', 139 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', 140 'white-space', 'width'] 141 142 acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', 143 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', 144 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', 145 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', 146 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', 147 'transparent', 'underline', 'white', 'yellow'] 148 149 acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule', 150 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', 151 'stroke-opacity'] 152 153 acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc', 154 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', 155 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', 156 'ssh', 'sftp', 'rtsp', 'afs', 'data'] 157 158 acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain'] 159 160 # subclasses may define their own versions of these constants 161 allowed_elements = acceptable_elements + mathml_elements + svg_elements 162 allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes 163 allowed_css_properties = acceptable_css_properties 164 allowed_css_keywords = acceptable_css_keywords 165 allowed_svg_properties = acceptable_svg_properties 166 allowed_protocols = acceptable_protocols 167 allowed_content_types = acceptable_content_types 168 169 # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and 170 # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style 171 # attributes are parsed, and a restricted set, # specified by 172 # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through. 173 # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified 174 # in ALLOWED_PROTOCOLS are allowed. 175 # 176 # sanitize_html('<script> do_nasty_stuff() </script>') 177 # => <script> do_nasty_stuff() </script> 178 # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') 179 # => <a>Click here for $100</a> 180 def sanitize_token(self, token): 181 182 # accommodate filters which use token_type differently 183 token_type = token["type"] 184 if token_type in list(tokenTypes.keys()): 185 token_type = tokenTypes[token_type] 186 187 if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"], 188 tokenTypes["EmptyTag"]): 189 if token["name"] in self.allowed_elements: 190 return self.allowed_token(token, token_type) 191 else: 192 return self.disallowed_token(token, token_type) 193 elif token_type == tokenTypes["Comment"]: 194 pass 195 else: 196 return token 197 198 def allowed_token(self, token, token_type): 199 if "data" in token: 200 attrs = dict([(name, val) for name, val in 201 token["data"][::-1] 202 if name in self.allowed_attributes]) 203 for attr in self.attr_val_is_uri: 204 if attr not in attrs: 205 continue 206 val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', 207 unescape(attrs[attr])).lower() 208 # remove replacement characters from unescaped characters 209 val_unescaped = val_unescaped.replace("\ufffd", "") 210 uri = urlparse.urlparse(val_unescaped) 211 if uri and uri.scheme: 212 if uri.scheme not in self.allowed_protocols: 213 del attrs[attr] 214 if uri.scheme == 'data': 215 m = content_type_rgx.match(uri.path) 216 if not m: 217 del attrs[attr] 218 elif m.group('content_type') not in self.allowed_content_types: 219 del attrs[attr] 220 221 for attr in self.svg_attr_val_allows_ref: 222 if attr in attrs: 223 attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', 224 ' ', 225 unescape(attrs[attr])) 226 if (token["name"] in self.svg_allow_local_href and 227 'xlink:href' in attrs and re.search('^\s*[^#\s].*', 228 attrs['xlink:href'])): 229 del attrs['xlink:href'] 230 if 'style' in attrs: 231 attrs['style'] = self.sanitize_css(attrs['style']) 232 token["data"] = [[name, val] for name, val in list(attrs.items())] 233 return token 234 235 def disallowed_token(self, token, token_type): 236 if token_type == tokenTypes["EndTag"]: 237 token["data"] = "</%s>" % token["name"] 238 elif token["data"]: 239 attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]]) 240 token["data"] = "<%s%s>" % (token["name"], attrs) 241 else: 242 token["data"] = "<%s>" % token["name"] 243 if token.get("selfClosing"): 244 token["data"] = token["data"][:-1] + "/>" 245 246 if token["type"] in list(tokenTypes.keys()): 247 token["type"] = "Characters" 248 else: 249 token["type"] = tokenTypes["Characters"] 250 251 del token["name"] 252 return token 253 254 def sanitize_css(self, style): 255 # disallow urls 256 style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) 257 258 # gauntlet 259 if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): 260 return '' 261 if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): 262 return '' 263 264 clean = [] 265 for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style): 266 if not value: 267 continue 268 if prop.lower() in self.allowed_css_properties: 269 clean.append(prop + ': ' + value + ';') 270 elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 271 'padding']: 272 for keyword in value.split(): 273 if keyword not in self.acceptable_css_keywords and \ 274 not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): 275 break 276 else: 277 clean.append(prop + ': ' + value + ';') 278 elif prop.lower() in self.allowed_svg_properties: 279 clean.append(prop + ': ' + value + ';') 280 281 return ' '.join(clean) 282 283 284class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin): 285 def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, 286 lowercaseElementName=False, lowercaseAttrName=False, parser=None): 287 # Change case matching defaults as we only output lowercase html anyway 288 # This solution doesn't seem ideal... 289 HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, 290 lowercaseElementName, lowercaseAttrName, parser=parser) 291 292 def __iter__(self): 293 for token in HTMLTokenizer.__iter__(self): 294 token = self.sanitize_token(token) 295 if token: 296 yield token 297