// HtmlSanitizer.java revision 8403881c365ab36b721ccc4500af1b3a5bd25870
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import java.util.LinkedList;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;

/**
 * Consumes an HTML stream, and dispatches events to a policy object which
 * decides which elements and attributes to allow.
 */
public final class HtmlSanitizer {

  /**
   * Receives events based on the HTML stream, and applies a policy to decide
   * what HTML constructs to allow.
   * Typically, implementations use an {@link HtmlStreamRenderer} to produce
   * the sanitized output.
   *
   * <p>
   * <b>Implementations of this class are in the TCB.</b>
   */
  @TCB
  interface Policy extends HtmlStreamEventReceiver {
    /**
     * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     * @param attrs a list of alternating attribute name and value pairs.
     *     For efficiency, this list may be mutated by this policy during this
     *     method call, but ownership reverts to the caller on method exit.
     *     The values are raw -- HTML entities have been decoded.
     *     Specifically, implementations are allowed to use a list iterator
     *     and remove all disallowed attributes, add necessary attributes, and
     *     then pass the list to an {@link HtmlStreamRenderer}.
     */
    void openTag(String elementName, List<String> attrs);

    /**
     * Called when an HTML tag like {@code </foo>} is seen in the input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     */
    void closeTag(String elementName);

    /**
     * Called when textual content is seen.
     * @param textChunk raw content -- HTML entities have been decoded.
     */
    void text(String textChunk);
  }

  /**
   * Sanitizes the given HTML by applying the given policy to it.
   *
   * <p>
   * This method is not in the TCB.
   *
   * <p>
   * This method has no return value since policies are assumed to render things
   * they accept and do nothing on things they reject.
   * Use {@link HtmlStreamRenderer} to render content to an output buffer.
   *
   * @param html The html to sanitize.  A {@code null} value is treated as the
   *     empty string, so the policy still sees a matched open/close document
   *     pair instead of an exception after the document has been opened.
   * @param policy The policy that should receive events based on the HTML
   *     input.
   */
  public static void sanitize(String html, final Policy policy) {
    // Normalize before any event is dispatched so that a null input cannot
    // fail half way through, after openDocument() but before closeDocument().
    if (html == null) {
      html = "";
    }

    HtmlStreamEventReceiver balancer = new TagBalancingHtmlStreamEventReceiver(
        policy);

    balancer.openDocument();

    HtmlLexer lexer = new HtmlLexer(html);
    // Use a linked list so that policies can use Iterator.remove() in an O(1)
    // way.
    LinkedList<String> attrs = Lists.newLinkedList();
    while (lexer.hasNext()) {
      HtmlToken token = lexer.next();
      switch (token.type) {
        case TEXT:
          balancer.text(decodeHtml(html.substring(token.start, token.end)));
          break;
        case UNESCAPED:
          // Passed through verbatim: no entity decoding is applied here.
          balancer.text(html.substring(token.start, token.end));
          break;
        case TAGBEGIN:
          if (html.charAt(token.start + 1) == '/') {  // A close tag.
            // Skip the leading "</" to get at the element name.
            balancer.closeTag(HtmlLexer.canonicalName(
                html.substring(token.start + 2, token.end)));
            while (lexer.hasNext()
                   && lexer.next().type != HtmlTokenType.TAGEND) {
              // skip tokens until we see a ">"
            }
          } else {
            // The list is reused across tags, so reset it for this one.
            attrs.clear();

            // True when the list holds complete name/value pairs, false when
            // the last element is a name still waiting for its value.
            boolean attrsReadyForName = true;
            tagBody:
            while (lexer.hasNext()) {
              HtmlToken tagBodyToken = lexer.next();
              switch (tagBodyToken.type) {
                case ATTRNAME:
                  if (!attrsReadyForName) {
                    // Last attribute added was valueless.
                    // Use its own name as its value to keep pairs aligned.
                    attrs.add(attrs.getLast());
                  } else {
                    attrsReadyForName = false;
                  }
                  attrs.add(HtmlLexer.canonicalName(
                      html.substring(tagBodyToken.start, tagBodyToken.end)));
                  break;
                case ATTRVALUE:
                  attrs.add(decodeHtml(stripQuotes(
                      html.substring(tagBodyToken.start, tagBodyToken.end))));
                  attrsReadyForName = true;
                  break;
                case TAGEND:
                  break tagBody;
                default:
                  // Just drop anything not recognized
              }
            }
            if (!attrsReadyForName) {
              // The tag ended on a valueless attribute; complete the pair.
              attrs.add(attrs.getLast());
            }
            // Skip the leading "<" to get at the element name.
            balancer.openTag(
                HtmlLexer.canonicalName(
                    html.substring(token.start + 1, token.end)),
                attrs);
          }
          break;
        default:
          // Ignore comments, directives, and other stuff that shouldn't show
          // up in the output.
          break;
      }
    }

    balancer.closeDocument();
  }

  /**
   * Removes the quotes around an attribute value as it appeared in the markup.
   *
   * <p>
   * A trailing quote is always removed.  A leading quote is removed only when
   * the value is longer than one character and starts with the same quote
   * character that it ends with.  A value that does not end in a quote is
   * returned unchanged.
   *
   * @param encodedAttributeValue the attribute value text, entities not yet
   *     decoded.
   * @return the value without its delimiting quotes.
   */
  private static String stripQuotes(String encodedAttributeValue) {
    int n = encodedAttributeValue.length();
    if (n > 0) {
      char last = encodedAttributeValue.charAt(n - 1);
      if (last == '"' || last == '\'') {
        // Browsers deal with missing left quotes : <img src=foo.png">
        // but generally do not deal with missing right : <img src="foo.png>
        // so only a trailing quote triggers stripping, and the leading
        // character is dropped only when it is the matching quote.
        int start = 0;
        if (n != 1 && last == encodedAttributeValue.charAt(0)) {
          start = 1;
        }
        return encodedAttributeValue.substring(start, n - 1);
      }
    }
    return encodedAttributeValue;
  }

  /**
   * Replaces the result of {@link HtmlEntities#decodeEntityAt} for each
   * {@code '&'} found in {@code s}, copying all other text unchanged.
   *
   * @param s text that may contain {@code '&'} characters.
   * @return {@code s} itself when it contains no {@code '&'}, otherwise a new
   *     string with each ampersand sequence replaced by the decoded code point.
   */
  @VisibleForTesting
  static String decodeHtml(String s) {
    int amp = s.indexOf('&');
    if (amp < 0) { return s; }
    int pos = 0;
    int n = s.length();
    StringBuilder sb = new StringBuilder(n);
    int end;
    do {
      // The helper packs two ints into one long: the index just past the
      // consumed text in the high 32 bits, the code point in the low 32 bits.
      long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
      end = (int) (endAndCodepoint >>> 32);
      int codepoint = (int) endAndCodepoint;
      // Copy the literal text before the ampersand, then the decoded value.
      sb.append(s, pos, amp).appendCodePoint(codepoint);
      pos = end;
    } while ((amp = s.indexOf('&', end)) >= 0);
    return sb.append(s, pos, n).toString();
  }

}