// HtmlSanitizer.java revision 4d17cd9ce55e109898d50a4e54f01838f3cb93dc
//
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import java.util.LinkedList;
import java.util.List;
import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;

405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * decides which elements and attributes to allow. 415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.compublic final class HtmlSanitizer { 435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Receives events based on the HTML stream, and applies a policy to decide 465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * what HTML constructs to allow. 475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Typically, implementations use an {@link HtmlStreamRenderer} to produce 485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * the sanitized output. 495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * <p> 51ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel * <b>Implementations of this class are in the TCB.</b></p> 525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com @TCB 5404fec67bccd1004fba68e662ba9709747aa65d30mikesamuel public interface Policy extends HtmlStreamEventReceiver { 555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input. 575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * @param elementName a normalized (lower-case for non-namespaced names) 595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * element name. 
605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * @param attrs a list of alternating attribute name and value pairs. 615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * For efficiency, this list may be mutated by this during this method 625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * call, but ownership reverts to the caller on method exit. 635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * The values are raw -- HTML entities have been decoded. 645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Specifically, implementations are allowed to use a list iterator 655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * and remove all disallowed attributes, add necessary attributes, and 665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * then pass the list to an {@link HtmlStreamRenderer}. 675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com void openTag(String elementName, List<String> attrs); 695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Called when an HTML tag like {@code </foo>} is seen in the input. 725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * @param elementName a normalized (lower-case for non-namespaced names) 745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * element name. 
755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com void closeTag(String elementName); 775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Called when textual content is seen. 805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * @param textChunk raw content -- HTML entities have been decoded. 815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com void text(String textChunk); 835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Sanitizes the given HTML by applying the given policy to it. 875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * <p> 895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * This method is not in the TCB. 905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * <p> 925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * This method has no return value since policies are assumed to render things 935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * they accept and do nothing on things they reject. 945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Use {@link HtmlStreamRenderer} to render content to an output buffer. 955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 96ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel * @param html A snippet of HTML to sanitize. 
{@code null} is treated as the 97ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel * empty string and will not result in a {@code NullPointerException}. 98ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel * @param policy The Policy that will receive events based on the tokens in 99ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel * html. Typically, this policy ends up routing the events to an 100ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel * {@link HtmlStreamRenderer} after filtering. 101ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel * {@link HtmlPolicyBuilder} provides an easy way to create policies. 1025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 103ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel public static void sanitize(@Nullable String html, final Policy policy) { 10475d905c90100b9b05602b1878f847142e39836aamikesamuel if (html == null) { html = ""; } 10575d905c90100b9b05602b1878f847142e39836aamikesamuel 1063f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel TagBalancingHtmlStreamEventReceiver balancer 1073f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel = new TagBalancingHtmlStreamEventReceiver(policy); 1083f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel 1093f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // According to Opera the maximum table nesting depth seen in the wild is 1103f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // 795, but 99.99% of documents have a table nesting depth of less than 22. 1113f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a 1123f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // document depth of 90 (incl. HTML & BODY). 1133f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // Obviously table nesting depth is not the same as whole document depth, 1143f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // but it is the best proxy I have available. 
1153f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for 1163f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel // the original data. 1174d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel 1184d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // Webkit defines the maximum HTML parser tree depth as 512. 1194d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408 1204d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512; 1214d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel 1224d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // The first number gives us a lower bound on the nesting depth we allow, 1234d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // 90, and the second gives us an upper bound: 512. 1244d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // We do not want to bump right up against that limit. 1254d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // 256 is substantially larger than the lower bound and well clear of the 1264d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel // upper bound. 1274d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel balancer.setNestingLimit(256); 1285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com balancer.openDocument(); 1305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 13175d905c90100b9b05602b1878f847142e39836aamikesamuel HtmlLexer lexer = new HtmlLexer(html); 1325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Use a linked list so that policies can use Iterator.remove() in an O(1) 1335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // way. 
1345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com LinkedList<String> attrs = Lists.newLinkedList(); 1355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com while (lexer.hasNext()) { 1365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken token = lexer.next(); 1375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (token.type) { 1385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TEXT: 1395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com balancer.text(decodeHtml(html.substring(token.start, token.end))); 1405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case UNESCAPED: 1425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com balancer.text(html.substring(token.start, token.end)); 1435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGBEGIN: 1455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (html.charAt(token.start + 1) == '/') { // A close tag. 
1465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com balancer.closeTag(HtmlLexer.canonicalName( 1475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com html.substring(token.start + 2, token.end))); 1485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com while (lexer.hasNext() 1495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com && lexer.next().type != HtmlTokenType.TAGEND) { 1505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // skip tokens until we see a ">" 1515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 1535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrs.clear(); 1545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com boolean attrsReadyForName = true; 1565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com tagBody: 1575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com while (lexer.hasNext()) { 1585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken tagBodyToken = lexer.next(); 1595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (tagBodyToken.type) { 1605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case ATTRNAME: 1615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (!attrsReadyForName) { 1625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Last attribute added was valueless. 
1635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrs.add(attrs.getLast()); 1645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 1655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrsReadyForName = false; 1665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrs.add(HtmlLexer.canonicalName( 1685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com html.substring(tagBodyToken.start, tagBodyToken.end))); 1695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case ATTRVALUE: 1715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrs.add(decodeHtml(stripQuotes( 1725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com html.substring(tagBodyToken.start, tagBodyToken.end)))); 1735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrsReadyForName = true; 1745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGEND: 1765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break tagBody; 1775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com default: 1785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Just drop anything not recognized 1795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (!attrsReadyForName) { 1825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrs.add(attrs.getLast()); 1835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com balancer.openTag( 1855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlLexer.canonicalName( 
1865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com html.substring(token.start + 1, token.end)), 1875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com attrs); 1885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com default: 1915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Ignore comments, directives, and other stuff that shouldn't show 1925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // up in the output. 1935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com balancer.closeDocument(); 1985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2004e867904c8295537803c1c8a076e130df5674b58mikesamuel private static String stripQuotes(String encodedAttributeValue) { 2015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int n = encodedAttributeValue.length(); 2025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (n > 0) { 2035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com char last = encodedAttributeValue.charAt(n - 1); 2045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (last == '"' || last == '\'') { 2055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int start = 0; 2065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (n != 1 && last == encodedAttributeValue.charAt(0)) { 2075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com start = 1; 2085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 
2095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Browsers deal with missing left quotes : <img src=foo.png"> 2105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // but generally do not deal with missing right : <img src="foo.png> 2115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return encodedAttributeValue.substring(start, n - 1); 2135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return encodedAttributeValue; 2165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com @VisibleForTesting 2195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com static String decodeHtml(String s) { 2205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int amp = s.indexOf('&'); 2215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (amp < 0) { return s; } 2225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int pos = 0; 2235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int n = s.length(); 2245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com StringBuilder sb = new StringBuilder(n); 2255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int end; 2265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com do { 2275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n); 2285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com end = (int) (endAndCodepoint >>> 32); 2295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int codepoint = (int) endAndCodepoint; 
2305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com sb.append(s, pos, amp).appendCodePoint(codepoint); 2315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com pos = end; 2325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } while ((amp = s.indexOf('&', end)) >= 0); 2335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return sb.append(s, pos, n).toString(); 2345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com} 237