HtmlSanitizer.java revision 4d17cd9ce55e109898d50a4e54f01838f3cb93dc
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
288403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel
295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.compackage org.owasp.html;
305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.LinkedList;
325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.List;
33ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuelimport javax.annotation.Nullable;
345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.annotations.VisibleForTesting;
365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.collect.Lists;
375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com/**
395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Consumes an HTML stream, and dispatches events to a policy object which
405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * decides which elements and attributes to allow.
415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */
425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.compublic final class HtmlSanitizer {
435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Receives events based on the HTML stream, and applies a policy to decide
465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * what HTML constructs to allow.
475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Typically, implementations use an {@link HtmlStreamRenderer} to produce
485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * the sanitized output.
495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>
51ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel   * <b>Implementations of this class are in the TCB.</b></p>
525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  @TCB
5404fec67bccd1004fba68e662ba9709747aa65d30mikesamuel  public interface Policy extends HtmlStreamEventReceiver {
555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    /**
565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *
585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param elementName a normalized (lower-case for non-namespaced names)
595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     element name.
605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param attrs a list of alternating attribute name and value pairs.
615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     For efficiency, this list may be mutated by this during this method
625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     call, but ownership reverts to the caller on method exit.
635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     The values are raw -- HTML entities have been decoded.
645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     Specifically, implementations are allowed to use a list iterator
655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     and remove all disallowed attributes, add necessary attributes, and
665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     then pass the list to an {@link HtmlStreamRenderer}.
675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     */
685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    void openTag(String elementName, List<String> attrs);
695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    /**
715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * Called when an HTML tag like {@code </foo>} is seen in the input.
725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *
735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param elementName a normalized (lower-case for non-namespaced names)
745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     *     element name.
755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     */
765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    void closeTag(String elementName);
775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    /**
795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * Called when textual content is seen.
805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     * @param textChunk raw content -- HTML entities have been decoded.
815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com     */
825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    void text(String textChunk);
835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Sanitizes the given HTML by applying the given policy to it.
875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>
895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * This method is not in the TCB.
905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>
925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * This method has no return value since policies are assumed to render things
935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * they accept and do nothing on things they reject.
945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Use {@link HtmlStreamRenderer} to render content to an output buffer.
955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
96ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel   * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
97ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel   *     empty string and will not result in a {@code NullPointerException}.
98ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel   * @param policy The Policy that will receive events based on the tokens in
99ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel   *     html.  Typically, this policy ends up routing the events to an
100ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel   *     {@link HtmlStreamRenderer} after filtering.
101ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel   *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
1025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
103ee7fe14ffd97ab25e70f4403c56e5637f4239b9dmikesamuel  public static void sanitize(@Nullable String html, final Policy policy) {
10475d905c90100b9b05602b1878f847142e39836aamikesamuel    if (html == null) { html = ""; }
10575d905c90100b9b05602b1878f847142e39836aamikesamuel
1063f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    TagBalancingHtmlStreamEventReceiver balancer
1073f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel        = new TagBalancingHtmlStreamEventReceiver(policy);
1083f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel
1093f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // According to Opera the maximum table nesting depth seen in the wild is
1103f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // 795, but 99.99% of documents have a table nesting depth of less than 22.
1113f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
1123f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // document depth of 90 (incl. HTML & BODY).
1133f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // Obviously table nesting depth is not the same as whole document depth,
1143f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // but it is the best proxy I have available.
1153f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
1163f54e49f2181c52ca40d99fbe738b2484ba91528mikesamuel    // the original data.
1174d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel
1184d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // Webkit defines the maximum HTML parser tree depth as 512.
1194d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
1204d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
1214d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel
1224d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // The first number gives us a lower bound on the nesting depth we allow,
1234d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // 90, and the second gives us an upper bound: 512.
1244d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // We do not want to bump right up against that limit.
1254d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // 256 is substantially larger than the lower bound and well clear of the
1264d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    // upper bound.
1274d17cd9ce55e109898d50a4e54f01838f3cb93dcmikesamuel    balancer.setNestingLimit(256);
1285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    balancer.openDocument();
1305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
13175d905c90100b9b05602b1878f847142e39836aamikesamuel    HtmlLexer lexer = new HtmlLexer(html);
1325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // Use a linked list so that policies can use Iterator.remove() in an O(1)
1335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // way.
1345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    LinkedList<String> attrs = Lists.newLinkedList();
1355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    while (lexer.hasNext()) {
1365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      HtmlToken token = lexer.next();
1375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      switch (token.type) {
1385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case TEXT:
1395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          balancer.text(decodeHtml(html.substring(token.start, token.end)));
1405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case UNESCAPED:
1425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          balancer.text(html.substring(token.start, token.end));
1435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case TAGBEGIN:
1455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          if (html.charAt(token.start + 1) == '/') {  // A close tag.
1465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            balancer.closeTag(HtmlLexer.canonicalName(
1475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                html.substring(token.start + 2, token.end)));
1485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            while (lexer.hasNext()
1495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                   && lexer.next().type != HtmlTokenType.TAGEND) {
1505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              // skip tokens until we see a ">"
1515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          } else {
1535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            attrs.clear();
1545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            boolean attrsReadyForName = true;
1565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            tagBody:
1575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            while (lexer.hasNext()) {
1585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              HtmlToken tagBodyToken = lexer.next();
1595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              switch (tagBodyToken.type) {
1605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case ATTRNAME:
1615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if (!attrsReadyForName) {
1625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // Last attribute added was valueless.
1635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    attrs.add(attrs.getLast());
1645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
1655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    attrsReadyForName = false;
1665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
1675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  attrs.add(HtmlLexer.canonicalName(
1685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      html.substring(tagBodyToken.start, tagBodyToken.end)));
1695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
1705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case ATTRVALUE:
1715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  attrs.add(decodeHtml(stripQuotes(
1725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      html.substring(tagBodyToken.start, tagBodyToken.end))));
1735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  attrsReadyForName = true;
1745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
1755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case TAGEND:
1765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break tagBody;
1775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                default:
1785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  // Just drop anything not recognized
1795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
1805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (!attrsReadyForName) {
1825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              attrs.add(attrs.getLast());
1835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            balancer.openTag(
1855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                HtmlLexer.canonicalName(
1865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    html.substring(token.start + 1, token.end)),
1875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                attrs);
1885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          }
1895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        default:
1915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // Ignore comments, directives, and other stuff that shouldn't show
1925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // up in the output.
1935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
1955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
1965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    balancer.closeDocument();
1985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
1995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2004e867904c8295537803c1c8a076e130df5674b58mikesamuel  private static String stripQuotes(String encodedAttributeValue) {
2015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int n = encodedAttributeValue.length();
2025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (n > 0) {
2035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      char last = encodedAttributeValue.charAt(n - 1);
2045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      if (last == '"' || last == '\'') {
2055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        int start = 0;
2065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (n != 1 && last == encodedAttributeValue.charAt(0)) {
2075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          start = 1;
2085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        } else {
2095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // Browsers deal with missing left quotes : <img src=foo.png">
2105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // but generally do not deal with missing right : <img src="foo.png>
2115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
2125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        return encodedAttributeValue.substring(start, n - 1);
2135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
2145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
2155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return encodedAttributeValue;
2165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  @VisibleForTesting
2195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  static String decodeHtml(String s) {
2205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int amp = s.indexOf('&');
2215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (amp < 0) { return s; }
2225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int pos = 0;
2235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int n = s.length();
2245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    StringBuilder sb = new StringBuilder(n);
2255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int end;
2265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    do {
2275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
2285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      end = (int) (endAndCodepoint >>> 32);
2295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      int codepoint = (int) endAndCodepoint;
2305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      sb.append(s, pos, amp).appendCodePoint(codepoint);
2315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      pos = end;
2325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    } while ((amp = s.indexOf('&', end)) >= 0);
2335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return sb.append(s, pos, n).toString();
2345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com}
237