10df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Copyright (c) 2012, Mike Samuel
20df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// All rights reserved.
30df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel//
40df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Redistribution and use in source and binary forms, with or without
50df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// modification, are permitted provided that the following conditions
60df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// are met:
70df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel//
80df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Redistributions of source code must retain the above copyright
90df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// notice, this list of conditions and the following disclaimer.
100df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Redistributions in binary form must reproduce the above copyright
110df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// notice, this list of conditions and the following disclaimer in the
120df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// documentation and/or other materials provided with the distribution.
130df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Neither the name of the OWASP nor the names of its contributors may
140df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// be used to endorse or promote products derived from this software
150df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// without specific prior written permission.
160df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
170df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
180df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
190df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
200df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
210df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
220df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
230df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
240df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
250df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
260df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
270df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// POSSIBILITY OF SUCH DAMAGE.
280df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
290df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelpackage org.owasp.html;
300df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
310df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelimport java.io.IOException;
320df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
330df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelimport com.google.common.annotations.VisibleForTesting;
340df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
350df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel/** Encoders and decoders for HTML. */
360df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelfinal class Encoding {
370df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
380df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  /**
390df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * Decodes HTML entities to produce a string containing only valid
400df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * Unicode scalar values.
410df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   */
420df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  @VisibleForTesting
430df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static String decodeHtml(String s) {
440df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    int firstAmp = s.indexOf('&');
450df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    int safeLimit = longestPrefixOfGoodCodeunits(s);
460df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    if ((firstAmp & safeLimit) < 0) { return s; }
470df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
480df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    StringBuilder sb;
490df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    {
500df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      int n = s.length();
510df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      sb = new StringBuilder(n);
520df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      int pos = 0;
530df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      int amp = firstAmp;
540df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      while (amp >= 0) {
550df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
560df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        int end = (int) (endAndCodepoint >>> 32);
570df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        int codepoint = (int) endAndCodepoint;
580df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        sb.append(s, pos, amp).appendCodePoint(codepoint);
590df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        pos = end;
600df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        amp = s.indexOf('&', end);
610df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      }
620df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      sb.append(s, pos, n);
630df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    }
640df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
650df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    stripBannedCodeunits(
660df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        sb,
670df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        firstAmp < 0
680df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          ? safeLimit : safeLimit < 0
690df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          ? firstAmp : Math.min(firstAmp, safeLimit));
700df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
710df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    return sb.toString();
720df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
730df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
740df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  /**
750df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * Returns the portion of its input that consists of XML safe chars.
760df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
770df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   */
780df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  @TCB
790df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static String stripBannedCodeunits(String s) {
800df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    int safeLimit = longestPrefixOfGoodCodeunits(s);
810df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    if (safeLimit < 0) { return s; }
820df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
830df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    StringBuilder sb = new StringBuilder(s);
840df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    stripBannedCodeunits(sb, safeLimit);
850df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    return sb.toString();
860df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
870df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
880df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  /**
890df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * Leaves in the input buffer only code-units that comprise XML safe chars.
900df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
910df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   */
920df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  @TCB
930df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static void stripBannedCodeunits(StringBuilder sb) {
940df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    stripBannedCodeunits(sb, 0);
950df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
960df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
970df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  @TCB
980df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  private static void stripBannedCodeunits(StringBuilder sb, int start) {
990df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    int k = start;
1000df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    for (int i = start, n = sb.length(); i < n; ++i) {
1010df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      char ch = sb.charAt(i);
1020df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      if (ch < 0x20) {
1030df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        if (IS_BANNED_ASCII[ch]) {
1040df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          continue;
1050df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        }
1060df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      } else if (0xd800 <= ch) {
1070df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        if (ch <= 0xdfff) {
1080df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          if (i+1 < n) {
1090df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            char next = sb.charAt(i+1);
1100df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            if (Character.isSurrogatePair(ch, next)) {
1110df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel              sb.setCharAt(k++, ch);
1120df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel              sb.setCharAt(k++, next);
1130df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel              ++i;
1140df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            }
1150df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          }
1160df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          continue;
1170df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        } else if ((ch & 0xfffe) == 0xfffe) {
1180df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          continue;
1190df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        }
1200df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      }
1210df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      sb.setCharAt(k++, ch);
1220df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    }
1230df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    sb.setLength(k);
1240df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
1250df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
1260df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  /**
1270df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * The number of code-units at the front of s that form code-points in the
1280df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * XML Character production.
1290df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * @return -1 if all of s is in the XML Character production.
1300df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   */
1310df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  @TCB
1320df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  private static int longestPrefixOfGoodCodeunits(String s) {
1330df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    int n = s.length(), i;
1340df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    for (i = 0; i < n; ++i) {
1350df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      char ch = s.charAt(i);
1360df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      if (ch < 0x20) {
1370df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        if (IS_BANNED_ASCII[ch]) {
1380df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          return i;
1390df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        }
1400df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      } else if (0xd800 <= ch) {
1410df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        if (ch <= 0xdfff) {
1420df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
1430df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            ++i;  // Skip over low surrogate since we know it's ok.
1440df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          } else {
1450df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            return i;
1460df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          }
1470df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        } else if ((ch & 0xfffe) == 0xfffe) {
1480df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          return i;
1490df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        }
1500df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      }
1510df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    }
1520df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    return -1;
1530df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
1540df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
1550df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  /**
1560df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * Writes the HTML equivalent of the given plain text to output.
1570df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * For example, {@code escapeHtmlOnto("1 < 2", w)},
1580df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * is equivalent to {@code w.append("1 &lt; 2")} but possibly with fewer
1590df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * smaller appends.
1600df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * Elides code-units that are not valid XML Characters.
1610df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
1620df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   */
1630df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  @TCB
1640df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static void encodeHtmlOnto(String plainText, Appendable output)
1650df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      throws IOException {
1660df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    int n = plainText.length();
1670df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    int pos = 0;
1680df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    for (int i = 0; i < n; ++i) {
1690df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      char ch = plainText.charAt(i);
1700df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      if (ch < REPLACEMENTS.length) {
1710df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        String repl = REPLACEMENTS[ch];
1720df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        if (repl != null) {
1730df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          output.append(plainText, pos, i).append(repl);
1740df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          pos = i + 1;
1750df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        }
1760df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      } else if (((char) 0xd800) <= ch) {
1770df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        if (ch <= ((char) 0xdfff)) {
1780df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          char next;
1790df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          if (i + 1 < n
180ce5bde40e2e126de05105f09f1f965a5c70aaa94mikesamuel              && Character.isSurrogatePair(
181ce5bde40e2e126de05105f09f1f965a5c70aaa94mikesamuel                  ch, next = plainText.charAt(i + 1))) {
1820df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            // Emit supplemental codepoints as entity so that they cannot
1830df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
1840df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            // and get involved in UTF-16/UCS-2 confusion.
1850df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            int codepoint = Character.toCodePoint(ch, next);
1860df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            output.append(plainText, pos, i);
1870df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            appendNumericEntity(codepoint, output);
1880df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            ++i;
1890df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            pos = i + 1;
1900df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          } else {
1910df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            output.append(plainText, pos, i);
1920df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            // Elide the orphaned surrogate.
1930df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            pos = i + 1;
1940df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          }
1950df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        } else if (0xff00 <= ch) {
1960df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          output.append(plainText, pos, i);
1970df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          pos = i + 1;
1980df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          // Is a control character or possible full-width version of a
1990df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          // special character.
2000df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          if ((ch & 0xfffe) == 0xfffe) {
2010df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            // Elide since not an the XML Character.
2020df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          } else {
2030df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel            appendNumericEntity(ch, output);
2040df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel          }
2050df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        }
2060df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      }
2070df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    }
2080df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    output.append(plainText, pos, n);
2090df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
2100df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
2110df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  @TCB
2120df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static void appendNumericEntity(int codepoint, Appendable output)
2130df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      throws IOException {
2140df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    if (codepoint < 100) {
2150df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      // TODO: is this dead code due to REPLACEMENTS above.
2160df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      output.append("&#");
2170df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      if (codepoint < 10) {
2180df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        output.append((char) ('0' + codepoint));
2190df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      } else {
2200df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        output.append((char) ('0' + (codepoint / 10)));
2210df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        output.append((char) ('0' + (codepoint % 10)));
2220df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      }
2230df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      output.append(";");
2240df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    } else {
2250df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      int nDigits = (codepoint < 0x1000
2260df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel                     ? codepoint < 0x100 ? 2 : 3
2270df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel                     : (codepoint < 0x10000 ? 4
2280df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel                        : codepoint < 0x100000 ? 5 : 6));
2290df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      output.append("&#x");
2300df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      for (int digit = nDigits; --digit >= 0;) {
2310df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        int hexDigit = (codepoint >>> (digit << 2)) & 0xf;
2320df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        output.append(HEX_NUMERAL[hexDigit]);
2330df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      }
2340df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      output.append(";");
2350df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    }
2360df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
2370df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
2380df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  private static final char[] HEX_NUMERAL = {
2390df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   '0', '1', '2', '3', '4', '5', '6', '7',
2400df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
2410df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  };
2420df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
2430df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */
2440df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static final String[] REPLACEMENTS = new String[0x61];
2450df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static {
2460df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    for (int i = 0; i < ' '; ++i) {
2470df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      // We elide control characters so that we can ensure that our output is
2480df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      // in the intersection of valid HTML5 and XML.  According to
2490df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets
2500df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      // Char      ::=          #x9 | #xA | #xD | [#x20-#xD7FF]
2510df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      //             |          [#xE000-#xFFFD] | [#x10000-#x10FFFF]
2520df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      if (i != '\t' && i != '\n' && i != '\r') {
2530df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel        REPLACEMENTS[i] = "";  // Elide
2540df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      }
2550df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    }
2560df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    // "&#34;" is shorter than "&quot;"
2570df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['"']  = "&#" + ((int) '"')  + ";";  // Attribute delimiter.
2580df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['&']  = "&amp;";                    // HTML special.
2590df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    // We don't use &apos; since that is not in the intersection of HTML&XML.
2600df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";";  // Attribute delimiter.
2610df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['+']  = "&#" + ((int) '+')  + ";";  // UTF-7 special.
2620df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['<']  = "&lt;";                     // HTML special.
2630df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['=']  = "&#" + ((int) '=')  + ";";  // Special in attributes.
2640df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['>']  = "&gt;";                     // HTML special.
2650df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['@']  = "&#" + ((int) '@')  + ";";  // Conditional compilation.
2660df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    REPLACEMENTS['`']  = "&#" + ((int) '`')  + ";";  // Attribute delimiter.
2670df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
2680df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
2690df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  /**
2700df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in
2710df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   * an HTML5 text node or properly quoted attribute value.
2720df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel   */
2730df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  private static boolean[] IS_BANNED_ASCII = new boolean[0x20];
2740df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  static {
2750df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    for (int i = 0; i < IS_BANNED_ASCII.length; ++i) {
2760df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel      IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r');
2770df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel    }
2780df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel  }
2790df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel
2800df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel}
281