10df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Copyright (c) 2012, Mike Samuel 20df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// All rights reserved. 30df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// 40df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Redistribution and use in source and binary forms, with or without 50df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// modification, are permitted provided that the following conditions 60df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// are met: 70df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// 80df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Redistributions of source code must retain the above copyright 90df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// notice, this list of conditions and the following disclaimer. 100df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Redistributions in binary form must reproduce the above copyright 110df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// notice, this list of conditions and the following disclaimer in the 120df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// documentation and/or other materials provided with the distribution. 130df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// Neither the name of the OWASP nor the names of its contributors may 140df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// be used to endorse or promote products derived from this software 150df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// without specific prior written permission. 160df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 170df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 180df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 190df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 200df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 210df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 220df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 230df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 240df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 250df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 260df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 270df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel// POSSIBILITY OF SUCH DAMAGE. 280df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 290df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelpackage org.owasp.html; 300df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 310df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelimport java.io.IOException; 320df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 330df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelimport com.google.common.annotations.VisibleForTesting; 340df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 350df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel/** Encoders and decoders for HTML. */ 360df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuelfinal class Encoding { 370df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 380df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel /** 390df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * Decodes HTML entities to produce a string containing only valid 400df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * Unicode scalar values. 410df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel */ 420df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel @VisibleForTesting 430df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static String decodeHtml(String s) { 440df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int firstAmp = s.indexOf('&'); 450df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int safeLimit = longestPrefixOfGoodCodeunits(s); 460df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if ((firstAmp & safeLimit) < 0) { return s; } 470df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 480df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel StringBuilder sb; 490df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel { 500df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int n = s.length(); 510df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb = new StringBuilder(n); 520df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int pos = 0; 530df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int amp = firstAmp; 540df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel while (amp >= 0) { 550df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n); 560df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int end = (int) (endAndCodepoint >>> 32); 570df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int codepoint = (int) endAndCodepoint; 580df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb.append(s, pos, amp).appendCodePoint(codepoint); 590df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel pos = end; 600df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel amp = s.indexOf('&', end); 610df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 620df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb.append(s, pos, n); 630df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 640df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 650df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel stripBannedCodeunits( 660df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb, 670df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel firstAmp < 0 680df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel ? safeLimit : safeLimit < 0 690df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel ? firstAmp : Math.min(firstAmp, safeLimit)); 700df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 710df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel return sb.toString(); 720df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 730df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 740df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel /** 750df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * Returns the portion of its input that consists of XML safe chars. 760df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> 770df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel */ 780df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel @TCB 790df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static String stripBannedCodeunits(String s) { 800df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int safeLimit = longestPrefixOfGoodCodeunits(s); 810df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (safeLimit < 0) { return s; } 820df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 830df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel StringBuilder sb = new StringBuilder(s); 840df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel stripBannedCodeunits(sb, safeLimit); 850df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel return sb.toString(); 860df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 870df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 880df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel /** 890df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * Leaves in the input buffer only code-units that comprise XML safe chars. 900df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> 910df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel */ 920df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel @TCB 930df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static void stripBannedCodeunits(StringBuilder sb) { 940df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel stripBannedCodeunits(sb, 0); 950df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 960df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 970df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel @TCB 980df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel private static void stripBannedCodeunits(StringBuilder sb, int start) { 990df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int k = start; 1000df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel for (int i = start, n = sb.length(); i < n; ++i) { 1010df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel char ch = sb.charAt(i); 1020df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (ch < 0x20) { 1030df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (IS_BANNED_ASCII[ch]) { 1040df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel continue; 1050df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1060df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else if (0xd800 <= ch) { 1070df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (ch <= 0xdfff) { 1080df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (i+1 < n) { 1090df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel char next = sb.charAt(i+1); 1100df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (Character.isSurrogatePair(ch, next)) { 1110df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb.setCharAt(k++, ch); 1120df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb.setCharAt(k++, next); 1130df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel ++i; 1140df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1150df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1160df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel continue; 1170df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else if ((ch & 0xfffe) == 0xfffe) { 1180df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel continue; 1190df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1200df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1210df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb.setCharAt(k++, ch); 1220df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1230df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel sb.setLength(k); 1240df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1250df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 1260df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel /** 1270df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * The number of code-units at the front of s that form code-points in the 1280df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * XML Character production. 1290df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * @return -1 if all of s is in the XML Character production. 1300df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel */ 1310df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel @TCB 1320df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel private static int longestPrefixOfGoodCodeunits(String s) { 1330df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int n = s.length(), i; 1340df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel for (i = 0; i < n; ++i) { 1350df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel char ch = s.charAt(i); 1360df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (ch < 0x20) { 1370df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (IS_BANNED_ASCII[ch]) { 1380df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel return i; 1390df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1400df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else if (0xd800 <= ch) { 1410df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (ch <= 0xdfff) { 1420df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) { 1430df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel ++i; // Skip over low surrogate since we know it's ok. 1440df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else { 1450df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel return i; 1460df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1470df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else if ((ch & 0xfffe) == 0xfffe) { 1480df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel return i; 1490df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1500df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1510df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1520df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel return -1; 1530df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1540df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 1550df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel /** 1560df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * Writes the HTML equivalent of the given plain text to output. 1570df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * For example, {@code escapeHtmlOnto("1 < 2", w)}, 1580df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * is equivalent to {@code w.append("1 < 2")} but possibly with fewer 1590df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * smaller appends. 1600df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * Elides code-units that are not valid XML Characters. 1610df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> 1620df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel */ 1630df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel @TCB 1640df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static void encodeHtmlOnto(String plainText, Appendable output) 1650df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel throws IOException { 1660df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int n = plainText.length(); 1670df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int pos = 0; 1680df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel for (int i = 0; i < n; ++i) { 1690df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel char ch = plainText.charAt(i); 1700df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (ch < REPLACEMENTS.length) { 1710df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel String repl = REPLACEMENTS[ch]; 1720df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (repl != null) { 1730df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(plainText, pos, i).append(repl); 1740df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel pos = i + 1; 1750df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1760df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else if (((char) 0xd800) <= ch) { 1770df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (ch <= ((char) 0xdfff)) { 1780df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel char next; 1790df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (i + 1 < n 180ce5bde40e2e126de05105f09f1f965a5c70aaa94mikesamuel && Character.isSurrogatePair( 181ce5bde40e2e126de05105f09f1f965a5c70aaa94mikesamuel ch, next = plainText.charAt(i + 1))) { 1820df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // Emit supplemental codepoints as entity so that they cannot 1830df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper 1840df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // and get involved in UTF-16/UCS-2 confusion. 1850df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int codepoint = Character.toCodePoint(ch, next); 1860df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(plainText, pos, i); 1870df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel appendNumericEntity(codepoint, output); 1880df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel ++i; 1890df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel pos = i + 1; 1900df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else { 1910df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(plainText, pos, i); 1920df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // Elide the orphaned surrogate. 1930df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel pos = i + 1; 1940df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 1950df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else if (0xff00 <= ch) { 1960df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(plainText, pos, i); 1970df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel pos = i + 1; 1980df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // Is a control character or possible full-width version of a 1990df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // special character. 2000df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if ((ch & 0xfffe) == 0xfffe) { 2010df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // Elide since not an the XML Character. 2020df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else { 2030df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel appendNumericEntity(ch, output); 2040df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2050df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2060df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2070df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2080df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(plainText, pos, n); 2090df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2100df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 2110df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel @TCB 2120df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static void appendNumericEntity(int codepoint, Appendable output) 2130df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel throws IOException { 2140df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (codepoint < 100) { 2150df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // TODO: is this dead code due to REPLACEMENTS above. 2160df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append("&#"); 2170df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (codepoint < 10) { 2180df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append((char) ('0' + codepoint)); 2190df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else { 2200df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append((char) ('0' + (codepoint / 10))); 2210df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append((char) ('0' + (codepoint % 10))); 2220df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2230df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(";"); 2240df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } else { 2250df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int nDigits = (codepoint < 0x1000 2260df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel ? codepoint < 0x100 ? 2 : 3 2270df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel : (codepoint < 0x10000 ? 4 2280df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel : codepoint < 0x100000 ? 5 : 6)); 2290df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append("&#x"); 2300df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel for (int digit = nDigits; --digit >= 0;) { 2310df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel int hexDigit = (codepoint >>> (digit << 2)) & 0xf; 2320df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(HEX_NUMERAL[hexDigit]); 2330df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2340df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel output.append(";"); 2350df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2360df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2370df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 2380df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel private static final char[] HEX_NUMERAL = { 2390df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel '0', '1', '2', '3', '4', '5', '6', '7', 2400df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 2410df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel }; 2420df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 2430df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */ 2440df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static final String[] REPLACEMENTS = new String[0x61]; 2450df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static { 2460df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel for (int i = 0; i < ' '; ++i) { 2470df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // We elide control characters so that we can ensure that our output is 2480df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // in the intersection of valid HTML5 and XML. According to 2490df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets 2500df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] 2510df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 2520df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel if (i != '\t' && i != '\n' && i != '\r') { 2530df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS[i] = ""; // Elide 2540df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2550df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2560df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // """ is shorter than """ 2570df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['"'] = "&#" + ((int) '"') + ";"; // Attribute delimiter. 2580df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['&'] = "&"; // HTML special. 2590df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel // We don't use ' since that is not in the intersection of HTML&XML. 2600df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";"; // Attribute delimiter. 2610df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['+'] = "&#" + ((int) '+') + ";"; // UTF-7 special. 2620df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['<'] = "<"; // HTML special. 2630df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['='] = "&#" + ((int) '=') + ";"; // Special in attributes. 2640df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['>'] = ">"; // HTML special. 2650df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. 2660df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. 2670df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2680df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 2690df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel /** 2700df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in 2710df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel * an HTML5 text node or properly quoted attribute value. 2720df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel */ 2730df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel private static boolean[] IS_BANNED_ASCII = new boolean[0x20]; 2740df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel static { 2750df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel for (int i = 0; i < IS_BANNED_ASCII.length; ++i) { 2760df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r'); 2770df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2780df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel } 2790df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel 2800df9131f7be5c0f90ce70d43b7e4239a6a6df016mikesamuel} 281