// Copyright (c) 2012, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import java.io.IOException;

import com.google.common.annotations.VisibleForTesting;

/** Encoders and decoders for HTML. */
final class Encoding {

  /**
   * Decodes HTML entities to produce a string containing only valid
   * Unicode scalar values.
   *
   * @param s text that may contain HTML entities and banned code-units.
   * @return {@code s} itself when it contains neither {@code '&'} nor any
   *     banned code-unit; otherwise a new string with entities decoded and
   *     banned code-units removed.
   */
  @VisibleForTesting
  static String decodeHtml(String s) {
    int firstAmp = s.indexOf('&');
    int safeLimit = longestPrefixOfGoodCodeunits(s);
    // Each value is -1 when there is nothing to do for it.  The bitwise AND
    // of two ints is negative only when both are negative, so this is a
    // single test for "no ampersand and no banned code-unit".
    if ((firstAmp & safeLimit) < 0) { return s; }

    StringBuilder sb;
    {
      int n = s.length();
      sb = new StringBuilder(n);
      int pos = 0;
      int amp = firstAmp;
      while (amp >= 0) {
        // The packed result is unpacked below: the high 32 bits are the index
        // at which scanning resumes, the low 32 bits are the code-point to
        // append in place of the text starting at amp.
        long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
        int end = (int) (endAndCodepoint >>> 32);
        int codepoint = (int) endAndCodepoint;
        sb.append(s, pos, amp).appendCodePoint(codepoint);
        pos = end;
        amp = s.indexOf('&', end);
      }
      sb.append(s, pos, n);
    }

    // Everything before the first ampersand was copied verbatim, and
    // everything before safeLimit is known good, so stripping can start at
    // whichever of the two (non-negative) indices comes first.
    stripBannedCodeunits(
        sb,
        firstAmp < 0
          ? safeLimit : safeLimit < 0
          ? firstAmp : Math.min(firstAmp, safeLimit));

    return sb.toString();
  }

  /**
   * Returns the portion of its input that consists of XML safe chars.
   *
   * @param s the string to filter.
   * @return {@code s} itself when it contains no banned code-units;
   *     otherwise a copy with the banned code-units removed.
   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
   */
  @TCB
  static String stripBannedCodeunits(String s) {
    int safeLimit = longestPrefixOfGoodCodeunits(s);
    if (safeLimit < 0) { return s; }

    StringBuilder sb = new StringBuilder(s);
    stripBannedCodeunits(sb, safeLimit);
    return sb.toString();
  }

  /**
   * Leaves in the input buffer only code-units that comprise XML safe chars.
   *
   * @param sb the buffer to filter in place.
   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
   */
  @TCB
  static void stripBannedCodeunits(StringBuilder sb) {
    stripBannedCodeunits(sb, 0);
  }

  /**
   * Removes banned code-units from {@code sb} in place, compacting the
   * surviving code-units toward the front and truncating the buffer.
   *
   * @param sb the buffer to filter in place.
   * @param start index of the first code-unit to examine; code-units before
   *     it are left untouched.
   */
  @TCB
  private static void stripBannedCodeunits(StringBuilder sb, int start) {
    // k is the write cursor; it never overtakes the read cursor i.
    int k = start;
    for (int i = start, n = sb.length(); i < n; ++i) {
      char ch = sb.charAt(i);
      if (ch < 0x20) {
        if (IS_BANNED_ASCII[ch]) {
          continue;
        }
      } else if (0xd800 <= ch) {
        if (ch <= 0xdfff) {
          // A surrogate is kept only as part of a well-formed pair.
          if (i+1 < n) {
            char next = sb.charAt(i+1);
            if (Character.isSurrogatePair(ch, next)) {
              sb.setCharAt(k++, ch);
              sb.setCharAt(k++, next);
              ++i;
            }
          }
          continue;
        } else if ((ch & 0xfffe) == 0xfffe) {
          // U+FFFE and U+FFFF.
          continue;
        }
      }
      sb.setCharAt(k++, ch);
    }
    sb.setLength(k);
  }

  /**
   * The number of code-units at the front of s that form code-points in the
   * XML Character production.
   *
   * @param s the string to scan.
   * @return -1 if all of s is in the XML Character production.
   */
  @TCB
  private static int longestPrefixOfGoodCodeunits(String s) {
    int n = s.length(), i;
    for (i = 0; i < n; ++i) {
      char ch = s.charAt(i);
      if (ch < 0x20) {
        if (IS_BANNED_ASCII[ch]) {
          return i;
        }
      } else if (0xd800 <= ch) {
        if (ch <= 0xdfff) {
          if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
            ++i;  // Skip over low surrogate since we know it's ok.
          } else {
            return i;
          }
        } else if ((ch & 0xfffe) == 0xfffe) {
          return i;
        }
      }
    }
    return -1;
  }

  /**
   * Writes the HTML equivalent of the given plain text to output.
   * For example, {@code encodeHtmlOnto("1 < 2", w)},
   * is equivalent to {@code w.append("1 &lt; 2")} but possibly with fewer
   * smaller appends.
   * Elides code-units that are not valid XML Characters.
   *
   * @param plainText the text to encode.
   * @param output receives the encoded text.
   * @throws IOException if {@code output} throws while being appended to.
   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
   */
  @TCB
  static void encodeHtmlOnto(String plainText, Appendable output)
      throws IOException {
    int n = plainText.length();
    // Start of the run of code-units that have been scanned but not yet
    // written; runs are flushed only when a code-unit needs special handling.
    int pos = 0;
    for (int i = 0; i < n; ++i) {
      char ch = plainText.charAt(i);
      if (ch < REPLACEMENTS.length) {
        String repl = REPLACEMENTS[ch];
        if (repl != null) {
          output.append(plainText, pos, i).append(repl);
          pos = i + 1;
        }
      } else if (((char) 0xd800) <= ch) {
        if (ch <= ((char) 0xdfff)) {
          char next;
          if (i + 1 < n
              && Character.isSurrogatePair(
                  ch, next = plainText.charAt(i + 1))) {
            // Emit supplemental codepoints as entity so that they cannot
            // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
            // and get involved in UTF-16/UCS-2 confusion.
            int codepoint = Character.toCodePoint(ch, next);
            output.append(plainText, pos, i);
            appendNumericEntity(codepoint, output);
            ++i;
            pos = i + 1;
          } else {
            output.append(plainText, pos, i);
            // Elide the orphaned surrogate.
            pos = i + 1;
          }
        } else if (0xff00 <= ch) {
          output.append(plainText, pos, i);
          pos = i + 1;
          // Is a control character or possible full-width version of a
          // special character.
          // U+FFFE and U+FFFF are elided since they are not XML Characters;
          // everything else in this range is written as a numeric entity.
          if ((ch & 0xfffe) != 0xfffe) {
            appendNumericEntity(ch, output);
          }
        }
      }
    }
    output.append(plainText, pos, n);
  }

  /**
   * Appends a numeric character reference for the given code-point:
   * decimal ({@code &#NN;}) for values below 100, otherwise lower-case
   * hexadecimal ({@code &#xHH;}) using between 2 and 6 digits.
   *
   * @param codepoint the code-point to encode; callers in this class always
   *     pass a non-negative value.
   * @param output receives the entity.
   * @throws IOException if {@code output} throws while being appended to.
   */
  @TCB
  static void appendNumericEntity(int codepoint, Appendable output)
      throws IOException {
    if (codepoint < 100) {
      // TODO: is this dead code due to REPLACEMENTS above.
      output.append("&#");
      if (codepoint < 10) {
        output.append((char) ('0' + codepoint));
      } else {
        output.append((char) ('0' + (codepoint / 10)));
        output.append((char) ('0' + (codepoint % 10)));
      }
      output.append(";");
    } else {
      int nDigits = (codepoint < 0x1000
                     ? codepoint < 0x100 ? 2 : 3
                     : (codepoint < 0x10000 ? 4
                        : codepoint < 0x100000 ? 5
                        : 6));
      output.append("&#x");
      // Emit nibbles from most significant to least significant.
      for (int digit = nDigits; --digit >= 0;) {
        int hexDigit = (codepoint >>> (digit << 2)) & 0xf;
        output.append(HEX_NUMERAL[hexDigit]);
      }
      output.append(";");
    }
  }

  /** Lower-case hex digits indexed by nibble value. */
  private static final char[] HEX_NUMERAL = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
  };

  /**
   * Maps ASCII chars that need to be encoded to an equivalent HTML entity.
   * An empty string means the char is elided; {@code null} means the char is
   * emitted unchanged.
   */
  static final String[] REPLACEMENTS = new String[0x61];
  static {
    for (int i = 0; i < ' '; ++i) {
      // We elide control characters so that we can ensure that our output is
      // in the intersection of valid HTML5 and XML.  According to
      // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets
      // Char      ::=          #x9 | #xA | #xD | [#x20-#xD7FF]
      //             |          [#xE000-#xFFFD] | [#x10000-#x10FFFF]
      if (i != '\t' && i != '\n' && i != '\r') {
        REPLACEMENTS[i] = "";  // Elide
      }
    }
    // "&#34;" is shorter than "&quot;"
    REPLACEMENTS['"']  = "&#" + ((int) '"')  + ";";  // Attribute delimiter.
    REPLACEMENTS['&']  = "&amp;";                    // HTML special.
    // We don't use &apos; since that is not in the intersection of HTML&XML.
    REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";";  // Attribute delimiter.
    REPLACEMENTS['+']  = "&#" + ((int) '+')  + ";";  // UTF-7 special.
    REPLACEMENTS['<']  = "&lt;";                     // HTML special.
    REPLACEMENTS['=']  = "&#" + ((int) '=')  + ";";  // Special in attributes.
    REPLACEMENTS['>']  = "&gt;";                     // HTML special.
    REPLACEMENTS['@']  = "&#" + ((int) '@')  + ";";  // Conditional compilation.
    REPLACEMENTS['`']  = "&#" + ((int) '`')  + ";";  // Attribute delimiter.
  }

  /**
   * {@code IS_BANNED_ASCII[c]} is true iff the code-unit c, which must be
   * below 0x20, is a control character other than tab, line-feed, or
   * carriage-return.  Such code-units are removed by the strip methods of
   * this class.
   */
  private static final boolean[] IS_BANNED_ASCII = new boolean[0x20];
  static {
    for (int i = 0; i < IS_BANNED_ASCII.length; ++i) {
      IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r');
    }
  }

}