1// Copyright (c) 2012, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import java.io.IOException;
32
33import com.google.common.annotations.VisibleForTesting;
34
35/** Encoders and decoders for HTML. */
36final class Encoding {
37
38  /**
39   * Decodes HTML entities to produce a string containing only valid
40   * Unicode scalar values.
41   */
42  @VisibleForTesting
43  static String decodeHtml(String s) {
44    int firstAmp = s.indexOf('&');
45    int safeLimit = longestPrefixOfGoodCodeunits(s);
46    if ((firstAmp & safeLimit) < 0) { return s; }
47
48    StringBuilder sb;
49    {
50      int n = s.length();
51      sb = new StringBuilder(n);
52      int pos = 0;
53      int amp = firstAmp;
54      while (amp >= 0) {
55        long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
56        int end = (int) (endAndCodepoint >>> 32);
57        int codepoint = (int) endAndCodepoint;
58        sb.append(s, pos, amp).appendCodePoint(codepoint);
59        pos = end;
60        amp = s.indexOf('&', end);
61      }
62      sb.append(s, pos, n);
63    }
64
65    stripBannedCodeunits(
66        sb,
67        firstAmp < 0
68          ? safeLimit : safeLimit < 0
69          ? firstAmp : Math.min(firstAmp, safeLimit));
70
71    return sb.toString();
72  }
73
74  /**
75   * Returns the portion of its input that consists of XML safe chars.
76   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
77   */
78  @TCB
79  static String stripBannedCodeunits(String s) {
80    int safeLimit = longestPrefixOfGoodCodeunits(s);
81    if (safeLimit < 0) { return s; }
82
83    StringBuilder sb = new StringBuilder(s);
84    stripBannedCodeunits(sb, safeLimit);
85    return sb.toString();
86  }
87
88  /**
89   * Leaves in the input buffer only code-units that comprise XML safe chars.
90   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
91   */
92  @TCB
93  static void stripBannedCodeunits(StringBuilder sb) {
94    stripBannedCodeunits(sb, 0);
95  }
96
97  @TCB
98  private static void stripBannedCodeunits(StringBuilder sb, int start) {
99    int k = start;
100    for (int i = start, n = sb.length(); i < n; ++i) {
101      char ch = sb.charAt(i);
102      if (ch < 0x20) {
103        if (IS_BANNED_ASCII[ch]) {
104          continue;
105        }
106      } else if (0xd800 <= ch) {
107        if (ch <= 0xdfff) {
108          if (i+1 < n) {
109            char next = sb.charAt(i+1);
110            if (Character.isSurrogatePair(ch, next)) {
111              sb.setCharAt(k++, ch);
112              sb.setCharAt(k++, next);
113              ++i;
114            }
115          }
116          continue;
117        } else if ((ch & 0xfffe) == 0xfffe) {
118          continue;
119        }
120      }
121      sb.setCharAt(k++, ch);
122    }
123    sb.setLength(k);
124  }
125
126  /**
127   * The number of code-units at the front of s that form code-points in the
128   * XML Character production.
129   * @return -1 if all of s is in the XML Character production.
130   */
131  @TCB
132  private static int longestPrefixOfGoodCodeunits(String s) {
133    int n = s.length(), i;
134    for (i = 0; i < n; ++i) {
135      char ch = s.charAt(i);
136      if (ch < 0x20) {
137        if (IS_BANNED_ASCII[ch]) {
138          return i;
139        }
140      } else if (0xd800 <= ch) {
141        if (ch <= 0xdfff) {
142          if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
143            ++i;  // Skip over low surrogate since we know it's ok.
144          } else {
145            return i;
146          }
147        } else if ((ch & 0xfffe) == 0xfffe) {
148          return i;
149        }
150      }
151    }
152    return -1;
153  }
154
155  /**
156   * Writes the HTML equivalent of the given plain text to output.
157   * For example, {@code escapeHtmlOnto("1 < 2", w)},
158   * is equivalent to {@code w.append("1 &lt; 2")} but possibly with fewer
159   * smaller appends.
160   * Elides code-units that are not valid XML Characters.
161   * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
162   */
163  @TCB
164  static void encodeHtmlOnto(String plainText, Appendable output)
165      throws IOException {
166    int n = plainText.length();
167    int pos = 0;
168    for (int i = 0; i < n; ++i) {
169      char ch = plainText.charAt(i);
170      if (ch < REPLACEMENTS.length) {
171        String repl = REPLACEMENTS[ch];
172        if (repl != null) {
173          output.append(plainText, pos, i).append(repl);
174          pos = i + 1;
175        }
176      } else if (((char) 0xd800) <= ch) {
177        if (ch <= ((char) 0xdfff)) {
178          char next;
179          if (i + 1 < n
180              && Character.isSurrogatePair(
181                  ch, next = plainText.charAt(i + 1))) {
182            // Emit supplemental codepoints as entity so that they cannot
183            // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
184            // and get involved in UTF-16/UCS-2 confusion.
185            int codepoint = Character.toCodePoint(ch, next);
186            output.append(plainText, pos, i);
187            appendNumericEntity(codepoint, output);
188            ++i;
189            pos = i + 1;
190          } else {
191            output.append(plainText, pos, i);
192            // Elide the orphaned surrogate.
193            pos = i + 1;
194          }
195        } else if (0xff00 <= ch) {
196          output.append(plainText, pos, i);
197          pos = i + 1;
198          // Is a control character or possible full-width version of a
199          // special character.
200          if ((ch & 0xfffe) == 0xfffe) {
201            // Elide since not an the XML Character.
202          } else {
203            appendNumericEntity(ch, output);
204          }
205        }
206      }
207    }
208    output.append(plainText, pos, n);
209  }
210
211  @TCB
212  static void appendNumericEntity(int codepoint, Appendable output)
213      throws IOException {
214    if (codepoint < 100) {
215      // TODO: is this dead code due to REPLACEMENTS above.
216      output.append("&#");
217      if (codepoint < 10) {
218        output.append((char) ('0' + codepoint));
219      } else {
220        output.append((char) ('0' + (codepoint / 10)));
221        output.append((char) ('0' + (codepoint % 10)));
222      }
223      output.append(";");
224    } else {
225      int nDigits = (codepoint < 0x1000
226                     ? codepoint < 0x100 ? 2 : 3
227                     : (codepoint < 0x10000 ? 4
228                        : codepoint < 0x100000 ? 5 : 6));
229      output.append("&#x");
230      for (int digit = nDigits; --digit >= 0;) {
231        int hexDigit = (codepoint >>> (digit << 2)) & 0xf;
232        output.append(HEX_NUMERAL[hexDigit]);
233      }
234      output.append(";");
235    }
236  }
237
238  private static final char[] HEX_NUMERAL = {
239   '0', '1', '2', '3', '4', '5', '6', '7',
240   '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
241  };
242
243  /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */
244  static final String[] REPLACEMENTS = new String[0x61];
245  static {
246    for (int i = 0; i < ' '; ++i) {
247      // We elide control characters so that we can ensure that our output is
248      // in the intersection of valid HTML5 and XML.  According to
249      // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets
250      // Char      ::=          #x9 | #xA | #xD | [#x20-#xD7FF]
251      //             |          [#xE000-#xFFFD] | [#x10000-#x10FFFF]
252      if (i != '\t' && i != '\n' && i != '\r') {
253        REPLACEMENTS[i] = "";  // Elide
254      }
255    }
256    // "&#34;" is shorter than "&quot;"
257    REPLACEMENTS['"']  = "&#" + ((int) '"')  + ";";  // Attribute delimiter.
258    REPLACEMENTS['&']  = "&amp;";                    // HTML special.
259    // We don't use &apos; since that is not in the intersection of HTML&XML.
260    REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";";  // Attribute delimiter.
261    REPLACEMENTS['+']  = "&#" + ((int) '+')  + ";";  // UTF-7 special.
262    REPLACEMENTS['<']  = "&lt;";                     // HTML special.
263    REPLACEMENTS['=']  = "&#" + ((int) '=')  + ";";  // Special in attributes.
264    REPLACEMENTS['>']  = "&gt;";                     // HTML special.
265    REPLACEMENTS['@']  = "&#" + ((int) '@')  + ";";  // Conditional compilation.
266    REPLACEMENTS['`']  = "&#" + ((int) '`')  + ";";  // Attribute delimiter.
267  }
268
269  /**
270   * {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in
271   * an HTML5 text node or properly quoted attribute value.
272   */
273  private static boolean[] IS_BANNED_ASCII = new boolean[0x20];
274  static {
275    for (int i = 0; i < IS_BANNED_ASCII.length; ++i) {
276      IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r');
277    }
278  }
279
280}
281