1// Copyright (c) 2012, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import org.junit.Test;
32
33import junit.framework.TestCase;
34
35public class EncodingTest extends TestCase {
36
37  @Test
38  public static final void testDecodeHtml() {
39    String html =
40      "The quick brown fox
jumps over
the lazy dog
";
41    //          1         2         3         4         5         6
42    // 123456789012345678901234567890123456789012345678901234567890123456789
43    String golden =
44      "The quick\u00a0brown fox\njumps over\r\nthe lazy dog\n";
45    assertEquals(golden, Encoding.decodeHtml(html));
46
47    // Don't allocate a new string when no entities.
48    assertSame(golden, Encoding.decodeHtml(golden));
49
50    // test interrupted escapes and escapes at end of file handled gracefully
51    assertEquals(
52        "\\\\u000a",
53        Encoding.decodeHtml("\\\\u000a"));
54    assertEquals(
55        "\n",
56        Encoding.decodeHtml("
"));
57    assertEquals(
58        "\n",
59        Encoding.decodeHtml("
"));
60    assertEquals(
61        "\n",
62        Encoding.decodeHtml("
"));
63    assertEquals(
64        "\n",
65        Encoding.decodeHtml("
"));
66    assertEquals(
67        String.valueOf(Character.toChars(0x10000)),
68        Encoding.decodeHtml("𐀀"));
69    assertEquals(
70        "\n",
71        Encoding.decodeHtml("&#xa"));
72    assertEquals(
73        "&#x00ziggy",
74        Encoding.decodeHtml("&#x00ziggy"));
75    assertEquals(
76        "&#xa00z;",
77        Encoding.decodeHtml("&#xa00z;"));
78    assertEquals(
79        "&#\n",
80        Encoding.decodeHtml("&#
"));
81    assertEquals(
82        "&#x\n",
83        Encoding.decodeHtml("&#x
"));
84    assertEquals(
85        "\n\n",
86        Encoding.decodeHtml("&#xa
"));
87    assertEquals(
88        "&#\n",
89        Encoding.decodeHtml("&#
"));
90    assertEquals(
91        "&#x",
92        Encoding.decodeHtml("&#x"));
93    assertEquals(
94        "",  // NUL elided.
95        Encoding.decodeHtml("&#x0"));
96    assertEquals(
97        "&#",
98        Encoding.decodeHtml("&#"));
99
100    assertEquals(
101        "\\",
102        Encoding.decodeHtml("\\"));
103    assertEquals(
104        "&",
105        Encoding.decodeHtml("&"));
106
107    assertEquals(
108        "&#000a;",
109        Encoding.decodeHtml("&#000a;"));
110    assertEquals(
111        "\n",
112        Encoding.decodeHtml("
"));
113    assertEquals(
114        "\n",
115        Encoding.decodeHtml("
"));
116    assertEquals(
117        "\n",
118        Encoding.decodeHtml("
"));
119    assertEquals(
120        "\t",
121        Encoding.decodeHtml("	"));
122    assertEquals(
123        "\n",
124        Encoding.decodeHtml("&#10"));
125    assertEquals(
126        "&#00ziggy",
127        Encoding.decodeHtml("&#00ziggy"));
128    assertEquals(
129        "&#\n",
130        Encoding.decodeHtml("&#
"));
131    assertEquals(
132        "\n",
133        Encoding.decodeHtml("&#0
"));
134    assertEquals(
135        "\n",
136        Encoding.decodeHtml("&#01
"));
137    assertEquals(
138        "&#\n",
139        Encoding.decodeHtml("&#
"));
140    assertEquals(
141        "",  // Invalid XML char elided.
142        Encoding.decodeHtml("&#1"));
143    assertEquals(
144        "\t",
145        Encoding.decodeHtml("&#9"));
146    assertEquals(
147        "\n",
148        Encoding.decodeHtml("&#10"));
149
150    // test the named escapes
151    assertEquals(
152        "<",
153        Encoding.decodeHtml("&lt;"));
154    assertEquals(
155        ">",
156        Encoding.decodeHtml("&gt;"));
157    assertEquals(
158        "\"",
159        Encoding.decodeHtml("&quot;"));
160    assertEquals(
161        "'",
162        Encoding.decodeHtml("&apos;"));
163    assertEquals(
164        "'",
165        Encoding.decodeHtml("&#39;"));
166    assertEquals(
167        "'",
168        Encoding.decodeHtml("&#x27;"));
169    assertEquals(
170        "&",
171        Encoding.decodeHtml("&amp;"));
172    assertEquals(
173        "&lt;",
174        Encoding.decodeHtml("&amp;lt;"));
175    assertEquals(
176        "&",
177        Encoding.decodeHtml("&AMP;"));
178    assertEquals(
179        "&",
180        Encoding.decodeHtml("&AMP"));
181    assertEquals(
182        "&",
183        Encoding.decodeHtml("&AmP;"));
184    assertEquals(
185        "\u0391",
186        Encoding.decodeHtml("&Alpha;"));
187    assertEquals(
188        "\u03b1",
189        Encoding.decodeHtml("&alpha;"));
190
191    assertEquals(
192        "&;",
193        Encoding.decodeHtml("&;"));
194    assertEquals(
195        "&bogus;",
196        Encoding.decodeHtml("&bogus;"));
197  }
198
199  @Test
200  public static final void testAppendNumericEntityAndEncodeOnto()
201      throws Exception {
202    StringBuilder sb = new StringBuilder();
203    StringBuilder cps = new StringBuilder();
204    for (int codepoint : new int[] {
205        0, 9, '\n', '@', 0x80, 0xff, 0x100, 0xfff, 0x1000, 0x123a, 0xffff,
206        0x10000, Character.MAX_CODE_POINT }) {
207      Encoding.appendNumericEntity(codepoint, sb);
208      sb.append(' ');
209
210      cps.appendCodePoint(codepoint).append(' ');
211    }
212
213    assertEquals(
214         "&#0; &#9; &#10; &#64; &#x80; &#xff; &#x100; &#xfff; &#x1000; "
215         + "&#x123a; &#xffff; &#x10000; &#x10ffff; ",
216         sb.toString());
217
218    StringBuilder out = new StringBuilder();
219    Encoding.encodeHtmlOnto(cps.toString(), out);
220    assertEquals(
221        " \t \n &#64; \u0080 \u00ff \u0100 \u0fff \u1000 "
222        + "\u123a  &#x10000; &#x10ffff; ",
223        out.toString());
224  }
225
226  private static final void assertStripped(String stripped, String orig) {
227    String actual = Encoding.stripBannedCodeunits(orig);
228    assertEquals(orig, stripped, actual);
229    if (stripped.equals(orig)) {
230      assertSame(actual, orig);
231    }
232
233    StringBuilder sb = new StringBuilder(orig);
234    Encoding.stripBannedCodeunits(sb);
235    assertEquals(orig, stripped, sb.toString());
236  }
237
238  @Test
239  public static final void testStripBannedCodeunits() {
240    assertStripped("", "");
241    assertStripped("foo", "foo");
242    assertStripped("foobar", "foo\u0000bar");
243    assertStripped("foobar", "foo\u0000bar\u0000");
244    assertStripped("foobar", "foo\ufffebar\u0008");
245    assertStripped("foobar", "foo\ud800bar\udc00");
246    assertStripped("foo\ud800\udc00bar", "foo\ud800\ud800\udc00bar");
247    assertStripped("foo\ud800\udc00bar", "foo\ud800\udc00\ud800bar");
248    assertStripped("foo\ud800\udc00bar", "foo\ud800\udc00\udc00bar");
249    assertStripped("foo\ud800\udc00bar", "foo\udc00\ud800\udc00bar");
250    assertStripped("foo\ud834\udd1ebar", "foo\ud834\udd1ebar");
251    assertStripped("foo\ud834\udd1e", "foo\ud834\udd1e");
252    assertStripped("\uffef\ufffd", "\uffef\ufffd\ufffe\uffff");
253  }
254}
255