1// Copyright (c) 2011, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import com.google.common.annotations.VisibleForTesting;
32import java.io.Closeable;
33import java.io.Flushable;
34import java.io.IOException;
35import java.util.Iterator;
36import java.util.List;
37import javax.annotation.WillCloseWhenClosed;
38import javax.annotation.concurrent.NotThreadSafe;
39
40/**
41 * Given a series of HTML tokens, writes valid, normalized HTML to the output.
42 * The output will have well-defined tag boundaries, but there may be orphaned
43 * or missing close and open tags.
44 * The result of two renderers can always be concatenated to produce a larger
45 * snippet of HTML, but if the first was called with
46 * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not
47 * be interpreted as tags in the concatenated version.
48 */
49@TCB
50@NotThreadSafe
51public class HtmlStreamRenderer implements HtmlStreamEventReceiver {
52
  /** Sink that receives the rendered HTML. */
  private final Appendable output;
  /** Receives any IOException raised while appending to or flushing output. */
  private final Handler<? super IOException> ioExHandler;
  /** Receives a human readable message for each call that cannot be rendered. */
  private final Handler<? super String> badHtmlHandler;
  /**
   * Name of the element whose text is currently being buffered in
   * {@link #pendingUnescaped}.  Only assigned when such an element is opened.
   */
  private String lastTagOpened;
  /**
   * Buffers the text content of an open CDATA / plain text element so that it
   * can be checked as a whole when the element is closed.  Null when no such
   * element is open.
   */
  private StringBuilder pendingUnescaped;
  /** True between calls to openDocument and closeDocument. */
  private boolean open;
59
60  /**
61   * Factory.
62   * @param output the buffer to which HTML is streamed.
63   * @param ioExHandler called with any exception raised by output.
64   * @param badHtmlHandler receives alerts when HTML cannot be rendered because
65   *    there is not valid HTML tree that results from that series of calls.
66   *    E.g. it is not possible to create an HTML {@code <style>} element whose
67   *    textual content is {@code "</style>"}.
68   */
69  public static HtmlStreamRenderer create(
70      @WillCloseWhenClosed Appendable output,
71      Handler<? super IOException> ioExHandler,
72      Handler<? super String> badHtmlHandler) {
73    if (output instanceof Closeable) {
74      return new CloseableHtmlStreamRenderer(
75          output, ioExHandler, badHtmlHandler);
76    } else {
77      return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler);
78    }
79  }
80
81  /**
82   * Factory.
83   * @param output the buffer to which HTML is streamed.
84   * @param badHtmlHandler receives alerts when HTML cannot be rendered because
85   *    there is not valid HTML tree that results from that series of calls.
86   *    E.g. it is not possible to create an HTML {@code <style>} element whose
87   *    textual content is {@code "</style>"}.
88   */
89  public static HtmlStreamRenderer create(
90      StringBuilder output, Handler<? super String> badHtmlHandler) {
91    // Propagate since StringBuilder should not throw IOExceptions.
92    return create(output, Handler.PROPAGATE, badHtmlHandler);
93  }
94
95  private HtmlStreamRenderer(
96      Appendable output, Handler<? super IOException> ioExHandler,
97      Handler<? super String> badHtmlHandler) {
98    this.output = output;
99    this.ioExHandler = ioExHandler;
100    this.badHtmlHandler = badHtmlHandler;
101  }
102
103  /**
104   * Called when the series of calls make no sense.
105   * May be overridden to throw an unchecked throwable, to log, or to take some
106   * other action.
107   *
108   * @param message for human consumption.
109   * @param identifier an HTML identifier associated with the message.
110   */
111  private final void error(String message, CharSequence identifier) {
112    if (badHtmlHandler != Handler.DO_NOTHING) {   // Avoid string append.
113      badHtmlHandler.handle(message + " : " + identifier);
114    }
115  }
116
117  public final void openDocument() throws IllegalStateException {
118    if (open) { throw new IllegalStateException(); }
119    open = true;
120  }
121
122  public final void closeDocument() throws IllegalStateException {
123    if (!open) { throw new IllegalStateException(); }
124    if (pendingUnescaped != null) {
125      closeTag(lastTagOpened);
126    }
127    open = false;
128    if (output instanceof Flushable) {
129      try {
130        ((Flushable) output).flush();
131      } catch (IOException ex) {
132        ioExHandler.handle(ex);
133      }
134    }
135  }
136
137  public final boolean isDocumentOpen() {
138    return open;
139  }
140
141  public final void openTag(String elementName, List<String> attrs) {
142    try {
143      writeOpenTag(elementName, attrs);
144    } catch (IOException ex) {
145      ioExHandler.handle(ex);
146    }
147  }
148
149  private void writeOpenTag(String elementName, List<? extends String> attrs)
150      throws IOException {
151    if (!open) { throw new IllegalStateException(); }
152    elementName = safeName(elementName);
153    if (!isValidHtmlName(elementName)) {
154      error("Invalid element name", elementName);
155      return;
156    }
157    if (pendingUnescaped != null) {
158      error("Tag content cannot appear inside CDATA element", elementName);
159      return;
160    }
161
162    switch (HtmlTextEscapingMode.getModeForTag(elementName)) {
163      case CDATA_SOMETIMES:
164      case CDATA:
165      case PLAIN_TEXT:
166        lastTagOpened = elementName;
167        pendingUnescaped = new StringBuilder();
168        break;
169      default:
170    }
171
172    output.append('<').append(elementName);
173
174    for (Iterator<? extends String> attrIt = attrs.iterator();
175         attrIt.hasNext();) {
176      String name = attrIt.next();
177      String value = attrIt.next();
178      name = HtmlLexer.canonicalName(name);
179      if (!isValidHtmlName(name)) {
180        error("Invalid attr name", name);
181        continue;
182      }
183      output.append(' ').append(name).append('=').append('"');
184      Encoding.encodeHtmlOnto(value, output);
185      if (value.indexOf('`') != -1) {
186        // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
187        // values.  Given
188        //     <div attr="``foo=bar">
189        // we encode &#96; but if JavaScript does:
190        //    nodeA.innerHTML = nodeB.innerHTML;
191        // and nodeB contains the DIV above, then IE8 will produce
192        //     <div attr=``foo=bar>
193        // as the value of nodeB.innerHTML and assign it to nodeA.
194        // IE8's HTML parser treats `` as a blank attribute value and foo=bar
195        // becomes a separate attribute.
196        // Adding a space at the end of the attribute prevents this by forcing
197        // IE8 to put double quotes around the attribute when computing
198        // nodeB.innerHTML.
199        output.append(' ');
200      }
201      output.append('"');
202    }
203
204    // Limit our output to the intersection of valid XML and valid HTML5 when
205    // the output contains no special HTML5 elements like <title>, <script>, or
206    // <textarea>.
207    if (HtmlTextEscapingMode.isVoidElement(elementName)) {
208      output.append(" /");
209    }
210
211    output.append('>');
212  }
213
214  public final void closeTag(String elementName) {
215    try {
216      writeCloseTag(safeName(elementName));
217    } catch (IOException ex) {
218      ioExHandler.handle(ex);
219    }
220  }
221
222  private final void writeCloseTag(String elementName)
223      throws IOException {
224    if (!open) { throw new IllegalStateException(); }
225    elementName = HtmlLexer.canonicalName(elementName);
226    if (!isValidHtmlName(elementName)) {
227      error("Invalid element name", elementName);
228      return;
229    }
230
231    if (pendingUnescaped != null) {
232      if (!lastTagOpened.equals(elementName)) {
233        error("Tag content cannot appear inside CDATA element", elementName);
234        return;
235      } else {
236        StringBuilder cdataContent = pendingUnescaped;
237        pendingUnescaped = null;
238        Encoding.stripBannedCodeunits(cdataContent);
239        int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent);
240        if (problemIndex == -1) {
241          output.append(cdataContent);
242        } else {
243          error(
244              "Invalid CDATA text content",
245              cdataContent.subSequence(
246                  problemIndex,
247                  Math.min(problemIndex + 10, cdataContent.length())));
248          // Still output the close tag.
249        }
250      }
251      if ("plaintext".equals(elementName)) { return; }
252    }
253    output.append("</").append(elementName).append(">");
254  }
255
256  public final void text(String text) {
257    try {
258      writeText(text);
259    } catch (IOException ex) {
260      ioExHandler.handle(ex);
261    }
262  }
263
264  private final void writeText(String text) throws IOException {
265    if (!open) { throw new IllegalStateException(); }
266    if (pendingUnescaped != null) {
267      pendingUnescaped.append(text);
268    } else {
269      Encoding.encodeHtmlOnto(text, output);  // Works for RCDATA.
270    }
271  }
272
273  private static int checkHtmlCdataCloseable(
274      String localName, StringBuilder sb) {
275    int escapingTextSpanStart = -1;
276    for (int i = 0, n = sb.length(); i < n; ++i) {
277      char ch = sb.charAt(i);
278      switch (ch) {
279        case '<':
280          if (i + 3 < n
281              && '!' == sb.charAt(i + 1)
282              && '-' == sb.charAt(i + 2)
283              && '-' == sb.charAt(i + 3)) {
284            if (escapingTextSpanStart == -1) {
285              escapingTextSpanStart = i;
286            } else {
287              return i;
288            }
289          } else if (i + 1 + localName.length() < n
290                     && '/' == sb.charAt(i + 1)
291                     && Strings.regionMatchesIgnoreCase(
292                         sb, i + 2, localName, 0, localName.length())) {
293            // A close tag contained in the content.
294            if (escapingTextSpanStart < 0) {
295              // We could try some recovery strategies here.
296              // E.g. prepending "/<!--\n" to sb if "script".equals(localName)
297              return i;
298            }
299            if (!"script".equals(localName)) {
300              // Script tags are commonly included inside script tags.
301              // <script><!--document.write('<script>f()</script>');--></script>
302              // but this does not happen in other CDATA element types.
303              // Actually allowing an end tag inside others is problematic.
304              // Specifically,
305              // <style><!--</style>-->/* foo */</style>
306              // displays the text "/* foo */" on some browsers.
307              return i;
308            }
309          }
310          break;
311        case '>':
312          // From the HTML5 spec:
313          //    The text in style, script, title, and textarea elements must not
314          //    have an escaping text span start that is not followed by an
315          //    escaping text span end.
316          // We look left since the HTML 5 spec allows the escaping text span
317          // end to share dashes with the start.
318          if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) {
319            if (escapingTextSpanStart < 0) { return i - 2; }
320            escapingTextSpanStart = -1;
321          }
322          break;
323        default:
324          break;
325      }
326    }
327    if (escapingTextSpanStart >= 0) {
328      // We could try recovery strategies here.
329      // E.g. appending "//-->" to the buffer if "script".equals(localName)
330      return escapingTextSpanStart;
331    }
332    return -1;
333  }
334
335
336  @VisibleForTesting
337  static boolean isValidHtmlName(String name) {
338    int n = name.length();
339    if (n == 0) { return false; }
340    if (n > 128) { return false; }
341    boolean isNamespaced = false;
342    for (int i = 0; i < n; ++i) {
343      char ch = name.charAt(i);
344      switch (ch) {
345        case ':':
346          if (isNamespaced) { return false; }
347          isNamespaced = true;
348          if (i == 0 || i + 1 == n) { return false; }
349          break;
350        case '-':
351          if (i == 0 || i + 1 == n) { return false; }
352          break;
353        default:
354          if (ch <= '9') {
355            if (i == 0 || ch < '0') { return false; }
356          } else if ('A' <= ch && ch <= 'z') {
357            if ('Z' < ch && ch < 'a') { return false; }
358          } else {
359            return false;
360          }
361          break;
362      }
363    }
364    return true;
365  }
366
367  /**
368   * Canonicalizes the element name and possibly substitutes an alternative
369   * that has more consistent semantics.
370   */
371  static String safeName(String elementName) {
372    elementName = HtmlLexer.canonicalName(elementName);
373
374    // Substitute a reliably non-raw-text element for raw-text and
375    // plain-text elements.
376    switch (elementName.length()) {
377      case 3:
378        if ("xmp".equals(elementName)) { return "pre"; }
379        break;
380      case 7:
381        if ("listing".equals(elementName)) { return "pre"; }
382        break;
383      case 9:
384        if ("plaintext".equals(elementName)) { return "pre"; }
385        break;
386    }
387    return elementName;
388  }
389
390  static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer
391      implements Closeable {
392    private final Closeable closeable;
393
394    CloseableHtmlStreamRenderer(
395        @WillCloseWhenClosed
396        Appendable output, Handler<? super IOException> errorHandler,
397        Handler<? super String> badHtmlHandler) {
398      super(output, errorHandler, badHtmlHandler);
399      this.closeable = (Closeable) output;
400    }
401
402    public void close() throws IOException {
403      if (isDocumentOpen()) { closeDocument(); }
404      closeable.close();
405    }
406  }
407}
408