1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.clearsilver.jsilver.template;
18
19import java.io.IOException;
20
/**
 * HTML whitespace stripper to be used by JSilver. In ordinary content it removes leading
 * whitespace, collapses each run of contiguous whitespace characters to just the first character
 * of the run, drops whitespace that directly precedes a line feed, and removes lines consisting
 * of nothing but whitespace.
 *
 * <p>Content inside the following elements is passed through untouched:
 * <ul>
 * <li>PRE
 * <li>VERBATIM
 * <li>TEXTAREA
 * </ul>
 *
 * <p>Inside HTML tags (i.e. between '&lt;' and '&gt;') and inside SCRIPT elements only empty
 * lines and leading whitespace are removed; runs of whitespace are not collapsed. Trailing
 * whitespace is left alone there since that is more costly to remove and tends to not be common
 * based on how templates are created (they don't have trailing whitespace).
 *
 * <p>Loadtests indicate that this class can strip whitespace almost as quickly as just reading
 * every character from a string (20% slower).
 *
 * <p>While not strictly compatible with the JNI Clearsilver whitestripping function, we are not
 * aware of any differences that yield functionally different HTML output. However, we encourage
 * users to verify for themselves and report any differences.
 *
 * <p>Instances keep mutable parsing state between calls and perform no synchronization.
 */
public class HtmlWhiteSpaceStripper implements Appendable {

  // Value of pendingWs that signifies no whitespace is waiting to be written.
  private static final char NO_PENDING_WS = 0;

  // Object to output stripped content to.
  private final Appendable out;
  // Level of whitespace stripping to perform. (Currently not used).
  // TODO: Determine what the exact differences are in levels in
  // JNI Clearsilver and see if it is worth porting it.
  private final int level;

  // Has any non-whitespace character been seen since the start of the line.
  private boolean nonWsSeen = false;
  // First character of the whitespace run we are currently in, if any. It is
  // written out right before the next non-whitespace character and discarded
  // if a line feed comes first.
  private char pendingWs = NO_PENDING_WS;

  // We just saw the start of an HTML tag '<'.
  private boolean startHtmlTag = false;
  // Are we currently in an opening HTML tag (not "</").
  private boolean inOpenTag = false;
  // Are we currently in a closing HTML tag.
  private boolean inCloseTag = false;
  // Are we currently in an HTML tag name.
  private boolean inTagName = false;

  // Nesting depth of <textarea> elements (0 means not inside one).
  private int textAreaScope = 0;
  // Nesting depth of <pre> elements (0 means not inside one).
  private int preScope = 0;
  // Nesting depth of <verbatim> elements (0 means not inside one).
  private int verbatimScope = 0;
  // Nesting depth of <script> elements (0 means not inside one).
  private int scriptScope = 0;

  // Used to hold HTML tag element name while it is being read.
  private final StringBuilder tagName = new StringBuilder(16);

  /**
   * Intermediate Appendable object that strips whitespace as it passes through characters to
   * another Appendable object.
   *
   * @param out The Appendable object to dump the stripped output to.
   */
  public HtmlWhiteSpaceStripper(Appendable out) {
    this(out, 1);
  }

  /**
   * Intermediate Appendable object that strips whitespace as it passes through characters to
   * another Appendable object.
   *
   * @param out The Appendable object to dump the stripped output to.
   * @param level Ignored for now.
   */
  public HtmlWhiteSpaceStripper(Appendable out, int level) {
    this.out = out;
    this.level = level;
  }

  /**
   * Returns the string form of the wrapped output object.
   */
  @Override
  public String toString() {
    return out.toString();
  }

  /**
   * Feeds every character of {@code csq} through the stripper.
   *
   * @param csq The characters to process. As required by {@link Appendable}, {@code null} is
   *        handled as if it were the four characters "null".
   * @return this stripper.
   * @throws IOException if the wrapped output object throws it.
   */
  @Override
  public Appendable append(CharSequence csq) throws IOException {
    CharSequence chars = (csq == null) ? "null" : csq;
    return append(chars, 0, chars.length());
  }

  /**
   * Feeds the characters of {@code csq} at indices {@code start} (inclusive) to {@code end}
   * (exclusive) through the stripper, one at a time.
   *
   * @param csq The characters to process. As required by {@link Appendable}, {@code null} is
   *        handled as if it were the four characters "null".
   * @param start Index of the first character to process.
   * @param end Index after the last character to process. Nothing is processed if it is not
   *        greater than {@code start}.
   * @return this stripper.
   * @throws IOException if the wrapped output object throws it.
   */
  @Override
  public Appendable append(CharSequence csq, int start, int end) throws IOException {
    CharSequence chars = (csq == null) ? "null" : csq;
    for (int i = start; i < end; i++) {
      append(chars.charAt(i));
    }
    return this;
  }

  /**
   * Processes a single character, updating the tag tracking state and forwarding the character
   * to the wrapped output object unless it is whitespace that should be stripped.
   *
   * @param c The character to process.
   * @return this stripper.
   * @throws IOException if the wrapped output object throws it.
   */
  @Override
  public Appendable append(char c) throws IOException {
    if (inOpenTag || inCloseTag) {
      // In an HTML tag. The tag name must be handled before the check for '>'
      // below, because processTagName() still needs to know whether this is an
      // opening or a closing tag.
      trackTagName(c);
      if (c == '>') {
        // We are at the end of the tag.
        inOpenTag = false;
        inCloseTag = false;
        nonWsSeen = true;
      }
      stripLeadingWsAndEmptyLines(c);
    } else {
      // Outside of HTML tag.
      if (c == '<') {
        // Starting a new HTML tag.
        inOpenTag = true;
        startHtmlTag = true;
      }
      if (preScope > 0 || verbatimScope > 0 || textAreaScope > 0) {
        // In an HTML element that we want to preserve whitespace in.
        out.append(c);
      } else if (scriptScope > 0) {
        // Only remove leading whitespace and empty lines.
        stripLeadingWsAndEmptyLines(c);
      } else {
        stripAll(c);
      }
    }

    return this;
  }

  /**
   * Advances the tag parsing state for a character that was read inside an HTML tag: detects
   * closing tags, collects the tag name, and processes the name once it is complete.
   */
  private void trackTagName(char c) {
    if (startHtmlTag) {
      // No character other than '/' has been seen in this tag so far.
      if (c == '/') {
        // We are in a close tag.
        inOpenTag = false;
        inCloseTag = true;
      } else {
        // This is the first non-'/' character in an HTML tag.
        startHtmlTag = false;
        if (isTagNameStartChar(c)) {
          // We have a valid tag name first char.
          inTagName = true;
          tagName.append(c);
        }
      }
    } else if (inTagName) {
      // We were last parsing the name of the HTML tag.
      if (isTagNameChar(c)) {
        tagName.append(c);
      } else {
        // The name is complete; processTagName() also leaves tag name mode.
        processTagName();
      }
    }
  }

  /**
   * Writes {@code c} unless it is whitespace at the start of a line or a line feed ending a line
   * on which no non-whitespace character was seen. Used inside HTML tags and SCRIPT elements.
   */
  private void stripLeadingWsAndEmptyLines(char c) throws IOException {
    if (c == '\n') {
      // Detect and delete empty lines.
      if (nonWsSeen) {
        out.append(c);
      }
      nonWsSeen = false;
    } else if (isWs(c)) {
      // Whitespace is kept only after the first non-whitespace of the line.
      if (nonWsSeen) {
        out.append(c);
      }
    } else {
      nonWsSeen = true;
      out.append(c);
    }
  }

  /**
   * Writes {@code c} applying full stripping: no leading whitespace, no empty lines, no
   * whitespace directly before a line feed, and each run of whitespace reduced to its first
   * character.
   */
  private void stripAll(char c) throws IOException {
    // All that remains is content that is safe to remove whitespace from.
    if (c == '\n') {
      if (nonWsSeen) {
        // We don't want blank lines so we don't output linefeed unless we
        // saw non-whitespace.
        out.append(c);
      }
      // We don't want trailing whitespace.
      pendingWs = NO_PENDING_WS;
      nonWsSeen = false;
    } else if (isWs(c)) {
      // Leading whitespace is omitted. Otherwise remember only the first
      // character of the run; later whitespace in the same run is dropped.
      if (nonWsSeen && pendingWs == NO_PENDING_WS) {
        pendingWs = c;
      }
    } else {
      if (pendingWs != NO_PENDING_WS) {
        out.append(pendingWs);
        pendingWs = NO_PENDING_WS;
      }
      nonWsSeen = true;
      out.append(c);
    }
  }

  /**
   * Returns {@code current + inc}, but never less than zero, so that an unmatched closing tag
   * cannot push a scope counter negative.
   */
  private int updateScope(int current, int inc) {
    int updated = current + inc;
    return updated < 0 ? 0 : updated;
  }

  /**
   * Consumes the collected tag name and, if it names one of the elements with special whitespace
   * rules (compared ignoring case), adjusts that element's scope counter: up for an opening tag,
   * down for a closing tag.
   *
   * <p>This code assumes well-formed HTML as input with HTML elements opening and closing
   * properly in the right order.
   */
  private void processTagName() {
    inTagName = false;
    String name = tagName.toString();
    tagName.setLength(0);
    int inc = inOpenTag ? 1 : -1;
    if ("textarea".equalsIgnoreCase(name)) {
      textAreaScope = updateScope(textAreaScope, inc);
    } else if ("pre".equalsIgnoreCase(name)) {
      preScope = updateScope(preScope, inc);
    } else if ("verbatim".equalsIgnoreCase(name)) {
      verbatimScope = updateScope(verbatimScope, inc);
    } else if ("script".equalsIgnoreCase(name)) {
      scriptScope = updateScope(scriptScope, inc);
    }
  }

  /**
   * Returns whether {@code c} may start a tag name, i.e. is an ASCII letter.
   */
  private boolean isTagNameStartChar(char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
  }

  /**
   * Returns whether {@code c} may appear in a tag name after its first character: an ASCII
   * letter or digit, or one of '_', '-', ':' and '.'. From W3C HTML spec.
   */
  private boolean isTagNameChar(char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || (c == '_')
        || (c == '-') || (c == ':') || (c == '.');
  }

  /**
   * Returns whether {@code c} is a space, tab or carriage return.
   *
   * <p>Note, we treat '\n' as a separate special character as it has special rules since it
   * determines what a 'line' of content is for doing leading and trailing whitespace removal and
   * empty line removal.
   */
  private boolean isWs(char c) {
    return c == ' ' || c == '\t' || c == '\r';
  }
}
278