1/*
2 * Copyright (C) 2010 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.clearsilver.jsilver.functions.html;
18
19import com.google.clearsilver.jsilver.functions.TextFilter;
20
21import java.io.IOException;
22import java.util.Collections;
23import java.util.HashMap;
24import java.util.Map;
25
26/**
27 * This class implements the html_strip function. It removes html tags from text, and expands
28 * numbered and named html entities to their corresponding special characters.
29 */
30public class HtmlStripFunction implements TextFilter {
31
32  // The maximum length of an entity (preceded by an &)
33  private static final int MAX_AMP_LENGTH = 9;
34
35  // The state the strip function can be, normal, in an amp escaped entity or
36  // inside a html tag.
37  private enum State {
38    DEFAULT, IN_AMP, IN_TAG
39  }
40
41  // Map of entity names to special characters.
42  private static final Map<String, String> entityValues;
43
44  // Initialize the entityName lookup map.
45  static {
46    Map<String, String> tempMap = new HashMap<String, String>();
47
48    // Html specific characters.
49    tempMap.put("amp", "&");
50    tempMap.put("quot", "\"");
51    tempMap.put("gt", ">");
52    tempMap.put("lt", "<");
53
54    tempMap.put("agrave", "\u00e0");
55    tempMap.put("aacute", "\u00e1");
56    tempMap.put("acirc", "\u00e2");
57    tempMap.put("atilde", "\u00e3");
58    tempMap.put("auml", "\u00e4");
59    tempMap.put("aring", "\u00e5");
60    tempMap.put("aelig", "\u00e6");
61    tempMap.put("ccedil", "\u00e7");
62    tempMap.put("egrave", "\u00e8");
63    tempMap.put("eacute", "\u00e9");
64    tempMap.put("ecirc", "\u00ea");
65    tempMap.put("euml", "\u00eb");
66    tempMap.put("eth", "\u00f0");
67    tempMap.put("igrave", "\u00ec");
68    tempMap.put("iacute", "\u00ed");
69    tempMap.put("icirc", "\u00ee");
70    tempMap.put("iuml", "\u00ef");
71    tempMap.put("ntilde", "\u00f1");
72    tempMap.put("nbsp", " ");
73    tempMap.put("ograve", "\u00f2");
74    tempMap.put("oacute", "\u00f3");
75    tempMap.put("ocirc", "\u00f4");
76    tempMap.put("otilde", "\u00f5");
77    tempMap.put("ouml", "\u00f6");
78    tempMap.put("oslash", "\u00f8");
79    tempMap.put("szlig", "\u00df");
80    tempMap.put("thorn", "\u00fe");
81    tempMap.put("ugrave", "\u00f9");
82    tempMap.put("uacute", "\u00fa");
83    tempMap.put("ucirc", "\u00fb");
84    tempMap.put("uuml", "\u00fc");
85    tempMap.put("yacute", "\u00fd");
86
87    // Clearsilver's Copyright symbol!
88    tempMap.put("copy", "(C)");
89
90    // Copy the temporary map to an unmodifiable map for the static lookup.
91    entityValues = Collections.unmodifiableMap(tempMap);
92  }
93
94  @Override
95  public void filter(String in, Appendable out) throws IOException {
96    char[] inChars = in.toCharArray();
97
98    // Holds the contents of an & (amp) entity before its decoded.
99    StringBuilder amp = new StringBuilder();
100    State state = State.DEFAULT;
101
102    // Loop over the input string, ignoring tags, and decoding entities.
103    for (int i = 0; i < inChars.length; i++) {
104      char c = inChars[i];
105      switch (state) {
106
107        case DEFAULT:
108          switch (c) {
109            case '&':
110              state = State.IN_AMP;
111              break;
112            case '<':
113              state = State.IN_TAG;
114              break;
115            default:
116              // If this is isn't the start of an amp of a tag, treat as plain
117              // text and just output.
118              out.append(c);
119          }
120          break;
121
122        case IN_TAG:
123          // When in a tag, all input is ignored until the end of the tag.
124          if (c == '>') {
125            state = State.DEFAULT;
126          }
127          break;
128
129        case IN_AMP:
130          // Semi-colon terminates an entity, try and decode it.
131          if (c == ';') {
132            state = State.DEFAULT;
133            appendDecodedEntityReference(out, amp);
134            amp = new StringBuilder();
135          } else {
136            if (amp.length() < MAX_AMP_LENGTH) {
137              // If this is not the last character in the input, append to the
138              // amp buffer and continue, if it is the last, dump the buffer
139              // to stop the contents of it being lost.
140              if (i != inChars.length - 1) {
141                amp.append(c);
142              } else {
143                out.append('&').append(amp).append(c);
144              }
145            } else {
146              // More than 8 chars, so not a valid entity, dump as plain text.
147              out.append('&').append(amp).append(c);
148              amp = new StringBuilder();
149              state = State.DEFAULT;
150            }
151          }
152          break;
153      }
154    }
155  }
156
157  /**
158   * Attempts to decode the entity provided, if it succeeds appends it to the out string.
159   *
160   * @param out the string builder to add the decoded entity to.
161   * @param entityName to decode.
162   */
163  private void appendDecodedEntityReference(Appendable out, CharSequence entityName)
164      throws IOException {
165
166    // All the valid entities are at least two characters long.
167    if (entityName.length() < 2) {
168      return;
169    }
170
171    entityName = entityName.toString().toLowerCase();
172
173    // Numbered entity.
174    if (entityName.charAt(0) == '#') {
175      appendNumberedEntity(out, entityName.subSequence(1, entityName.length()));
176      return;
177    }
178
179    // If the entity is not a numeric value, try looking it up by name.
180    String entity = entityValues.get(entityName);
181
182    // If there is an entity by that name add it to the output.
183    if (entity != null) {
184      out.append(entity);
185    }
186  }
187
188  /**
189   * Appends an entity to a string by numeric code.
190   *
191   * @param out the string to add the entity to.
192   * @param entity the numeric code for the entity as a char sequence.
193   */
194  private void appendNumberedEntity(Appendable out, CharSequence entity) throws IOException {
195
196    if (entity.length() != 0) {
197      try {
198        char c;
199        // Hex numbered entities start with x.
200        if (entity.charAt(0) == 'x') {
201          c = (char) Integer.parseInt(entity.subSequence(1, entity.length()).toString(), 16);
202        } else {
203          // If its numbered, but not hex, its decimal.
204          c = (char) Integer.parseInt(entity.toString(), 10);
205        }
206
207        // Don't append null characters, this is to remain Clearsilver compatible.
208        if (c != '\u0000') {
209          out.append(c);
210        }
211      } catch (NumberFormatException e) {
212        // Do nothing if this is not a valid numbered entity.
213      }
214    }
215  }
216}
217