1/* 2 * Copyright (C) 2010 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.google.clearsilver.jsilver.functions.html; 18 19import com.google.clearsilver.jsilver.functions.TextFilter; 20 21import java.io.IOException; 22import java.util.Collections; 23import java.util.HashMap; 24import java.util.Map; 25 26/** 27 * This class implements the html_strip function. It removes html tags from text, and expands 28 * numbered and named html entities to their corresponding special characters. 29 */ 30public class HtmlStripFunction implements TextFilter { 31 32 // The maximum length of an entity (preceded by an &) 33 private static final int MAX_AMP_LENGTH = 9; 34 35 // The state the strip function can be, normal, in an amp escaped entity or 36 // inside a html tag. 37 private enum State { 38 DEFAULT, IN_AMP, IN_TAG 39 } 40 41 // Map of entity names to special characters. 42 private static final Map<String, String> entityValues; 43 44 // Initialize the entityName lookup map. 45 static { 46 Map<String, String> tempMap = new HashMap<String, String>(); 47 48 // Html specific characters. 49 tempMap.put("amp", "&"); 50 tempMap.put("quot", "\""); 51 tempMap.put("gt", ">"); 52 tempMap.put("lt", "<"); 53 54 tempMap.put("agrave", "\u00e0"); 55 tempMap.put("aacute", "\u00e1"); 56 tempMap.put("acirc", "\u00e2"); 57 tempMap.put("atilde", "\u00e3"); 58 tempMap.put("auml", "\u00e4"); 59 tempMap.put("aring", "\u00e5"); 60 tempMap.put("aelig", "\u00e6"); 61 tempMap.put("ccedil", "\u00e7"); 62 tempMap.put("egrave", "\u00e8"); 63 tempMap.put("eacute", "\u00e9"); 64 tempMap.put("ecirc", "\u00ea"); 65 tempMap.put("euml", "\u00eb"); 66 tempMap.put("eth", "\u00f0"); 67 tempMap.put("igrave", "\u00ec"); 68 tempMap.put("iacute", "\u00ed"); 69 tempMap.put("icirc", "\u00ee"); 70 tempMap.put("iuml", "\u00ef"); 71 tempMap.put("ntilde", "\u00f1"); 72 tempMap.put("nbsp", " "); 73 tempMap.put("ograve", "\u00f2"); 74 tempMap.put("oacute", "\u00f3"); 75 tempMap.put("ocirc", "\u00f4"); 76 tempMap.put("otilde", "\u00f5"); 77 tempMap.put("ouml", "\u00f6"); 78 tempMap.put("oslash", "\u00f8"); 79 tempMap.put("szlig", "\u00df"); 80 tempMap.put("thorn", "\u00fe"); 81 tempMap.put("ugrave", "\u00f9"); 82 tempMap.put("uacute", "\u00fa"); 83 tempMap.put("ucirc", "\u00fb"); 84 tempMap.put("uuml", "\u00fc"); 85 tempMap.put("yacute", "\u00fd"); 86 87 // Clearsilver's Copyright symbol! 88 tempMap.put("copy", "(C)"); 89 90 // Copy the temporary map to an unmodifiable map for the static lookup. 91 entityValues = Collections.unmodifiableMap(tempMap); 92 } 93 94 @Override 95 public void filter(String in, Appendable out) throws IOException { 96 char[] inChars = in.toCharArray(); 97 98 // Holds the contents of an & (amp) entity before its decoded. 99 StringBuilder amp = new StringBuilder(); 100 State state = State.DEFAULT; 101 102 // Loop over the input string, ignoring tags, and decoding entities. 103 for (int i = 0; i < inChars.length; i++) { 104 char c = inChars[i]; 105 switch (state) { 106 107 case DEFAULT: 108 switch (c) { 109 case '&': 110 state = State.IN_AMP; 111 break; 112 case '<': 113 state = State.IN_TAG; 114 break; 115 default: 116 // If this is isn't the start of an amp of a tag, treat as plain 117 // text and just output. 118 out.append(c); 119 } 120 break; 121 122 case IN_TAG: 123 // When in a tag, all input is ignored until the end of the tag. 124 if (c == '>') { 125 state = State.DEFAULT; 126 } 127 break; 128 129 case IN_AMP: 130 // Semi-colon terminates an entity, try and decode it. 131 if (c == ';') { 132 state = State.DEFAULT; 133 appendDecodedEntityReference(out, amp); 134 amp = new StringBuilder(); 135 } else { 136 if (amp.length() < MAX_AMP_LENGTH) { 137 // If this is not the last character in the input, append to the 138 // amp buffer and continue, if it is the last, dump the buffer 139 // to stop the contents of it being lost. 140 if (i != inChars.length - 1) { 141 amp.append(c); 142 } else { 143 out.append('&').append(amp).append(c); 144 } 145 } else { 146 // More than 8 chars, so not a valid entity, dump as plain text. 147 out.append('&').append(amp).append(c); 148 amp = new StringBuilder(); 149 state = State.DEFAULT; 150 } 151 } 152 break; 153 } 154 } 155 } 156 157 /** 158 * Attempts to decode the entity provided, if it succeeds appends it to the out string. 159 * 160 * @param out the string builder to add the decoded entity to. 161 * @param entityName to decode. 162 */ 163 private void appendDecodedEntityReference(Appendable out, CharSequence entityName) 164 throws IOException { 165 166 // All the valid entities are at least two characters long. 167 if (entityName.length() < 2) { 168 return; 169 } 170 171 entityName = entityName.toString().toLowerCase(); 172 173 // Numbered entity. 174 if (entityName.charAt(0) == '#') { 175 appendNumberedEntity(out, entityName.subSequence(1, entityName.length())); 176 return; 177 } 178 179 // If the entity is not a numeric value, try looking it up by name. 180 String entity = entityValues.get(entityName); 181 182 // If there is an entity by that name add it to the output. 183 if (entity != null) { 184 out.append(entity); 185 } 186 } 187 188 /** 189 * Appends an entity to a string by numeric code. 190 * 191 * @param out the string to add the entity to. 192 * @param entity the numeric code for the entity as a char sequence. 193 */ 194 private void appendNumberedEntity(Appendable out, CharSequence entity) throws IOException { 195 196 if (entity.length() != 0) { 197 try { 198 char c; 199 // Hex numbered entities start with x. 200 if (entity.charAt(0) == 'x') { 201 c = (char) Integer.parseInt(entity.subSequence(1, entity.length()).toString(), 16); 202 } else { 203 // If its numbered, but not hex, its decimal. 204 c = (char) Integer.parseInt(entity.toString(), 10); 205 } 206 207 // Don't append null characters, this is to remain Clearsilver compatible. 208 if (c != '\u0000') { 209 out.append(c); 210 } 211 } catch (NumberFormatException e) { 212 // Do nothing if this is not a valid numbered entity. 213 } 214 } 215 } 216} 217