1/* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "config.h" 29#include "core/html/parser/HTMLEntityParser.h" 30 31#include "core/html/parser/HTMLEntitySearch.h" 32#include "core/html/parser/HTMLEntityTable.h" 33#include "wtf/text/StringBuilder.h" 34 35using namespace WTF; 36 37namespace blink { 38 39static const UChar windowsLatin1ExtensionArray[32] = { 40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F 44}; 45 46static bool isAlphaNumeric(UChar cc) 47{ 48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); 49} 50 51static UChar adjustEntity(UChar32 value) 52{ 53 if ((value & ~0x1F) != 0x0080) 54 return value; 55 return windowsLatin1ExtensionArray[value - 0x80]; 56} 57 58static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity) 59{ 60 // FIXME: A number of specific entity values generate parse errors. 61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) { 62 decodedEntity.append(0xFFFD); 63 return; 64 } 65 if (U_IS_BMP(c)) { 66 decodedEntity.append(adjustEntity(c)); 67 return; 68 } 69 decodedEntity.append(c); 70} 71 72static const UChar32 kInvalidUnicode = -1; 73 74static bool isHexDigit(UChar cc) 75{ 76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); 77} 78 79static UChar asHexDigit(UChar cc) 80{ 81 if (cc >= '0' && cc <= '9') 82 return cc - '0'; 83 if (cc >= 'a' && cc <= 'z') 84 return 10 + cc - 'a'; 85 if (cc >= 'A' && cc <= 'Z') 86 return 10 + cc - 'A'; 87 ASSERT_NOT_REACHED(); 88 return 0; 89} 90 91typedef Vector<UChar, 64> ConsumedCharacterBuffer; 92 93static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters) 94{ 95 if (consumedCharacters.size() == 1) 96 source.push(consumedCharacters[0]); 97 else if (consumedCharacters.size() == 2) { 98 source.push(consumedCharacters[0]); 99 source.push(consumedCharacters[1]); 100 } else 101 source.prepend(SegmentedString(String(consumedCharacters))); 102} 103 104static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) 105{ 106 ConsumedCharacterBuffer consumedCharacters; 107 HTMLEntitySearch entitySearch; 108 while (!source.isEmpty()) { 109 cc = source.currentChar(); 110 entitySearch.advance(cc); 111 if (!entitySearch.isEntityPrefix()) 112 break; 113 consumedCharacters.append(cc); 114 source.advanceAndASSERT(cc); 115 } 116 notEnoughCharacters = source.isEmpty(); 117 if (notEnoughCharacters) { 118 // We can't decide on an entity because there might be a longer entity 119 // that we could match if we had more data. 120 unconsumeCharacters(source, consumedCharacters); 121 return false; 122 } 123 if (!entitySearch.mostRecentMatch()) { 124 unconsumeCharacters(source, consumedCharacters); 125 return false; 126 } 127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { 128 // We've consumed too many characters. We need to walk the 129 // source back to the point at which we had consumed an 130 // actual entity. 131 unconsumeCharacters(source, consumedCharacters); 132 consumedCharacters.clear(); 133 const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch(); 134 const int length = mostRecent->length; 135 const LChar* reference = HTMLEntityTable::entityString(*mostRecent); 136 for (int i = 0; i < length; ++i) { 137 cc = source.currentChar(); 138 ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++)); 139 consumedCharacters.append(cc); 140 source.advanceAndASSERT(cc); 141 ASSERT(!source.isEmpty()); 142 } 143 cc = source.currentChar(); 144 } 145 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' 146 || !additionalAllowedCharacter 147 || !(isAlphaNumeric(cc) || cc == '=')) { 148 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue); 149 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue) 150 decodedEntity.append(second); 151 return true; 152 } 153 unconsumeCharacters(source, consumedCharacters); 154 return false; 155} 156 157bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) 158{ 159 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); 160 ASSERT(!notEnoughCharacters); 161 ASSERT(decodedEntity.isEmpty()); 162 163 enum EntityState { 164 Initial, 165 Number, 166 MaybeHexLowerCaseX, 167 MaybeHexUpperCaseX, 168 Hex, 169 Decimal, 170 Named 171 }; 172 EntityState entityState = Initial; 173 UChar32 result = 0; 174 ConsumedCharacterBuffer consumedCharacters; 175 176 while (!source.isEmpty()) { 177 UChar cc = source.currentChar(); 178 switch (entityState) { 179 case Initial: { 180 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') 181 return false; 182 if (additionalAllowedCharacter && cc == additionalAllowedCharacter) 183 return false; 184 if (cc == '#') { 185 entityState = Number; 186 break; 187 } 188 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { 189 entityState = Named; 190 continue; 191 } 192 return false; 193 } 194 case Number: { 195 if (cc == 'x') { 196 entityState = MaybeHexLowerCaseX; 197 break; 198 } 199 if (cc == 'X') { 200 entityState = MaybeHexUpperCaseX; 201 break; 202 } 203 if (cc >= '0' && cc <= '9') { 204 entityState = Decimal; 205 continue; 206 } 207 source.push('#'); 208 return false; 209 } 210 case MaybeHexLowerCaseX: { 211 if (isHexDigit(cc)) { 212 entityState = Hex; 213 continue; 214 } 215 source.push('#'); 216 source.push('x'); 217 return false; 218 } 219 case MaybeHexUpperCaseX: { 220 if (isHexDigit(cc)) { 221 entityState = Hex; 222 continue; 223 } 224 source.push('#'); 225 source.push('X'); 226 return false; 227 } 228 case Hex: { 229 if (isHexDigit(cc)) { 230 if (result != kInvalidUnicode) 231 result = result * 16 + asHexDigit(cc); 232 } else if (cc == ';') { 233 source.advanceAndASSERT(cc); 234 appendLegalEntityFor(result, decodedEntity); 235 return true; 236 } else { 237 appendLegalEntityFor(result, decodedEntity); 238 return true; 239 } 240 break; 241 } 242 case Decimal: { 243 if (cc >= '0' && cc <= '9') { 244 if (result != kInvalidUnicode) 245 result = result * 10 + cc - '0'; 246 } else if (cc == ';') { 247 source.advanceAndASSERT(cc); 248 appendLegalEntityFor(result, decodedEntity); 249 return true; 250 } else { 251 appendLegalEntityFor(result, decodedEntity); 252 return true; 253 } 254 break; 255 } 256 case Named: { 257 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc); 258 } 259 } 260 261 if (result > UCHAR_MAX_VALUE) 262 result = kInvalidUnicode; 263 264 consumedCharacters.append(cc); 265 source.advanceAndASSERT(cc); 266 } 267 ASSERT(source.isEmpty()); 268 notEnoughCharacters = true; 269 unconsumeCharacters(source, consumedCharacters); 270 return false; 271} 272 273static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result) 274{ 275 if (U_IS_BMP(value)) { 276 UChar character = static_cast<UChar>(value); 277 ASSERT(character == value); 278 result[0] = character; 279 return 1; 280 } 281 282 result[0] = U16_LEAD(value); 283 result[1] = U16_TRAIL(value); 284 return 2; 285} 286 287size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4]) 288{ 289 HTMLEntitySearch search; 290 while (*name) { 291 search.advance(*name++); 292 if (!search.isEntityPrefix()) 293 return 0; 294 } 295 search.advance(';'); 296 if (!search.isEntityPrefix()) 297 return 0; 298 299 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result); 300 if (!search.mostRecentMatch()->secondValue) 301 return numberOfCodePoints; 302 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints); 303} 304 305} // namespace blink 306