HTMLToken.h revision ab9e7a118cf1ea2e3a93dce683b2ded3e7291ddb
1/* 2 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#ifndef HTMLToken_h 27#define HTMLToken_h 28 29#include "NamedNodeMap.h" 30#include <wtf/PassOwnPtr.h> 31#include <wtf/Vector.h> 32 33namespace WebCore { 34 35class HTMLToken { 36 WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED; 37public: 38 enum Type { 39 Uninitialized, 40 DOCTYPE, 41 StartTag, 42 EndTag, 43 Comment, 44 Character, 45 EndOfFile, 46 }; 47 48 class Range { 49 public: 50 int m_start; 51 int m_end; 52 }; 53 54 class Attribute { 55 public: 56 Range m_nameRange; 57 Range m_valueRange; 58 WTF::Vector<UChar, 32> m_name; 59 WTF::Vector<UChar, 32> m_value; 60 }; 61 62 typedef WTF::Vector<Attribute, 10> AttributeList; 63 typedef WTF::Vector<UChar, 1024> DataVector; 64 65 HTMLToken() { clear(); } 66 67 void clear(int startIndex = 0) 68 { 69 m_type = Uninitialized; 70 m_range.m_start = startIndex; 71 m_range.m_end = startIndex; 72 m_data.clear(); 73 } 74 75 int startIndex() const { return m_range.m_start; } 76 int endIndex() const { return m_range.m_end; } 77 78 void end(int endIndex) 79 { 80 m_range.m_end = endIndex; 81 } 82 83 void makeEndOfFile() 84 { 85 ASSERT(m_type == Uninitialized); 86 m_type = EndOfFile; 87 } 88 89 void beginStartTag(UChar character) 90 { 91 ASSERT(character); 92 ASSERT(m_type == Uninitialized); 93 m_type = StartTag; 94 m_selfClosing = false; 95 m_currentAttribute = 0; 96 m_attributes.clear(); 97 98 m_data.append(character); 99 } 100 101 template<typename T> 102 void beginEndTag(T characters) 103 { 104 ASSERT(m_type == Uninitialized); 105 m_type = EndTag; 106 m_selfClosing = false; 107 m_currentAttribute = 0; 108 m_attributes.clear(); 109 110 m_data.append(characters); 111 } 112 113 // Starting a character token works slightly differently than starting 114 // other types of tokens because we want to save a per-character branch. 115 void ensureIsCharacterToken() 116 { 117 ASSERT(m_type == Uninitialized || m_type == Character); 118 m_type = Character; 119 } 120 121 void beginComment() 122 { 123 ASSERT(m_type == Uninitialized); 124 m_type = Comment; 125 } 126 127 void beginDOCTYPE() 128 { 129 ASSERT(m_type == Uninitialized); 130 m_type = DOCTYPE; 131 m_doctypeData = adoptPtr(new DoctypeData()); 132 } 133 134 void beginDOCTYPE(UChar character) 135 { 136 ASSERT(character); 137 beginDOCTYPE(); 138 m_data.append(character); 139 } 140 141 void appendToName(UChar character) 142 { 143 ASSERT(character); 144 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 145 m_data.append(character); 146 } 147 148 template<typename T> 149 void appendToCharacter(T characters) 150 { 151 ASSERT(m_type == Character); 152 m_data.append(characters); 153 } 154 155 void appendToComment(UChar character) 156 { 157 ASSERT(character); 158 ASSERT(m_type == Comment); 159 m_data.append(character); 160 } 161 162 void addNewAttribute() 163 { 164 ASSERT(m_type == StartTag || m_type == EndTag); 165 m_attributes.grow(m_attributes.size() + 1); 166 m_currentAttribute = &m_attributes.last(); 167#ifndef NDEBUG 168 m_currentAttribute->m_nameRange.m_start = 0; 169 m_currentAttribute->m_nameRange.m_end = 0; 170 m_currentAttribute->m_valueRange.m_start = 0; 171 m_currentAttribute->m_valueRange.m_end = 0; 172#endif 173 } 174 175 void beginAttributeName(int index) 176 { 177 m_currentAttribute->m_nameRange.m_start = index; 178 } 179 180 void endAttributeName(int index) 181 { 182 m_currentAttribute->m_nameRange.m_end = index; 183 m_currentAttribute->m_valueRange.m_start = index; 184 m_currentAttribute->m_valueRange.m_end = index; 185 } 186 187 void beginAttributeValue(int index) 188 { 189 m_currentAttribute->m_valueRange.m_start = index; 190#ifndef NDEBUG 191 m_currentAttribute->m_valueRange.m_end = 0; 192#endif 193 } 194 195 void endAttributeValue(int index) 196 { 197 m_currentAttribute->m_valueRange.m_end = index; 198 } 199 200 void appendToAttributeName(UChar character) 201 { 202 ASSERT(character); 203 ASSERT(m_type == StartTag || m_type == EndTag); 204 ASSERT(m_currentAttribute->m_nameRange.m_start); 205 m_currentAttribute->m_name.append(character); 206 } 207 208 void appendToAttributeValue(UChar character) 209 { 210 ASSERT(character); 211 ASSERT(m_type == StartTag || m_type == EndTag); 212 ASSERT(m_currentAttribute->m_valueRange.m_start); 213 m_currentAttribute->m_value.append(character); 214 } 215 216 Type type() const { return m_type; } 217 218 bool selfClosing() const 219 { 220 ASSERT(m_type == StartTag || m_type == EndTag); 221 return m_selfClosing; 222 } 223 224 void setSelfClosing() 225 { 226 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 227 m_selfClosing = true; 228 } 229 230 const AttributeList& attributes() const 231 { 232 ASSERT(m_type == StartTag || m_type == EndTag); 233 return m_attributes; 234 } 235 236 const DataVector& name() const 237 { 238 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 239 return m_data; 240 } 241 242 const DataVector& characters() const 243 { 244 ASSERT(m_type == Character); 245 return m_data; 246 } 247 248 const DataVector& comment() const 249 { 250 ASSERT(m_type == Comment); 251 return m_data; 252 } 253 254 // FIXME: Distinguish between a missing public identifer and an empty one. 255 const WTF::Vector<UChar>& publicIdentifier() const 256 { 257 ASSERT(m_type == DOCTYPE); 258 return m_doctypeData->m_publicIdentifier; 259 } 260 261 // FIXME: Distinguish between a missing system identifer and an empty one. 262 const WTF::Vector<UChar>& systemIdentifier() const 263 { 264 ASSERT(m_type == DOCTYPE); 265 return m_doctypeData->m_systemIdentifier; 266 } 267 268 void setPublicIdentifierToEmptyString() 269 { 270 ASSERT(m_type == DOCTYPE); 271 m_doctypeData->m_hasPublicIdentifier = true; 272 m_doctypeData->m_publicIdentifier.clear(); 273 } 274 275 void setSystemIdentifierToEmptyString() 276 { 277 ASSERT(m_type == DOCTYPE); 278 m_doctypeData->m_hasSystemIdentifier = true; 279 m_doctypeData->m_systemIdentifier.clear(); 280 } 281 282 bool forceQuirks() const 283 { 284 ASSERT(m_type == DOCTYPE); 285 return m_doctypeData->m_forceQuirks; 286 } 287 288 void setForceQuirks() 289 { 290 ASSERT(m_type == DOCTYPE); 291 m_doctypeData->m_forceQuirks = true; 292 } 293 294 void appendToPublicIdentifier(UChar character) 295 { 296 ASSERT(character); 297 ASSERT(m_type == DOCTYPE); 298 ASSERT(m_doctypeData->m_hasPublicIdentifier); 299 m_doctypeData->m_publicIdentifier.append(character); 300 } 301 302 void appendToSystemIdentifier(UChar character) 303 { 304 ASSERT(character); 305 ASSERT(m_type == DOCTYPE); 306 ASSERT(m_doctypeData->m_hasSystemIdentifier); 307 m_doctypeData->m_systemIdentifier.append(character); 308 } 309 310private: 311 // FIXME: I'm not sure what the final relationship between HTMLToken and 312 // AtomicHTMLToken will be. I'm marking this a friend for now, but we'll 313 // want to end up with a cleaner interface between the two classes. 314 friend class AtomicHTMLToken; 315 316 class DoctypeData { 317 WTF_MAKE_NONCOPYABLE(DoctypeData); 318 public: 319 DoctypeData() 320 : m_hasPublicIdentifier(false) 321 , m_hasSystemIdentifier(false) 322 , m_forceQuirks(false) 323 { 324 } 325 326 bool m_hasPublicIdentifier; 327 bool m_hasSystemIdentifier; 328 bool m_forceQuirks; 329 WTF::Vector<UChar> m_publicIdentifier; 330 WTF::Vector<UChar> m_systemIdentifier; 331 }; 332 333 Type m_type; 334 335 // Which characters from the input stream are represented by this token. 336 Range m_range; 337 338 // "name" for DOCTYPE, StartTag, and EndTag 339 // "characters" for Character 340 // "data" for Comment 341 DataVector m_data; 342 343 // For DOCTYPE 344 OwnPtr<DoctypeData> m_doctypeData; 345 346 // For StartTag and EndTag 347 bool m_selfClosing; 348 AttributeList m_attributes; 349 350 // A pointer into m_attributes used during lexing. 351 Attribute* m_currentAttribute; 352}; 353 354// FIXME: This class should eventually be named HTMLToken once we move the 355// exiting HTMLToken to be internal to the HTMLTokenizer. 356class AtomicHTMLToken { 357 WTF_MAKE_NONCOPYABLE(AtomicHTMLToken); 358public: 359 AtomicHTMLToken(HTMLToken& token) 360 : m_type(token.type()) 361 { 362 switch (m_type) { 363 case HTMLToken::Uninitialized: 364 ASSERT_NOT_REACHED(); 365 break; 366 case HTMLToken::DOCTYPE: 367 m_name = AtomicString(token.name().data(), token.name().size()); 368 m_doctypeData = token.m_doctypeData.release(); 369 break; 370 case HTMLToken::EndOfFile: 371 break; 372 case HTMLToken::StartTag: 373 case HTMLToken::EndTag: { 374 m_selfClosing = token.selfClosing(); 375 m_name = AtomicString(token.name().data(), token.name().size()); 376 const HTMLToken::AttributeList& attributes = token.attributes(); 377 for (HTMLToken::AttributeList::const_iterator iter = attributes.begin(); 378 iter != attributes.end(); ++iter) { 379 if (!iter->m_name.isEmpty()) { 380 String name(iter->m_name.data(), iter->m_name.size()); 381 String value(iter->m_value.data(), iter->m_value.size()); 382 ASSERT(iter->m_nameRange.m_start); 383 ASSERT(iter->m_nameRange.m_end); 384 ASSERT(iter->m_valueRange.m_start); 385 ASSERT(iter->m_valueRange.m_end); 386 RefPtr<Attribute> mappedAttribute = Attribute::createMapped(name, value); 387 if (!m_attributes) { 388 m_attributes = NamedNodeMap::create(); 389 // Reserving capacity here improves the parser 390 // benchmark. It might be worth experimenting with 391 // the constant to see where the optimal point is. 392 m_attributes->reserveInitialCapacity(10); 393 } 394 m_attributes->insertAttribute(mappedAttribute.release(), false); 395 } 396 } 397 break; 398 } 399 case HTMLToken::Comment: 400 m_data = String(token.comment().data(), token.comment().size()); 401 break; 402 case HTMLToken::Character: 403 m_externalCharacters = &token.characters(); 404 break; 405 } 406 } 407 408 AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0) 409 : m_type(type) 410 , m_name(name) 411 , m_attributes(attributes) 412 { 413 ASSERT(usesName()); 414 } 415 416 HTMLToken::Type type() const { return m_type; } 417 418 const AtomicString& name() const 419 { 420 ASSERT(usesName()); 421 return m_name; 422 } 423 424 void setName(const AtomicString& name) 425 { 426 ASSERT(usesName()); 427 m_name = name; 428 } 429 430 bool selfClosing() const 431 { 432 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 433 return m_selfClosing; 434 } 435 436 Attribute* getAttributeItem(const QualifiedName& attributeName) 437 { 438 ASSERT(usesAttributes()); 439 if (!m_attributes) 440 return 0; 441 return m_attributes->getAttributeItem(attributeName); 442 } 443 444 NamedNodeMap* attributes() const 445 { 446 ASSERT(usesAttributes()); 447 return m_attributes.get(); 448 } 449 450 PassRefPtr<NamedNodeMap> takeAtributes() 451 { 452 ASSERT(usesAttributes()); 453 return m_attributes.release(); 454 } 455 456 const HTMLToken::DataVector& characters() const 457 { 458 ASSERT(m_type == HTMLToken::Character); 459 return *m_externalCharacters; 460 } 461 462 const String& comment() const 463 { 464 ASSERT(m_type == HTMLToken::Comment); 465 return m_data; 466 } 467 468 // FIXME: Distinguish between a missing public identifer and an empty one. 469 WTF::Vector<UChar>& publicIdentifier() const 470 { 471 ASSERT(m_type == HTMLToken::DOCTYPE); 472 return m_doctypeData->m_publicIdentifier; 473 } 474 475 // FIXME: Distinguish between a missing system identifer and an empty one. 476 WTF::Vector<UChar>& systemIdentifier() const 477 { 478 ASSERT(m_type == HTMLToken::DOCTYPE); 479 return m_doctypeData->m_systemIdentifier; 480 } 481 482 bool forceQuirks() const 483 { 484 ASSERT(m_type == HTMLToken::DOCTYPE); 485 return m_doctypeData->m_forceQuirks; 486 } 487 488private: 489 HTMLToken::Type m_type; 490 491 bool usesName() const 492 { 493 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 494 } 495 496 bool usesAttributes() const 497 { 498 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 499 } 500 501 // "name" for DOCTYPE, StartTag, and EndTag 502 AtomicString m_name; 503 504 // "data" for Comment 505 String m_data; 506 507 // "characters" for Character 508 // 509 // We don't want to copy the the characters out of the HTMLToken, so we 510 // keep a pointer to its buffer instead. This buffer is owned by the 511 // HTMLToken and causes a lifetime dependence between these objects. 512 // 513 // FIXME: Add a mechanism for "internalizing" the characters when the 514 // HTMLToken is destructed. 515 const HTMLToken::DataVector* m_externalCharacters; 516 517 // For DOCTYPE 518 OwnPtr<HTMLToken::DoctypeData> m_doctypeData; 519 520 // For StartTag and EndTag 521 bool m_selfClosing; 522 523 RefPtr<NamedNodeMap> m_attributes; 524}; 525 526} 527 528#endif 529