HTMLToken.h revision 2fc2651226baac27029e38c9d6ef883fa32084db
1/* 2 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#ifndef HTMLToken_h 27#define HTMLToken_h 28 29#include "NamedNodeMap.h" 30#include <wtf/PassOwnPtr.h> 31#include <wtf/Vector.h> 32 33namespace WebCore { 34 35class HTMLToken { 36 WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED; 37public: 38 enum Type { 39 Uninitialized, 40 DOCTYPE, 41 StartTag, 42 EndTag, 43 Comment, 44 Character, 45 EndOfFile, 46 }; 47 48 class Range { 49 public: 50 int m_start; 51 int m_end; 52 }; 53 54 class Attribute { 55 public: 56 Range m_nameRange; 57 Range m_valueRange; 58 WTF::Vector<UChar, 32> m_name; 59 WTF::Vector<UChar, 32> m_value; 60 }; 61 62 typedef WTF::Vector<Attribute, 10> AttributeList; 63 typedef WTF::Vector<UChar, 1024> DataVector; 64 65 HTMLToken() { clear(); } 66 67 void clear() 68 { 69 m_type = Uninitialized; 70 m_range.m_start = 0; 71 m_range.m_end = 0; 72 m_baseOffset = 0; 73 m_data.clear(); 74 } 75 76 int startIndex() const { return m_range.m_start; } 77 int endIndex() const { return m_range.m_end; } 78 79 void setBaseOffset(int offset) 80 { 81 m_baseOffset = offset; 82 } 83 84 void end(int endOffset) 85 { 86 m_range.m_end = endOffset - m_baseOffset; 87 } 88 89 void makeEndOfFile() 90 { 91 ASSERT(m_type == Uninitialized); 92 m_type = EndOfFile; 93 } 94 95 void beginStartTag(UChar character) 96 { 97 ASSERT(character); 98 ASSERT(m_type == Uninitialized); 99 m_type = StartTag; 100 m_selfClosing = false; 101 m_currentAttribute = 0; 102 m_attributes.clear(); 103 104 m_data.append(character); 105 } 106 107 template<typename T> 108 void beginEndTag(T characters) 109 { 110 ASSERT(m_type == Uninitialized); 111 m_type = EndTag; 112 m_selfClosing = false; 113 m_currentAttribute = 0; 114 m_attributes.clear(); 115 116 m_data.append(characters); 117 } 118 119 // Starting a character token works slightly differently than starting 120 // other types of tokens because we want to save a per-character branch. 121 void ensureIsCharacterToken() 122 { 123 ASSERT(m_type == Uninitialized || m_type == Character); 124 m_type = Character; 125 } 126 127 void beginComment() 128 { 129 ASSERT(m_type == Uninitialized); 130 m_type = Comment; 131 } 132 133 void beginDOCTYPE() 134 { 135 ASSERT(m_type == Uninitialized); 136 m_type = DOCTYPE; 137 m_doctypeData = adoptPtr(new DoctypeData()); 138 } 139 140 void beginDOCTYPE(UChar character) 141 { 142 ASSERT(character); 143 beginDOCTYPE(); 144 m_data.append(character); 145 } 146 147 void appendToName(UChar character) 148 { 149 ASSERT(character); 150 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 151 m_data.append(character); 152 } 153 154 template<typename T> 155 void appendToCharacter(T characters) 156 { 157 ASSERT(m_type == Character); 158 m_data.append(characters); 159 } 160 161 void appendToComment(UChar character) 162 { 163 ASSERT(character); 164 ASSERT(m_type == Comment); 165 m_data.append(character); 166 } 167 168 void addNewAttribute() 169 { 170 ASSERT(m_type == StartTag || m_type == EndTag); 171 m_attributes.grow(m_attributes.size() + 1); 172 m_currentAttribute = &m_attributes.last(); 173#ifndef NDEBUG 174 m_currentAttribute->m_nameRange.m_start = 0; 175 m_currentAttribute->m_nameRange.m_end = 0; 176 m_currentAttribute->m_valueRange.m_start = 0; 177 m_currentAttribute->m_valueRange.m_end = 0; 178#endif 179 } 180 181 void beginAttributeName(int offset) 182 { 183 m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset; 184 } 185 186 void endAttributeName(int offset) 187 { 188 int index = offset - m_baseOffset; 189 m_currentAttribute->m_nameRange.m_end = index; 190 m_currentAttribute->m_valueRange.m_start = index; 191 m_currentAttribute->m_valueRange.m_end = index; 192 } 193 194 void beginAttributeValue(int offset) 195 { 196 m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset; 197#ifndef NDEBUG 198 m_currentAttribute->m_valueRange.m_end = 0; 199#endif 200 } 201 202 void endAttributeValue(int offset) 203 { 204 m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset; 205 } 206 207 void appendToAttributeName(UChar character) 208 { 209 ASSERT(character); 210 ASSERT(m_type == StartTag || m_type == EndTag); 211 ASSERT(m_currentAttribute->m_nameRange.m_start); 212 m_currentAttribute->m_name.append(character); 213 } 214 215 void appendToAttributeValue(UChar character) 216 { 217 ASSERT(character); 218 ASSERT(m_type == StartTag || m_type == EndTag); 219 ASSERT(m_currentAttribute->m_valueRange.m_start); 220 m_currentAttribute->m_value.append(character); 221 } 222 223 void appendToAttributeValue(size_t i, const String& value) 224 { 225 ASSERT(!value.isEmpty()); 226 ASSERT(m_type == StartTag || m_type == EndTag); 227 m_attributes[i].m_value.append(value.characters(), value.length()); 228 } 229 230 Type type() const { return m_type; } 231 232 bool selfClosing() const 233 { 234 ASSERT(m_type == StartTag || m_type == EndTag); 235 return m_selfClosing; 236 } 237 238 void setSelfClosing() 239 { 240 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 241 m_selfClosing = true; 242 } 243 244 const AttributeList& attributes() const 245 { 246 ASSERT(m_type == StartTag || m_type == EndTag); 247 return m_attributes; 248 } 249 250 const DataVector& name() const 251 { 252 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 253 return m_data; 254 } 255 256 void eraseCharacters() 257 { 258 ASSERT(m_type == Character); 259 m_data.clear(); 260 } 261 262 void eraseValueOfAttribute(size_t i) 263 { 264 ASSERT(m_type == StartTag || m_type == EndTag); 265 m_attributes[i].m_value.clear(); 266 } 267 268 const DataVector& characters() const 269 { 270 ASSERT(m_type == Character); 271 return m_data; 272 } 273 274 const DataVector& comment() const 275 { 276 ASSERT(m_type == Comment); 277 return m_data; 278 } 279 280 // FIXME: Distinguish between a missing public identifer and an empty one. 281 const WTF::Vector<UChar>& publicIdentifier() const 282 { 283 ASSERT(m_type == DOCTYPE); 284 return m_doctypeData->m_publicIdentifier; 285 } 286 287 // FIXME: Distinguish between a missing system identifer and an empty one. 288 const WTF::Vector<UChar>& systemIdentifier() const 289 { 290 ASSERT(m_type == DOCTYPE); 291 return m_doctypeData->m_systemIdentifier; 292 } 293 294 void setPublicIdentifierToEmptyString() 295 { 296 ASSERT(m_type == DOCTYPE); 297 m_doctypeData->m_hasPublicIdentifier = true; 298 m_doctypeData->m_publicIdentifier.clear(); 299 } 300 301 void setSystemIdentifierToEmptyString() 302 { 303 ASSERT(m_type == DOCTYPE); 304 m_doctypeData->m_hasSystemIdentifier = true; 305 m_doctypeData->m_systemIdentifier.clear(); 306 } 307 308 bool forceQuirks() const 309 { 310 ASSERT(m_type == DOCTYPE); 311 return m_doctypeData->m_forceQuirks; 312 } 313 314 void setForceQuirks() 315 { 316 ASSERT(m_type == DOCTYPE); 317 m_doctypeData->m_forceQuirks = true; 318 } 319 320 void appendToPublicIdentifier(UChar character) 321 { 322 ASSERT(character); 323 ASSERT(m_type == DOCTYPE); 324 ASSERT(m_doctypeData->m_hasPublicIdentifier); 325 m_doctypeData->m_publicIdentifier.append(character); 326 } 327 328 void appendToSystemIdentifier(UChar character) 329 { 330 ASSERT(character); 331 ASSERT(m_type == DOCTYPE); 332 ASSERT(m_doctypeData->m_hasSystemIdentifier); 333 m_doctypeData->m_systemIdentifier.append(character); 334 } 335 336private: 337 // FIXME: I'm not sure what the final relationship between HTMLToken and 338 // AtomicHTMLToken will be. I'm marking this a friend for now, but we'll 339 // want to end up with a cleaner interface between the two classes. 340 friend class AtomicHTMLToken; 341 342 class DoctypeData { 343 WTF_MAKE_NONCOPYABLE(DoctypeData); 344 public: 345 DoctypeData() 346 : m_hasPublicIdentifier(false) 347 , m_hasSystemIdentifier(false) 348 , m_forceQuirks(false) 349 { 350 } 351 352 bool m_hasPublicIdentifier; 353 bool m_hasSystemIdentifier; 354 bool m_forceQuirks; 355 WTF::Vector<UChar> m_publicIdentifier; 356 WTF::Vector<UChar> m_systemIdentifier; 357 }; 358 359 Type m_type; 360 Range m_range; // Always starts at zero. 361 int m_baseOffset; 362 363 // "name" for DOCTYPE, StartTag, and EndTag 364 // "characters" for Character 365 // "data" for Comment 366 DataVector m_data; 367 368 // For DOCTYPE 369 OwnPtr<DoctypeData> m_doctypeData; 370 371 // For StartTag and EndTag 372 bool m_selfClosing; 373 AttributeList m_attributes; 374 375 // A pointer into m_attributes used during lexing. 376 Attribute* m_currentAttribute; 377}; 378 379// FIXME: This class should eventually be named HTMLToken once we move the 380// exiting HTMLToken to be internal to the HTMLTokenizer. 381class AtomicHTMLToken { 382 WTF_MAKE_NONCOPYABLE(AtomicHTMLToken); 383public: 384 AtomicHTMLToken(HTMLToken& token) 385 : m_type(token.type()) 386 { 387 switch (m_type) { 388 case HTMLToken::Uninitialized: 389 ASSERT_NOT_REACHED(); 390 break; 391 case HTMLToken::DOCTYPE: 392 m_name = AtomicString(token.name().data(), token.name().size()); 393 m_doctypeData = token.m_doctypeData.release(); 394 break; 395 case HTMLToken::EndOfFile: 396 break; 397 case HTMLToken::StartTag: 398 case HTMLToken::EndTag: { 399 m_selfClosing = token.selfClosing(); 400 m_name = AtomicString(token.name().data(), token.name().size()); 401 const HTMLToken::AttributeList& attributes = token.attributes(); 402 for (HTMLToken::AttributeList::const_iterator iter = attributes.begin(); 403 iter != attributes.end(); ++iter) { 404 if (!iter->m_name.isEmpty()) { 405 String name(iter->m_name.data(), iter->m_name.size()); 406 String value(iter->m_value.data(), iter->m_value.size()); 407 ASSERT(iter->m_nameRange.m_start); 408 ASSERT(iter->m_nameRange.m_end); 409 ASSERT(iter->m_valueRange.m_start); 410 ASSERT(iter->m_valueRange.m_end); 411 RefPtr<Attribute> mappedAttribute = Attribute::createMapped(name, value); 412 if (!m_attributes) { 413 m_attributes = NamedNodeMap::create(); 414 // Reserving capacity here improves the parser 415 // benchmark. It might be worth experimenting with 416 // the constant to see where the optimal point is. 417 m_attributes->reserveInitialCapacity(10); 418 } 419 m_attributes->insertAttribute(mappedAttribute.release(), false); 420 } 421 } 422 break; 423 } 424 case HTMLToken::Comment: 425 m_data = String(token.comment().data(), token.comment().size()); 426 break; 427 case HTMLToken::Character: 428 m_externalCharacters = &token.characters(); 429 break; 430 } 431 } 432 433 AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0) 434 : m_type(type) 435 , m_name(name) 436 , m_attributes(attributes) 437 { 438 ASSERT(usesName()); 439 } 440 441 HTMLToken::Type type() const { return m_type; } 442 443 const AtomicString& name() const 444 { 445 ASSERT(usesName()); 446 return m_name; 447 } 448 449 void setName(const AtomicString& name) 450 { 451 ASSERT(usesName()); 452 m_name = name; 453 } 454 455 bool selfClosing() const 456 { 457 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 458 return m_selfClosing; 459 } 460 461 Attribute* getAttributeItem(const QualifiedName& attributeName) 462 { 463 ASSERT(usesAttributes()); 464 if (!m_attributes) 465 return 0; 466 return m_attributes->getAttributeItem(attributeName); 467 } 468 469 NamedNodeMap* attributes() const 470 { 471 ASSERT(usesAttributes()); 472 return m_attributes.get(); 473 } 474 475 PassRefPtr<NamedNodeMap> takeAtributes() 476 { 477 ASSERT(usesAttributes()); 478 return m_attributes.release(); 479 } 480 481 const HTMLToken::DataVector& characters() const 482 { 483 ASSERT(m_type == HTMLToken::Character); 484 return *m_externalCharacters; 485 } 486 487 const String& comment() const 488 { 489 ASSERT(m_type == HTMLToken::Comment); 490 return m_data; 491 } 492 493 // FIXME: Distinguish between a missing public identifer and an empty one. 494 WTF::Vector<UChar>& publicIdentifier() const 495 { 496 ASSERT(m_type == HTMLToken::DOCTYPE); 497 return m_doctypeData->m_publicIdentifier; 498 } 499 500 // FIXME: Distinguish between a missing system identifer and an empty one. 501 WTF::Vector<UChar>& systemIdentifier() const 502 { 503 ASSERT(m_type == HTMLToken::DOCTYPE); 504 return m_doctypeData->m_systemIdentifier; 505 } 506 507 bool forceQuirks() const 508 { 509 ASSERT(m_type == HTMLToken::DOCTYPE); 510 return m_doctypeData->m_forceQuirks; 511 } 512 513private: 514 HTMLToken::Type m_type; 515 516 bool usesName() const 517 { 518 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 519 } 520 521 bool usesAttributes() const 522 { 523 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 524 } 525 526 // "name" for DOCTYPE, StartTag, and EndTag 527 AtomicString m_name; 528 529 // "data" for Comment 530 String m_data; 531 532 // "characters" for Character 533 // 534 // We don't want to copy the the characters out of the HTMLToken, so we 535 // keep a pointer to its buffer instead. This buffer is owned by the 536 // HTMLToken and causes a lifetime dependence between these objects. 537 // 538 // FIXME: Add a mechanism for "internalizing" the characters when the 539 // HTMLToken is destructed. 540 const HTMLToken::DataVector* m_externalCharacters; 541 542 // For DOCTYPE 543 OwnPtr<HTMLToken::DoctypeData> m_doctypeData; 544 545 // For StartTag and EndTag 546 bool m_selfClosing; 547 548 RefPtr<NamedNodeMap> m_attributes; 549}; 550 551} 552 553#endif 554