// HTMLToken.h revision cad810f21b803229eb11403f9209855525a25d57
1/* 2 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24 */ 25 26#ifndef HTMLToken_h 27#define HTMLToken_h 28 29#include "NamedNodeMap.h" 30#include <wtf/Noncopyable.h> 31#include <wtf/PassOwnPtr.h> 32#include <wtf/Vector.h> 33 34namespace WebCore { 35 36class HTMLToken : public Noncopyable { 37public: 38 enum Type { 39 Uninitialized, 40 DOCTYPE, 41 StartTag, 42 EndTag, 43 Comment, 44 Character, 45 EndOfFile, 46 }; 47 48 class Range { 49 public: 50 int m_start; 51 int m_end; 52 }; 53 54 class Attribute { 55 public: 56 Range m_nameRange; 57 Range m_valueRange; 58 WTF::Vector<UChar, 32> m_name; 59 WTF::Vector<UChar, 32> m_value; 60 }; 61 62 typedef WTF::Vector<Attribute, 10> AttributeList; 63 typedef WTF::Vector<UChar, 1024> DataVector; 64 65 HTMLToken() { clear(); } 66 67 void clear(int startIndex = 0) 68 { 69 m_type = Uninitialized; 70 m_range.m_start = startIndex; 71 m_range.m_end = startIndex; 72 m_data.clear(); 73 } 74 75 int startIndex() const { return m_range.m_start; } 76 int endIndex() const { return m_range.m_end; } 77 78 void end(int endIndex) 79 { 80 m_range.m_end = endIndex; 81 } 82 83 void makeEndOfFile() 84 { 85 ASSERT(m_type == Uninitialized); 86 m_type = EndOfFile; 87 } 88 89 void beginStartTag(UChar character) 90 { 91 ASSERT(character); 92 ASSERT(m_type == Uninitialized); 93 m_type = StartTag; 94 m_selfClosing = false; 95 m_currentAttribute = 0; 96 m_attributes.clear(); 97 98 m_data.append(character); 99 } 100 101 template<typename T> 102 void beginEndTag(T characters) 103 { 104 ASSERT(m_type == Uninitialized); 105 m_type = EndTag; 106 m_selfClosing = false; 107 m_currentAttribute = 0; 108 m_attributes.clear(); 109 110 m_data.append(characters); 111 } 112 113 // Starting a character token works slightly differently than starting 114 // other types of tokens because we want to save a per-character branch. 
115 void ensureIsCharacterToken() 116 { 117 ASSERT(m_type == Uninitialized || m_type == Character); 118 m_type = Character; 119 } 120 121 void beginComment() 122 { 123 ASSERT(m_type == Uninitialized); 124 m_type = Comment; 125 } 126 127 void beginDOCTYPE() 128 { 129 ASSERT(m_type == Uninitialized); 130 m_type = DOCTYPE; 131 m_doctypeData = adoptPtr(new DoctypeData()); 132 } 133 134 void beginDOCTYPE(UChar character) 135 { 136 ASSERT(character); 137 beginDOCTYPE(); 138 m_data.append(character); 139 } 140 141 void appendToName(UChar character) 142 { 143 ASSERT(character); 144 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 145 m_data.append(character); 146 } 147 148 template<typename T> 149 void appendToCharacter(T characters) 150 { 151 ASSERT(m_type == Character); 152 m_data.append(characters); 153 } 154 155 void appendToComment(UChar character) 156 { 157 ASSERT(character); 158 ASSERT(m_type == Comment); 159 m_data.append(character); 160 } 161 162 void addNewAttribute() 163 { 164 ASSERT(m_type == StartTag || m_type == EndTag); 165 m_attributes.grow(m_attributes.size() + 1); 166 m_currentAttribute = &m_attributes.last(); 167#ifndef NDEBUG 168 m_currentAttribute->m_nameRange.m_start = 0; 169 m_currentAttribute->m_nameRange.m_end = 0; 170 m_currentAttribute->m_valueRange.m_start = 0; 171 m_currentAttribute->m_valueRange.m_end = 0; 172#endif 173 } 174 175 void beginAttributeName(int index) 176 { 177 m_currentAttribute->m_nameRange.m_start = index; 178 } 179 180 void endAttributeName(int index) 181 { 182 m_currentAttribute->m_nameRange.m_end = index; 183 m_currentAttribute->m_valueRange.m_start = index; 184 m_currentAttribute->m_valueRange.m_end = index; 185 } 186 187 void beginAttributeValue(int index) 188 { 189 m_currentAttribute->m_valueRange.m_start = index; 190#ifndef NDEBUG 191 m_currentAttribute->m_valueRange.m_end = 0; 192#endif 193 } 194 195 void endAttributeValue(int index) 196 { 197 m_currentAttribute->m_valueRange.m_end = index; 198 } 199 
200 void appendToAttributeName(UChar character) 201 { 202 ASSERT(character); 203 ASSERT(m_type == StartTag || m_type == EndTag); 204 ASSERT(m_currentAttribute->m_nameRange.m_start); 205 m_currentAttribute->m_name.append(character); 206 } 207 208 void appendToAttributeValue(UChar character) 209 { 210 ASSERT(character); 211 ASSERT(m_type == StartTag || m_type == EndTag); 212 ASSERT(m_currentAttribute->m_valueRange.m_start); 213 m_currentAttribute->m_value.append(character); 214 } 215 216 Type type() const { return m_type; } 217 218 bool selfClosing() const 219 { 220 ASSERT(m_type == StartTag || m_type == EndTag); 221 return m_selfClosing; 222 } 223 224 void setSelfClosing() 225 { 226 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 227 m_selfClosing = true; 228 } 229 230 const AttributeList& attributes() const 231 { 232 ASSERT(m_type == StartTag || m_type == EndTag); 233 return m_attributes; 234 } 235 236 const DataVector& name() const 237 { 238 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 239 return m_data; 240 } 241 242 const DataVector& characters() const 243 { 244 ASSERT(m_type == Character); 245 return m_data; 246 } 247 248 const DataVector& comment() const 249 { 250 ASSERT(m_type == Comment); 251 return m_data; 252 } 253 254 // FIXME: Distinguish between a missing public identifer and an empty one. 255 const WTF::Vector<UChar>& publicIdentifier() const 256 { 257 ASSERT(m_type == DOCTYPE); 258 return m_doctypeData->m_publicIdentifier; 259 } 260 261 // FIXME: Distinguish between a missing system identifer and an empty one. 
262 const WTF::Vector<UChar>& systemIdentifier() const 263 { 264 ASSERT(m_type == DOCTYPE); 265 return m_doctypeData->m_systemIdentifier; 266 } 267 268 void setPublicIdentifierToEmptyString() 269 { 270 ASSERT(m_type == DOCTYPE); 271 m_doctypeData->m_hasPublicIdentifier = true; 272 m_doctypeData->m_publicIdentifier.clear(); 273 } 274 275 void setSystemIdentifierToEmptyString() 276 { 277 ASSERT(m_type == DOCTYPE); 278 m_doctypeData->m_hasSystemIdentifier = true; 279 m_doctypeData->m_systemIdentifier.clear(); 280 } 281 282 bool forceQuirks() const 283 { 284 ASSERT(m_type == DOCTYPE); 285 return m_doctypeData->m_forceQuirks; 286 } 287 288 void setForceQuirks() 289 { 290 ASSERT(m_type == DOCTYPE); 291 m_doctypeData->m_forceQuirks = true; 292 } 293 294 void appendToPublicIdentifier(UChar character) 295 { 296 ASSERT(character); 297 ASSERT(m_type == DOCTYPE); 298 ASSERT(m_doctypeData->m_hasPublicIdentifier); 299 m_doctypeData->m_publicIdentifier.append(character); 300 } 301 302 void appendToSystemIdentifier(UChar character) 303 { 304 ASSERT(character); 305 ASSERT(m_type == DOCTYPE); 306 ASSERT(m_doctypeData->m_hasSystemIdentifier); 307 m_doctypeData->m_systemIdentifier.append(character); 308 } 309 310private: 311 // FIXME: I'm not sure what the final relationship between HTMLToken and 312 // AtomicHTMLToken will be. I'm marking this a friend for now, but we'll 313 // want to end up with a cleaner interface between the two classes. 314 friend class AtomicHTMLToken; 315 316 class DoctypeData : public Noncopyable { 317 public: 318 DoctypeData() 319 : m_hasPublicIdentifier(false) 320 , m_hasSystemIdentifier(false) 321 , m_forceQuirks(false) 322 { 323 } 324 325 bool m_hasPublicIdentifier; 326 bool m_hasSystemIdentifier; 327 bool m_forceQuirks; 328 WTF::Vector<UChar> m_publicIdentifier; 329 WTF::Vector<UChar> m_systemIdentifier; 330 }; 331 332 Type m_type; 333 334 // Which characters from the input stream are represented by this token. 
335 Range m_range; 336 337 // "name" for DOCTYPE, StartTag, and EndTag 338 // "characters" for Character 339 // "data" for Comment 340 DataVector m_data; 341 342 // For DOCTYPE 343 OwnPtr<DoctypeData> m_doctypeData; 344 345 // For StartTag and EndTag 346 bool m_selfClosing; 347 AttributeList m_attributes; 348 349 // A pointer into m_attributes used during lexing. 350 Attribute* m_currentAttribute; 351}; 352 353// FIXME: This class should eventually be named HTMLToken once we move the 354// exiting HTMLToken to be internal to the HTMLTokenizer. 355class AtomicHTMLToken : public Noncopyable { 356public: 357 AtomicHTMLToken(HTMLToken& token) 358 : m_type(token.type()) 359 { 360 switch (m_type) { 361 case HTMLToken::Uninitialized: 362 ASSERT_NOT_REACHED(); 363 break; 364 case HTMLToken::DOCTYPE: 365 m_name = AtomicString(token.name().data(), token.name().size()); 366 m_doctypeData = token.m_doctypeData.release(); 367 break; 368 case HTMLToken::EndOfFile: 369 break; 370 case HTMLToken::StartTag: 371 case HTMLToken::EndTag: { 372 m_selfClosing = token.selfClosing(); 373 m_name = AtomicString(token.name().data(), token.name().size()); 374 const HTMLToken::AttributeList& attributes = token.attributes(); 375 for (HTMLToken::AttributeList::const_iterator iter = attributes.begin(); 376 iter != attributes.end(); ++iter) { 377 if (!iter->m_name.isEmpty()) { 378 String name(iter->m_name.data(), iter->m_name.size()); 379 String value(iter->m_value.data(), iter->m_value.size()); 380 ASSERT(iter->m_nameRange.m_start); 381 ASSERT(iter->m_nameRange.m_end); 382 ASSERT(iter->m_valueRange.m_start); 383 ASSERT(iter->m_valueRange.m_end); 384 RefPtr<Attribute> mappedAttribute = Attribute::createMapped(name, value); 385 if (!m_attributes) { 386 m_attributes = NamedNodeMap::create(); 387 // Reserving capacity here improves the parser 388 // benchmark. It might be worth experimenting with 389 // the constant to see where the optimal point is. 
390 m_attributes->reserveInitialCapacity(10); 391 } 392 m_attributes->insertAttribute(mappedAttribute.release(), false); 393 } 394 } 395 break; 396 } 397 case HTMLToken::Comment: 398 m_data = String(token.comment().data(), token.comment().size()); 399 break; 400 case HTMLToken::Character: 401 m_externalCharacters = &token.characters(); 402 break; 403 } 404 } 405 406 AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0) 407 : m_type(type) 408 , m_name(name) 409 , m_attributes(attributes) 410 { 411 ASSERT(usesName()); 412 } 413 414 HTMLToken::Type type() const { return m_type; } 415 416 const AtomicString& name() const 417 { 418 ASSERT(usesName()); 419 return m_name; 420 } 421 422 void setName(const AtomicString& name) 423 { 424 ASSERT(usesName()); 425 m_name = name; 426 } 427 428 bool selfClosing() const 429 { 430 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 431 return m_selfClosing; 432 } 433 434 Attribute* getAttributeItem(const QualifiedName& attributeName) 435 { 436 ASSERT(usesAttributes()); 437 if (!m_attributes) 438 return 0; 439 return m_attributes->getAttributeItem(attributeName); 440 } 441 442 NamedNodeMap* attributes() const 443 { 444 ASSERT(usesAttributes()); 445 return m_attributes.get(); 446 } 447 448 PassRefPtr<NamedNodeMap> takeAtributes() 449 { 450 ASSERT(usesAttributes()); 451 return m_attributes.release(); 452 } 453 454 const HTMLToken::DataVector& characters() const 455 { 456 ASSERT(m_type == HTMLToken::Character); 457 return *m_externalCharacters; 458 } 459 460 const String& comment() const 461 { 462 ASSERT(m_type == HTMLToken::Comment); 463 return m_data; 464 } 465 466 // FIXME: Distinguish between a missing public identifer and an empty one. 467 WTF::Vector<UChar>& publicIdentifier() const 468 { 469 ASSERT(m_type == HTMLToken::DOCTYPE); 470 return m_doctypeData->m_publicIdentifier; 471 } 472 473 // FIXME: Distinguish between a missing system identifer and an empty one. 
474 WTF::Vector<UChar>& systemIdentifier() const 475 { 476 ASSERT(m_type == HTMLToken::DOCTYPE); 477 return m_doctypeData->m_systemIdentifier; 478 } 479 480 bool forceQuirks() const 481 { 482 ASSERT(m_type == HTMLToken::DOCTYPE); 483 return m_doctypeData->m_forceQuirks; 484 } 485 486private: 487 HTMLToken::Type m_type; 488 489 bool usesName() const 490 { 491 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 492 } 493 494 bool usesAttributes() const 495 { 496 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 497 } 498 499 // "name" for DOCTYPE, StartTag, and EndTag 500 AtomicString m_name; 501 502 // "data" for Comment 503 String m_data; 504 505 // "characters" for Character 506 // 507 // We don't want to copy the the characters out of the HTMLToken, so we 508 // keep a pointer to its buffer instead. This buffer is owned by the 509 // HTMLToken and causes a lifetime dependence between these objects. 510 // 511 // FIXME: Add a mechanism for "internalizing" the characters when the 512 // HTMLToken is destructed. 513 const HTMLToken::DataVector* m_externalCharacters; 514 515 // For DOCTYPE 516 OwnPtr<HTMLToken::DoctypeData> m_doctypeData; 517 518 // For StartTag and EndTag 519 bool m_selfClosing; 520 521 RefPtr<NamedNodeMap> m_attributes; 522}; 523 524} 525 526#endif 527