1//===--- YAMLParser.h - Simple YAML parser --------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This is a YAML 1.2 parser. 11// 12// See http://www.yaml.org/spec/1.2/spec.html for the full standard. 13// 14// This currently does not implement the following: 15// * Multi-line literal folding. 16// * Tag resolution. 17// * UTF-16. 18// * BOMs anywhere other than the first Unicode scalar value in the file. 19// 20// The most important class here is Stream. This represents a YAML stream with 21// 0, 1, or many documents. 22// 23// SourceMgr sm; 24// StringRef input = getInput(); 25// yaml::Stream stream(input, sm); 26// 27// for (yaml::document_iterator di = stream.begin(), de = stream.end(); 28// di != de; ++di) { 29// yaml::Node *n = di->getRoot(); 30// if (n) { 31// // Do something with n... 32// } else 33// break; 34// } 35// 36//===----------------------------------------------------------------------===// 37 38#ifndef LLVM_SUPPORT_YAMLPARSER_H 39#define LLVM_SUPPORT_YAMLPARSER_H 40 41#include "llvm/ADT/StringRef.h" 42#include "llvm/Support/Allocator.h" 43#include "llvm/Support/SMLoc.h" 44#include <map> 45#include <system_error> 46#include <utility> 47 48namespace llvm { 49class MemoryBufferRef; 50class SourceMgr; 51class Twine; 52class raw_ostream; 53 54namespace yaml { 55 56class document_iterator; 57class Document; 58class Node; 59class Scanner; 60struct Token; 61 62/// \brief Dump all the tokens in this stream to OS. 63/// \returns true if there was an error, false otherwise. 64bool dumpTokens(StringRef Input, raw_ostream &); 65 66/// \brief Scans all tokens in input without outputting anything. This is used 67/// for benchmarking the tokenizer. 68/// \returns true if there was an error, false otherwise. 69bool scanTokens(StringRef Input); 70 71/// \brief Escape \a Input for a double quoted scalar. 72std::string escape(StringRef Input); 73 74/// \brief This class represents a YAML stream potentially containing multiple 75/// documents. 76class Stream { 77public: 78 /// \brief This keeps a reference to the string referenced by \p Input. 79 Stream(StringRef Input, SourceMgr &, bool ShowColors = true, 80 std::error_code *EC = nullptr); 81 82 Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true, 83 std::error_code *EC = nullptr); 84 ~Stream(); 85 86 document_iterator begin(); 87 document_iterator end(); 88 void skip(); 89 bool failed(); 90 bool validate() { 91 skip(); 92 return !failed(); 93 } 94 95 void printError(Node *N, const Twine &Msg); 96 97private: 98 std::unique_ptr<Scanner> scanner; 99 std::unique_ptr<Document> CurrentDoc; 100 101 friend class Document; 102}; 103 104/// \brief Abstract base class for all Nodes. 105class Node { 106 virtual void anchor(); 107 108public: 109 enum NodeKind { 110 NK_Null, 111 NK_Scalar, 112 NK_BlockScalar, 113 NK_KeyValue, 114 NK_Mapping, 115 NK_Sequence, 116 NK_Alias 117 }; 118 119 Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, 120 StringRef Tag); 121 122 /// \brief Get the value of the anchor attached to this node. If it does not 123 /// have one, getAnchor().size() will be 0. 124 StringRef getAnchor() const { return Anchor; } 125 126 /// \brief Get the tag as it was written in the document. This does not 127 /// perform tag resolution. 128 StringRef getRawTag() const { return Tag; } 129 130 /// \brief Get the verbatium tag for a given Node. This performs tag resoluton 131 /// and substitution. 132 std::string getVerbatimTag() const; 133 134 SMRange getSourceRange() const { return SourceRange; } 135 void setSourceRange(SMRange SR) { SourceRange = SR; } 136 137 // These functions forward to Document and Scanner. 138 Token &peekNext(); 139 Token getNext(); 140 Node *parseBlockNode(); 141 BumpPtrAllocator &getAllocator(); 142 void setError(const Twine &Message, Token &Location) const; 143 bool failed() const; 144 145 virtual void skip() {} 146 147 unsigned int getType() const { return TypeID; } 148 149 void *operator new(size_t Size, BumpPtrAllocator &Alloc, 150 size_t Alignment = 16) noexcept { 151 return Alloc.Allocate(Size, Alignment); 152 } 153 154 void operator delete(void *Ptr, BumpPtrAllocator &Alloc, 155 size_t Size) noexcept { 156 Alloc.Deallocate(Ptr, Size); 157 } 158 159protected: 160 std::unique_ptr<Document> &Doc; 161 SMRange SourceRange; 162 163 void operator delete(void *) noexcept = delete; 164 165 ~Node() = default; 166 167private: 168 unsigned int TypeID; 169 StringRef Anchor; 170 /// \brief The tag as typed in the document. 171 StringRef Tag; 172}; 173 174/// \brief A null value. 175/// 176/// Example: 177/// !!null null 178class NullNode final : public Node { 179 void anchor() override; 180 181public: 182 NullNode(std::unique_ptr<Document> &D) 183 : Node(NK_Null, D, StringRef(), StringRef()) {} 184 185 static inline bool classof(const Node *N) { return N->getType() == NK_Null; } 186}; 187 188/// \brief A scalar node is an opaque datum that can be presented as a 189/// series of zero or more Unicode scalar values. 190/// 191/// Example: 192/// Adena 193class ScalarNode final : public Node { 194 void anchor() override; 195 196public: 197 ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 198 StringRef Val) 199 : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { 200 SMLoc Start = SMLoc::getFromPointer(Val.begin()); 201 SMLoc End = SMLoc::getFromPointer(Val.end()); 202 SourceRange = SMRange(Start, End); 203 } 204 205 // Return Value without any escaping or folding or other fun YAML stuff. This 206 // is the exact bytes that are contained in the file (after conversion to 207 // utf8). 208 StringRef getRawValue() const { return Value; } 209 210 /// \brief Gets the value of this node as a StringRef. 211 /// 212 /// \param Storage is used to store the content of the returned StringRef iff 213 /// it requires any modification from how it appeared in the source. 214 /// This happens with escaped characters and multi-line literals. 215 StringRef getValue(SmallVectorImpl<char> &Storage) const; 216 217 static inline bool classof(const Node *N) { 218 return N->getType() == NK_Scalar; 219 } 220 221private: 222 StringRef Value; 223 224 StringRef unescapeDoubleQuoted(StringRef UnquotedValue, 225 StringRef::size_type Start, 226 SmallVectorImpl<char> &Storage) const; 227}; 228 229/// \brief A block scalar node is an opaque datum that can be presented as a 230/// series of zero or more Unicode scalar values. 231/// 232/// Example: 233/// | 234/// Hello 235/// World 236class BlockScalarNode final : public Node { 237 void anchor() override; 238 239public: 240 BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 241 StringRef Value, StringRef RawVal) 242 : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { 243 SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); 244 SMLoc End = SMLoc::getFromPointer(RawVal.end()); 245 SourceRange = SMRange(Start, End); 246 } 247 248 /// \brief Gets the value of this node as a StringRef. 249 StringRef getValue() const { return Value; } 250 251 static inline bool classof(const Node *N) { 252 return N->getType() == NK_BlockScalar; 253 } 254 255private: 256 StringRef Value; 257}; 258 259/// \brief A key and value pair. While not technically a Node under the YAML 260/// representation graph, it is easier to treat them this way. 261/// 262/// TODO: Consider making this not a child of Node. 263/// 264/// Example: 265/// Section: .text 266class KeyValueNode final : public Node { 267 void anchor() override; 268 269public: 270 KeyValueNode(std::unique_ptr<Document> &D) 271 : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr), 272 Value(nullptr) {} 273 274 /// \brief Parse and return the key. 275 /// 276 /// This may be called multiple times. 277 /// 278 /// \returns The key, or nullptr if failed() == true. 279 Node *getKey(); 280 281 /// \brief Parse and return the value. 282 /// 283 /// This may be called multiple times. 284 /// 285 /// \returns The value, or nullptr if failed() == true. 286 Node *getValue(); 287 288 void skip() override { 289 getKey()->skip(); 290 if (Node *Val = getValue()) 291 Val->skip(); 292 } 293 294 static inline bool classof(const Node *N) { 295 return N->getType() == NK_KeyValue; 296 } 297 298private: 299 Node *Key; 300 Node *Value; 301}; 302 303/// \brief This is an iterator abstraction over YAML collections shared by both 304/// sequences and maps. 305/// 306/// BaseT must have a ValueT* member named CurrentEntry and a member function 307/// increment() which must set CurrentEntry to 0 to create an end iterator. 308template <class BaseT, class ValueT> 309class basic_collection_iterator 310 : public std::iterator<std::input_iterator_tag, ValueT> { 311public: 312 basic_collection_iterator() : Base(nullptr) {} 313 basic_collection_iterator(BaseT *B) : Base(B) {} 314 315 ValueT *operator->() const { 316 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 317 return Base->CurrentEntry; 318 } 319 320 ValueT &operator*() const { 321 assert(Base && Base->CurrentEntry && 322 "Attempted to dereference end iterator!"); 323 return *Base->CurrentEntry; 324 } 325 326 operator ValueT *() const { 327 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 328 return Base->CurrentEntry; 329 } 330 331 /// Note on EqualityComparable: 332 /// 333 /// The iterator is not re-entrant, 334 /// it is meant to be used for parsing YAML on-demand 335 /// Once iteration started - it can point only to one entry at a time 336 /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal 337 /// iff Base and Other.Base are equal. 338 bool operator==(const basic_collection_iterator &Other) const { 339 if (Base && (Base == Other.Base)) { 340 assert((Base->CurrentEntry == Other.Base->CurrentEntry) 341 && "Equal Bases expected to point to equal Entries"); 342 } 343 344 return Base == Other.Base; 345 } 346 347 bool operator!=(const basic_collection_iterator &Other) const { 348 return !(Base == Other.Base); 349 } 350 351 basic_collection_iterator &operator++() { 352 assert(Base && "Attempted to advance iterator past end!"); 353 Base->increment(); 354 // Create an end iterator. 355 if (!Base->CurrentEntry) 356 Base = nullptr; 357 return *this; 358 } 359 360private: 361 BaseT *Base; 362}; 363 364// The following two templates are used for both MappingNode and Sequence Node. 365template <class CollectionType> 366typename CollectionType::iterator begin(CollectionType &C) { 367 assert(C.IsAtBeginning && "You may only iterate over a collection once!"); 368 C.IsAtBeginning = false; 369 typename CollectionType::iterator ret(&C); 370 ++ret; 371 return ret; 372} 373 374template <class CollectionType> void skip(CollectionType &C) { 375 // TODO: support skipping from the middle of a parsed collection ;/ 376 assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!"); 377 if (C.IsAtBeginning) 378 for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; 379 ++i) 380 i->skip(); 381} 382 383/// \brief Represents a YAML map created from either a block map for a flow map. 384/// 385/// This parses the YAML stream as increment() is called. 386/// 387/// Example: 388/// Name: _main 389/// Scope: Global 390class MappingNode final : public Node { 391 void anchor() override; 392 393public: 394 enum MappingType { 395 MT_Block, 396 MT_Flow, 397 MT_Inline ///< An inline mapping node is used for "[key: value]". 398 }; 399 400 MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 401 MappingType MT) 402 : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true), 403 IsAtEnd(false), CurrentEntry(nullptr) {} 404 405 friend class basic_collection_iterator<MappingNode, KeyValueNode>; 406 typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator; 407 template <class T> friend typename T::iterator yaml::begin(T &); 408 template <class T> friend void yaml::skip(T &); 409 410 iterator begin() { return yaml::begin(*this); } 411 412 iterator end() { return iterator(); } 413 414 void skip() override { yaml::skip(*this); } 415 416 static inline bool classof(const Node *N) { 417 return N->getType() == NK_Mapping; 418 } 419 420private: 421 MappingType Type; 422 bool IsAtBeginning; 423 bool IsAtEnd; 424 KeyValueNode *CurrentEntry; 425 426 void increment(); 427}; 428 429/// \brief Represents a YAML sequence created from either a block sequence for a 430/// flow sequence. 431/// 432/// This parses the YAML stream as increment() is called. 433/// 434/// Example: 435/// - Hello 436/// - World 437class SequenceNode final : public Node { 438 void anchor() override; 439 440public: 441 enum SequenceType { 442 ST_Block, 443 ST_Flow, 444 // Use for: 445 // 446 // key: 447 // - val1 448 // - val2 449 // 450 // As a BlockMappingEntry and BlockEnd are not created in this case. 451 ST_Indentless 452 }; 453 454 SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 455 SequenceType ST) 456 : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true), 457 IsAtEnd(false), 458 WasPreviousTokenFlowEntry(true), // Start with an imaginary ','. 459 CurrentEntry(nullptr) {} 460 461 friend class basic_collection_iterator<SequenceNode, Node>; 462 typedef basic_collection_iterator<SequenceNode, Node> iterator; 463 template <class T> friend typename T::iterator yaml::begin(T &); 464 template <class T> friend void yaml::skip(T &); 465 466 void increment(); 467 468 iterator begin() { return yaml::begin(*this); } 469 470 iterator end() { return iterator(); } 471 472 void skip() override { yaml::skip(*this); } 473 474 static inline bool classof(const Node *N) { 475 return N->getType() == NK_Sequence; 476 } 477 478private: 479 SequenceType SeqType; 480 bool IsAtBeginning; 481 bool IsAtEnd; 482 bool WasPreviousTokenFlowEntry; 483 Node *CurrentEntry; 484}; 485 486/// \brief Represents an alias to a Node with an anchor. 487/// 488/// Example: 489/// *AnchorName 490class AliasNode final : public Node { 491 void anchor() override; 492 493public: 494 AliasNode(std::unique_ptr<Document> &D, StringRef Val) 495 : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} 496 497 StringRef getName() const { return Name; } 498 Node *getTarget(); 499 500 static inline bool classof(const Node *N) { return N->getType() == NK_Alias; } 501 502private: 503 StringRef Name; 504}; 505 506/// \brief A YAML Stream is a sequence of Documents. A document contains a root 507/// node. 508class Document { 509public: 510 /// \brief Root for parsing a node. Returns a single node. 511 Node *parseBlockNode(); 512 513 Document(Stream &ParentStream); 514 515 /// \brief Finish parsing the current document and return true if there are 516 /// more. Return false otherwise. 517 bool skip(); 518 519 /// \brief Parse and return the root level node. 520 Node *getRoot() { 521 if (Root) 522 return Root; 523 return Root = parseBlockNode(); 524 } 525 526 const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } 527 528private: 529 friend class Node; 530 friend class document_iterator; 531 532 /// \brief Stream to read tokens from. 533 Stream &stream; 534 535 /// \brief Used to allocate nodes to. All are destroyed without calling their 536 /// destructor when the document is destroyed. 537 BumpPtrAllocator NodeAllocator; 538 539 /// \brief The root node. Used to support skipping a partially parsed 540 /// document. 541 Node *Root; 542 543 /// \brief Maps tag prefixes to their expansion. 544 std::map<StringRef, StringRef> TagMap; 545 546 Token &peekNext(); 547 Token getNext(); 548 void setError(const Twine &Message, Token &Location) const; 549 bool failed() const; 550 551 /// \brief Parse %BLAH directives and return true if any were encountered. 552 bool parseDirectives(); 553 554 /// \brief Parse %YAML 555 void parseYAMLDirective(); 556 557 /// \brief Parse %TAG 558 void parseTAGDirective(); 559 560 /// \brief Consume the next token and error if it is not \a TK. 561 bool expectToken(int TK); 562}; 563 564/// \brief Iterator abstraction for Documents over a Stream. 565class document_iterator { 566public: 567 document_iterator() : Doc(nullptr) {} 568 document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} 569 570 bool operator==(const document_iterator &Other) { 571 if (isAtEnd() || Other.isAtEnd()) 572 return isAtEnd() && Other.isAtEnd(); 573 574 return Doc == Other.Doc; 575 } 576 bool operator!=(const document_iterator &Other) { return !(*this == Other); } 577 578 document_iterator operator++() { 579 assert(Doc && "incrementing iterator past the end."); 580 if (!(*Doc)->skip()) { 581 Doc->reset(nullptr); 582 } else { 583 Stream &S = (*Doc)->stream; 584 Doc->reset(new Document(S)); 585 } 586 return *this; 587 } 588 589 Document &operator*() { return *Doc->get(); } 590 591 std::unique_ptr<Document> &operator->() { return *Doc; } 592 593private: 594 bool isAtEnd() const { return !Doc || !*Doc; } 595 596 std::unique_ptr<Document> *Doc; 597}; 598 599} // End namespace yaml. 600 601} // End namespace llvm. 602 603#endif 604