1//===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This is a YAML 1.2 parser.
11//
12//  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
13//
14//  This currently does not implement the following:
15//    * Multi-line literal folding.
16//    * Tag resolution.
17//    * UTF-16.
18//    * BOMs anywhere other than the first Unicode scalar value in the file.
19//
20//  The most important class here is Stream. This represents a YAML stream with
21//  0, 1, or many documents.
22//
23//  SourceMgr sm;
24//  StringRef input = getInput();
25//  yaml::Stream stream(input, sm);
26//
27//  for (yaml::document_iterator di = stream.begin(), de = stream.end();
28//       di != de; ++di) {
29//    yaml::Node *n = di->getRoot();
30//    if (n) {
31//      // Do something with n...
32//    } else
33//      break;
34//  }
35//
36//===----------------------------------------------------------------------===//
37
38#ifndef LLVM_SUPPORT_YAMLPARSER_H
39#define LLVM_SUPPORT_YAMLPARSER_H
40
41#include "llvm/ADT/StringRef.h"
42#include "llvm/Support/Allocator.h"
43#include "llvm/Support/SMLoc.h"
44#include <cassert>
45#include <cstddef>
46#include <iterator>
47#include <map>
48#include <memory>
49#include <string>
50#include <system_error>
51
52namespace llvm {
53
54class MemoryBufferRef;
55class SourceMgr;
56class raw_ostream;
57class Twine;
58
59namespace yaml {
60
61class Document;
62class document_iterator;
63class Node;
64class Scanner;
65struct Token;
66
67/// \brief Dump all the tokens in this stream to OS.
68/// \returns true if there was an error, false otherwise.
69bool dumpTokens(StringRef Input, raw_ostream &);
70
71/// \brief Scans all tokens in input without outputting anything. This is used
72///        for benchmarking the tokenizer.
73/// \returns true if there was an error, false otherwise.
74bool scanTokens(StringRef Input);
75
76/// \brief Escape \a Input for a double quoted scalar.
77std::string escape(StringRef Input);
78
79/// \brief This class represents a YAML stream potentially containing multiple
80///        documents.
81class Stream {
82public:
83  /// \brief This keeps a reference to the string referenced by \p Input.
84  Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
85         std::error_code *EC = nullptr);
86
87  Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
88         std::error_code *EC = nullptr);
89  ~Stream();
90
91  document_iterator begin();
92  document_iterator end();
93  void skip();
94  bool failed();
95
96  bool validate() {
97    skip();
98    return !failed();
99  }
100
101  void printError(Node *N, const Twine &Msg);
102
103private:
104  friend class Document;
105
106  std::unique_ptr<Scanner> scanner;
107  std::unique_ptr<Document> CurrentDoc;
108};
109
110/// \brief Abstract base class for all Nodes.
111class Node {
112  virtual void anchor();
113
114public:
115  enum NodeKind {
116    NK_Null,
117    NK_Scalar,
118    NK_BlockScalar,
119    NK_KeyValue,
120    NK_Mapping,
121    NK_Sequence,
122    NK_Alias
123  };
124
125  Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
126       StringRef Tag);
127
128  void *operator new(size_t Size, BumpPtrAllocator &Alloc,
129                     size_t Alignment = 16) noexcept {
130    return Alloc.Allocate(Size, Alignment);
131  }
132
133  void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
134                       size_t Size) noexcept {
135    Alloc.Deallocate(Ptr, Size);
136  }
137
138  void operator delete(void *) noexcept = delete;
139
140  /// \brief Get the value of the anchor attached to this node. If it does not
141  ///        have one, getAnchor().size() will be 0.
142  StringRef getAnchor() const { return Anchor; }
143
144  /// \brief Get the tag as it was written in the document. This does not
145  ///   perform tag resolution.
146  StringRef getRawTag() const { return Tag; }
147
148  /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
149  ///   and substitution.
150  std::string getVerbatimTag() const;
151
152  SMRange getSourceRange() const { return SourceRange; }
153  void setSourceRange(SMRange SR) { SourceRange = SR; }
154
155  // These functions forward to Document and Scanner.
156  Token &peekNext();
157  Token getNext();
158  Node *parseBlockNode();
159  BumpPtrAllocator &getAllocator();
160  void setError(const Twine &Message, Token &Location) const;
161  bool failed() const;
162
163  virtual void skip() {}
164
165  unsigned int getType() const { return TypeID; }
166
167protected:
168  std::unique_ptr<Document> &Doc;
169  SMRange SourceRange;
170
171  ~Node() = default;
172
173private:
174  unsigned int TypeID;
175  StringRef Anchor;
176  /// \brief The tag as typed in the document.
177  StringRef Tag;
178};
179
180/// \brief A null value.
181///
182/// Example:
183///   !!null null
184class NullNode final : public Node {
185  void anchor() override;
186
187public:
188  NullNode(std::unique_ptr<Document> &D)
189      : Node(NK_Null, D, StringRef(), StringRef()) {}
190
191  static bool classof(const Node *N) { return N->getType() == NK_Null; }
192};
193
194/// \brief A scalar node is an opaque datum that can be presented as a
195///        series of zero or more Unicode scalar values.
196///
197/// Example:
198///   Adena
199class ScalarNode final : public Node {
200  void anchor() override;
201
202public:
203  ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
204             StringRef Val)
205      : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
206    SMLoc Start = SMLoc::getFromPointer(Val.begin());
207    SMLoc End = SMLoc::getFromPointer(Val.end());
208    SourceRange = SMRange(Start, End);
209  }
210
211  // Return Value without any escaping or folding or other fun YAML stuff. This
212  // is the exact bytes that are contained in the file (after conversion to
213  // utf8).
214  StringRef getRawValue() const { return Value; }
215
216  /// \brief Gets the value of this node as a StringRef.
217  ///
218  /// \param Storage is used to store the content of the returned StringRef iff
219  ///        it requires any modification from how it appeared in the source.
220  ///        This happens with escaped characters and multi-line literals.
221  StringRef getValue(SmallVectorImpl<char> &Storage) const;
222
223  static bool classof(const Node *N) {
224    return N->getType() == NK_Scalar;
225  }
226
227private:
228  StringRef Value;
229
230  StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
231                                 StringRef::size_type Start,
232                                 SmallVectorImpl<char> &Storage) const;
233};
234
235/// \brief A block scalar node is an opaque datum that can be presented as a
236///        series of zero or more Unicode scalar values.
237///
238/// Example:
239///   |
240///     Hello
241///     World
242class BlockScalarNode final : public Node {
243  void anchor() override;
244
245public:
246  BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
247                  StringRef Value, StringRef RawVal)
248      : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
249    SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
250    SMLoc End = SMLoc::getFromPointer(RawVal.end());
251    SourceRange = SMRange(Start, End);
252  }
253
254  /// \brief Gets the value of this node as a StringRef.
255  StringRef getValue() const { return Value; }
256
257  static bool classof(const Node *N) {
258    return N->getType() == NK_BlockScalar;
259  }
260
261private:
262  StringRef Value;
263};
264
265/// \brief A key and value pair. While not technically a Node under the YAML
266///        representation graph, it is easier to treat them this way.
267///
268/// TODO: Consider making this not a child of Node.
269///
270/// Example:
271///   Section: .text
272class KeyValueNode final : public Node {
273  void anchor() override;
274
275public:
276  KeyValueNode(std::unique_ptr<Document> &D)
277      : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
278
279  /// \brief Parse and return the key.
280  ///
281  /// This may be called multiple times.
282  ///
283  /// \returns The key, or nullptr if failed() == true.
284  Node *getKey();
285
286  /// \brief Parse and return the value.
287  ///
288  /// This may be called multiple times.
289  ///
290  /// \returns The value, or nullptr if failed() == true.
291  Node *getValue();
292
293  void skip() override {
294    getKey()->skip();
295    if (Node *Val = getValue())
296      Val->skip();
297  }
298
299  static bool classof(const Node *N) {
300    return N->getType() == NK_KeyValue;
301  }
302
303private:
304  Node *Key = nullptr;
305  Node *Value = nullptr;
306};
307
308/// \brief This is an iterator abstraction over YAML collections shared by both
309///        sequences and maps.
310///
311/// BaseT must have a ValueT* member named CurrentEntry and a member function
312/// increment() which must set CurrentEntry to 0 to create an end iterator.
313template <class BaseT, class ValueT>
314class basic_collection_iterator
315    : public std::iterator<std::input_iterator_tag, ValueT> {
316public:
317  basic_collection_iterator() = default;
318  basic_collection_iterator(BaseT *B) : Base(B) {}
319
320  ValueT *operator->() const {
321    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
322    return Base->CurrentEntry;
323  }
324
325  ValueT &operator*() const {
326    assert(Base && Base->CurrentEntry &&
327           "Attempted to dereference end iterator!");
328    return *Base->CurrentEntry;
329  }
330
331  operator ValueT *() const {
332    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
333    return Base->CurrentEntry;
334  }
335
336  /// Note on EqualityComparable:
337  ///
338  /// The iterator is not re-entrant,
339  /// it is meant to be used for parsing YAML on-demand
340  /// Once iteration started - it can point only to one entry at a time
341  /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
342  /// iff Base and Other.Base are equal.
343  bool operator==(const basic_collection_iterator &Other) const {
344    if (Base && (Base == Other.Base)) {
345      assert((Base->CurrentEntry == Other.Base->CurrentEntry)
346             && "Equal Bases expected to point to equal Entries");
347    }
348
349    return Base == Other.Base;
350  }
351
352  bool operator!=(const basic_collection_iterator &Other) const {
353    return !(Base == Other.Base);
354  }
355
356  basic_collection_iterator &operator++() {
357    assert(Base && "Attempted to advance iterator past end!");
358    Base->increment();
359    // Create an end iterator.
360    if (!Base->CurrentEntry)
361      Base = nullptr;
362    return *this;
363  }
364
365private:
366  BaseT *Base = nullptr;
367};
368
369// The following two templates are used for both MappingNode and Sequence Node.
370template <class CollectionType>
371typename CollectionType::iterator begin(CollectionType &C) {
372  assert(C.IsAtBeginning && "You may only iterate over a collection once!");
373  C.IsAtBeginning = false;
374  typename CollectionType::iterator ret(&C);
375  ++ret;
376  return ret;
377}
378
379template <class CollectionType> void skip(CollectionType &C) {
380  // TODO: support skipping from the middle of a parsed collection ;/
381  assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
382  if (C.IsAtBeginning)
383    for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
384         ++i)
385      i->skip();
386}
387
388/// \brief Represents a YAML map created from either a block map for a flow map.
389///
390/// This parses the YAML stream as increment() is called.
391///
392/// Example:
393///   Name: _main
394///   Scope: Global
395class MappingNode final : public Node {
396  void anchor() override;
397
398public:
399  enum MappingType {
400    MT_Block,
401    MT_Flow,
402    MT_Inline ///< An inline mapping node is used for "[key: value]".
403  };
404
405  MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
406              MappingType MT)
407      : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
408
409  friend class basic_collection_iterator<MappingNode, KeyValueNode>;
410
411  using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
412
413  template <class T> friend typename T::iterator yaml::begin(T &);
414  template <class T> friend void yaml::skip(T &);
415
416  iterator begin() { return yaml::begin(*this); }
417
418  iterator end() { return iterator(); }
419
420  void skip() override { yaml::skip(*this); }
421
422  static bool classof(const Node *N) {
423    return N->getType() == NK_Mapping;
424  }
425
426private:
427  MappingType Type;
428  bool IsAtBeginning = true;
429  bool IsAtEnd = false;
430  KeyValueNode *CurrentEntry = nullptr;
431
432  void increment();
433};
434
435/// \brief Represents a YAML sequence created from either a block sequence for a
436///        flow sequence.
437///
438/// This parses the YAML stream as increment() is called.
439///
440/// Example:
441///   - Hello
442///   - World
443class SequenceNode final : public Node {
444  void anchor() override;
445
446public:
447  enum SequenceType {
448    ST_Block,
449    ST_Flow,
450    // Use for:
451    //
452    // key:
453    // - val1
454    // - val2
455    //
456    // As a BlockMappingEntry and BlockEnd are not created in this case.
457    ST_Indentless
458  };
459
460  SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
461               SequenceType ST)
462      : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
463
464  friend class basic_collection_iterator<SequenceNode, Node>;
465
466  using iterator = basic_collection_iterator<SequenceNode, Node>;
467
468  template <class T> friend typename T::iterator yaml::begin(T &);
469  template <class T> friend void yaml::skip(T &);
470
471  void increment();
472
473  iterator begin() { return yaml::begin(*this); }
474
475  iterator end() { return iterator(); }
476
477  void skip() override { yaml::skip(*this); }
478
479  static bool classof(const Node *N) {
480    return N->getType() == NK_Sequence;
481  }
482
483private:
484  SequenceType SeqType;
485  bool IsAtBeginning = true;
486  bool IsAtEnd = false;
487  bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
488  Node *CurrentEntry = nullptr;
489};
490
491/// \brief Represents an alias to a Node with an anchor.
492///
493/// Example:
494///   *AnchorName
495class AliasNode final : public Node {
496  void anchor() override;
497
498public:
499  AliasNode(std::unique_ptr<Document> &D, StringRef Val)
500      : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
501
502  StringRef getName() const { return Name; }
503  Node *getTarget();
504
505  static bool classof(const Node *N) { return N->getType() == NK_Alias; }
506
507private:
508  StringRef Name;
509};
510
511/// \brief A YAML Stream is a sequence of Documents. A document contains a root
512///        node.
513class Document {
514public:
515  Document(Stream &ParentStream);
516
517  /// \brief Root for parsing a node. Returns a single node.
518  Node *parseBlockNode();
519
520  /// \brief Finish parsing the current document and return true if there are
521  ///        more. Return false otherwise.
522  bool skip();
523
524  /// \brief Parse and return the root level node.
525  Node *getRoot() {
526    if (Root)
527      return Root;
528    return Root = parseBlockNode();
529  }
530
531  const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
532
533private:
534  friend class Node;
535  friend class document_iterator;
536
537  /// \brief Stream to read tokens from.
538  Stream &stream;
539
540  /// \brief Used to allocate nodes to. All are destroyed without calling their
541  ///        destructor when the document is destroyed.
542  BumpPtrAllocator NodeAllocator;
543
544  /// \brief The root node. Used to support skipping a partially parsed
545  ///        document.
546  Node *Root;
547
548  /// \brief Maps tag prefixes to their expansion.
549  std::map<StringRef, StringRef> TagMap;
550
551  Token &peekNext();
552  Token getNext();
553  void setError(const Twine &Message, Token &Location) const;
554  bool failed() const;
555
556  /// \brief Parse %BLAH directives and return true if any were encountered.
557  bool parseDirectives();
558
559  /// \brief Parse %YAML
560  void parseYAMLDirective();
561
562  /// \brief Parse %TAG
563  void parseTAGDirective();
564
565  /// \brief Consume the next token and error if it is not \a TK.
566  bool expectToken(int TK);
567};
568
569/// \brief Iterator abstraction for Documents over a Stream.
570class document_iterator {
571public:
572  document_iterator() = default;
573  document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
574
575  bool operator==(const document_iterator &Other) const {
576    if (isAtEnd() || Other.isAtEnd())
577      return isAtEnd() && Other.isAtEnd();
578
579    return Doc == Other.Doc;
580  }
581  bool operator!=(const document_iterator &Other) const {
582    return !(*this == Other);
583  }
584
585  document_iterator operator++() {
586    assert(Doc && "incrementing iterator past the end.");
587    if (!(*Doc)->skip()) {
588      Doc->reset(nullptr);
589    } else {
590      Stream &S = (*Doc)->stream;
591      Doc->reset(new Document(S));
592    }
593    return *this;
594  }
595
596  Document &operator*() { return *Doc->get(); }
597
598  std::unique_ptr<Document> &operator->() { return *Doc; }
599
600private:
601  bool isAtEnd() const { return !Doc || !*Doc; }
602
603  std::unique_ptr<Document> *Doc = nullptr;
604};
605
606} // end namespace yaml
607
608} // end namespace llvm
609
610#endif // LLVM_SUPPORT_YAMLPARSER_H
611