1//===--- YAMLParser.h - Simple YAML parser --------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This is a YAML 1.2 parser.
11//
12//  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
13//
14//  This currently does not implement the following:
15//    * Multi-line literal folding.
16//    * Tag resolution.
17//    * UTF-16.
18//    * BOMs anywhere other than the first Unicode scalar value in the file.
19//
20//  The most important class here is Stream. This represents a YAML stream with
21//  0, 1, or many documents.
22//
23//  SourceMgr sm;
24//  StringRef input = getInput();
25//  yaml::Stream stream(input, sm);
26//
27//  for (yaml::document_iterator di = stream.begin(), de = stream.end();
28//       di != de; ++di) {
29//    yaml::Node *n = di->getRoot();
30//    if (n) {
31//      // Do something with n...
32//    } else
33//      break;
34//  }
35//
36//===----------------------------------------------------------------------===//
37
38#ifndef LLVM_SUPPORT_YAMLPARSER_H
39#define LLVM_SUPPORT_YAMLPARSER_H
40
41#include "llvm/ADT/StringRef.h"
42#include "llvm/Support/Allocator.h"
43#include "llvm/Support/SMLoc.h"
44#include <limits>
45#include <map>
46#include <utility>
47
48namespace llvm {
49class MemoryBufferRef;
50class SourceMgr;
51class Twine;
52class raw_ostream;
53
54namespace yaml {
55
56class document_iterator;
57class Document;
58class Node;
59class Scanner;
60struct Token;
61
62/// \brief Dump all the tokens in this stream to OS.
63/// \returns true if there was an error, false otherwise.
64bool dumpTokens(StringRef Input, raw_ostream &);
65
66/// \brief Scans all tokens in input without outputting anything. This is used
67///        for benchmarking the tokenizer.
68/// \returns true if there was an error, false otherwise.
69bool scanTokens(StringRef Input);
70
71/// \brief Escape \a Input for a double quoted scalar.
72std::string escape(StringRef Input);
73
74/// \brief This class represents a YAML stream potentially containing multiple
75///        documents.
76class Stream {
77public:
78  /// \brief This keeps a reference to the string referenced by \p Input.
79  Stream(StringRef Input, SourceMgr &, bool ShowColors = true);
80
81  Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true);
82  ~Stream();
83
84  document_iterator begin();
85  document_iterator end();
86  void skip();
87  bool failed();
88  bool validate() {
89    skip();
90    return !failed();
91  }
92
93  void printError(Node *N, const Twine &Msg);
94
95private:
96  std::unique_ptr<Scanner> scanner;
97  std::unique_ptr<Document> CurrentDoc;
98
99  friend class Document;
100};
101
102/// \brief Abstract base class for all Nodes.
103class Node {
104  virtual void anchor();
105
106public:
107  enum NodeKind {
108    NK_Null,
109    NK_Scalar,
110    NK_BlockScalar,
111    NK_KeyValue,
112    NK_Mapping,
113    NK_Sequence,
114    NK_Alias
115  };
116
117  Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
118       StringRef Tag);
119
120  /// \brief Get the value of the anchor attached to this node. If it does not
121  ///        have one, getAnchor().size() will be 0.
122  StringRef getAnchor() const { return Anchor; }
123
124  /// \brief Get the tag as it was written in the document. This does not
125  ///   perform tag resolution.
126  StringRef getRawTag() const { return Tag; }
127
128  /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
129  ///   and substitution.
130  std::string getVerbatimTag() const;
131
132  SMRange getSourceRange() const { return SourceRange; }
133  void setSourceRange(SMRange SR) { SourceRange = SR; }
134
135  // These functions forward to Document and Scanner.
136  Token &peekNext();
137  Token getNext();
138  Node *parseBlockNode();
139  BumpPtrAllocator &getAllocator();
140  void setError(const Twine &Message, Token &Location) const;
141  bool failed() const;
142
143  virtual void skip() {}
144
145  unsigned int getType() const { return TypeID; }
146
147  void *operator new(size_t Size, BumpPtrAllocator &Alloc,
148                     size_t Alignment = 16) LLVM_NOEXCEPT {
149    return Alloc.Allocate(Size, Alignment);
150  }
151
152  void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
153                       size_t Size) LLVM_NOEXCEPT {
154    Alloc.Deallocate(Ptr, Size);
155  }
156
157protected:
158  std::unique_ptr<Document> &Doc;
159  SMRange SourceRange;
160
161  void operator delete(void *) LLVM_NOEXCEPT = delete;
162
163  ~Node() = default;
164
165private:
166  unsigned int TypeID;
167  StringRef Anchor;
168  /// \brief The tag as typed in the document.
169  StringRef Tag;
170};
171
172/// \brief A null value.
173///
174/// Example:
175///   !!null null
176class NullNode final : public Node {
177  void anchor() override;
178
179public:
180  NullNode(std::unique_ptr<Document> &D)
181      : Node(NK_Null, D, StringRef(), StringRef()) {}
182
183  static inline bool classof(const Node *N) { return N->getType() == NK_Null; }
184};
185
186/// \brief A scalar node is an opaque datum that can be presented as a
187///        series of zero or more Unicode scalar values.
188///
189/// Example:
190///   Adena
191class ScalarNode final : public Node {
192  void anchor() override;
193
194public:
195  ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
196             StringRef Val)
197      : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
198    SMLoc Start = SMLoc::getFromPointer(Val.begin());
199    SMLoc End = SMLoc::getFromPointer(Val.end());
200    SourceRange = SMRange(Start, End);
201  }
202
203  // Return Value without any escaping or folding or other fun YAML stuff. This
204  // is the exact bytes that are contained in the file (after conversion to
205  // utf8).
206  StringRef getRawValue() const { return Value; }
207
208  /// \brief Gets the value of this node as a StringRef.
209  ///
210  /// \param Storage is used to store the content of the returned StringRef iff
211  ///        it requires any modification from how it appeared in the source.
212  ///        This happens with escaped characters and multi-line literals.
213  StringRef getValue(SmallVectorImpl<char> &Storage) const;
214
215  static inline bool classof(const Node *N) {
216    return N->getType() == NK_Scalar;
217  }
218
219private:
220  StringRef Value;
221
222  StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
223                                 StringRef::size_type Start,
224                                 SmallVectorImpl<char> &Storage) const;
225};
226
227/// \brief A block scalar node is an opaque datum that can be presented as a
228///        series of zero or more Unicode scalar values.
229///
230/// Example:
231///   |
232///     Hello
233///     World
234class BlockScalarNode final : public Node {
235  void anchor() override;
236
237public:
238  BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
239                  StringRef Value, StringRef RawVal)
240      : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
241    SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
242    SMLoc End = SMLoc::getFromPointer(RawVal.end());
243    SourceRange = SMRange(Start, End);
244  }
245
246  /// \brief Gets the value of this node as a StringRef.
247  StringRef getValue() const { return Value; }
248
249  static inline bool classof(const Node *N) {
250    return N->getType() == NK_BlockScalar;
251  }
252
253private:
254  StringRef Value;
255};
256
257/// \brief A key and value pair. While not technically a Node under the YAML
258///        representation graph, it is easier to treat them this way.
259///
260/// TODO: Consider making this not a child of Node.
261///
262/// Example:
263///   Section: .text
264class KeyValueNode final : public Node {
265  void anchor() override;
266
267public:
268  KeyValueNode(std::unique_ptr<Document> &D)
269      : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr),
270        Value(nullptr) {}
271
272  /// \brief Parse and return the key.
273  ///
274  /// This may be called multiple times.
275  ///
276  /// \returns The key, or nullptr if failed() == true.
277  Node *getKey();
278
279  /// \brief Parse and return the value.
280  ///
281  /// This may be called multiple times.
282  ///
283  /// \returns The value, or nullptr if failed() == true.
284  Node *getValue();
285
286  void skip() override {
287    getKey()->skip();
288    if (Node *Val = getValue())
289      Val->skip();
290  }
291
292  static inline bool classof(const Node *N) {
293    return N->getType() == NK_KeyValue;
294  }
295
296private:
297  Node *Key;
298  Node *Value;
299};
300
301/// \brief This is an iterator abstraction over YAML collections shared by both
302///        sequences and maps.
303///
304/// BaseT must have a ValueT* member named CurrentEntry and a member function
305/// increment() which must set CurrentEntry to 0 to create an end iterator.
306template <class BaseT, class ValueT>
307class basic_collection_iterator
308    : public std::iterator<std::forward_iterator_tag, ValueT> {
309public:
310  basic_collection_iterator() : Base(nullptr) {}
311  basic_collection_iterator(BaseT *B) : Base(B) {}
312
313  ValueT *operator->() const {
314    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
315    return Base->CurrentEntry;
316  }
317
318  ValueT &operator*() const {
319    assert(Base && Base->CurrentEntry &&
320           "Attempted to dereference end iterator!");
321    return *Base->CurrentEntry;
322  }
323
324  operator ValueT *() const {
325    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
326    return Base->CurrentEntry;
327  }
328
329  bool operator!=(const basic_collection_iterator &Other) const {
330    if (Base != Other.Base)
331      return true;
332    return (Base && Other.Base) &&
333           Base->CurrentEntry != Other.Base->CurrentEntry;
334  }
335
336  basic_collection_iterator &operator++() {
337    assert(Base && "Attempted to advance iterator past end!");
338    Base->increment();
339    // Create an end iterator.
340    if (!Base->CurrentEntry)
341      Base = nullptr;
342    return *this;
343  }
344
345private:
346  BaseT *Base;
347};
348
349// The following two templates are used for both MappingNode and Sequence Node.
350template <class CollectionType>
351typename CollectionType::iterator begin(CollectionType &C) {
352  assert(C.IsAtBeginning && "You may only iterate over a collection once!");
353  C.IsAtBeginning = false;
354  typename CollectionType::iterator ret(&C);
355  ++ret;
356  return ret;
357}
358
359template <class CollectionType> void skip(CollectionType &C) {
360  // TODO: support skipping from the middle of a parsed collection ;/
361  assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
362  if (C.IsAtBeginning)
363    for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
364         ++i)
365      i->skip();
366}
367
368/// \brief Represents a YAML map created from either a block map for a flow map.
369///
370/// This parses the YAML stream as increment() is called.
371///
372/// Example:
373///   Name: _main
374///   Scope: Global
375class MappingNode final : public Node {
376  void anchor() override;
377
378public:
379  enum MappingType {
380    MT_Block,
381    MT_Flow,
382    MT_Inline ///< An inline mapping node is used for "[key: value]".
383  };
384
385  MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
386              MappingType MT)
387      : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true),
388        IsAtEnd(false), CurrentEntry(nullptr) {}
389
390  friend class basic_collection_iterator<MappingNode, KeyValueNode>;
391  typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
392  template <class T> friend typename T::iterator yaml::begin(T &);
393  template <class T> friend void yaml::skip(T &);
394
395  iterator begin() { return yaml::begin(*this); }
396
397  iterator end() { return iterator(); }
398
399  void skip() override { yaml::skip(*this); }
400
401  static inline bool classof(const Node *N) {
402    return N->getType() == NK_Mapping;
403  }
404
405private:
406  MappingType Type;
407  bool IsAtBeginning;
408  bool IsAtEnd;
409  KeyValueNode *CurrentEntry;
410
411  void increment();
412};
413
414/// \brief Represents a YAML sequence created from either a block sequence for a
415///        flow sequence.
416///
417/// This parses the YAML stream as increment() is called.
418///
419/// Example:
420///   - Hello
421///   - World
422class SequenceNode final : public Node {
423  void anchor() override;
424
425public:
426  enum SequenceType {
427    ST_Block,
428    ST_Flow,
429    // Use for:
430    //
431    // key:
432    // - val1
433    // - val2
434    //
435    // As a BlockMappingEntry and BlockEnd are not created in this case.
436    ST_Indentless
437  };
438
439  SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
440               SequenceType ST)
441      : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true),
442        IsAtEnd(false),
443        WasPreviousTokenFlowEntry(true), // Start with an imaginary ','.
444        CurrentEntry(nullptr) {}
445
446  friend class basic_collection_iterator<SequenceNode, Node>;
447  typedef basic_collection_iterator<SequenceNode, Node> iterator;
448  template <class T> friend typename T::iterator yaml::begin(T &);
449  template <class T> friend void yaml::skip(T &);
450
451  void increment();
452
453  iterator begin() { return yaml::begin(*this); }
454
455  iterator end() { return iterator(); }
456
457  void skip() override { yaml::skip(*this); }
458
459  static inline bool classof(const Node *N) {
460    return N->getType() == NK_Sequence;
461  }
462
463private:
464  SequenceType SeqType;
465  bool IsAtBeginning;
466  bool IsAtEnd;
467  bool WasPreviousTokenFlowEntry;
468  Node *CurrentEntry;
469};
470
471/// \brief Represents an alias to a Node with an anchor.
472///
473/// Example:
474///   *AnchorName
475class AliasNode final : public Node {
476  void anchor() override;
477
478public:
479  AliasNode(std::unique_ptr<Document> &D, StringRef Val)
480      : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
481
482  StringRef getName() const { return Name; }
483  Node *getTarget();
484
485  static inline bool classof(const Node *N) { return N->getType() == NK_Alias; }
486
487private:
488  StringRef Name;
489};
490
491/// \brief A YAML Stream is a sequence of Documents. A document contains a root
492///        node.
493class Document {
494public:
495  /// \brief Root for parsing a node. Returns a single node.
496  Node *parseBlockNode();
497
498  Document(Stream &ParentStream);
499
500  /// \brief Finish parsing the current document and return true if there are
501  ///        more. Return false otherwise.
502  bool skip();
503
504  /// \brief Parse and return the root level node.
505  Node *getRoot() {
506    if (Root)
507      return Root;
508    return Root = parseBlockNode();
509  }
510
511  const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
512
513private:
514  friend class Node;
515  friend class document_iterator;
516
517  /// \brief Stream to read tokens from.
518  Stream &stream;
519
520  /// \brief Used to allocate nodes to. All are destroyed without calling their
521  ///        destructor when the document is destroyed.
522  BumpPtrAllocator NodeAllocator;
523
524  /// \brief The root node. Used to support skipping a partially parsed
525  ///        document.
526  Node *Root;
527
528  /// \brief Maps tag prefixes to their expansion.
529  std::map<StringRef, StringRef> TagMap;
530
531  Token &peekNext();
532  Token getNext();
533  void setError(const Twine &Message, Token &Location) const;
534  bool failed() const;
535
536  /// \brief Parse %BLAH directives and return true if any were encountered.
537  bool parseDirectives();
538
539  /// \brief Parse %YAML
540  void parseYAMLDirective();
541
542  /// \brief Parse %TAG
543  void parseTAGDirective();
544
545  /// \brief Consume the next token and error if it is not \a TK.
546  bool expectToken(int TK);
547};
548
549/// \brief Iterator abstraction for Documents over a Stream.
550class document_iterator {
551public:
552  document_iterator() : Doc(nullptr) {}
553  document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
554
555  bool operator==(const document_iterator &Other) {
556    if (isAtEnd() || Other.isAtEnd())
557      return isAtEnd() && Other.isAtEnd();
558
559    return Doc == Other.Doc;
560  }
561  bool operator!=(const document_iterator &Other) { return !(*this == Other); }
562
563  document_iterator operator++() {
564    assert(Doc && "incrementing iterator past the end.");
565    if (!(*Doc)->skip()) {
566      Doc->reset(nullptr);
567    } else {
568      Stream &S = (*Doc)->stream;
569      Doc->reset(new Document(S));
570    }
571    return *this;
572  }
573
574  Document &operator*() { return *Doc->get(); }
575
576  std::unique_ptr<Document> &operator->() { return *Doc; }
577
578private:
579  bool isAtEnd() const { return !Doc || !*Doc; }
580
581  std::unique_ptr<Document> *Doc;
582};
583
584} // End namespace yaml.
585
586} // End namespace llvm.
587
588#endif
589