1//===--- YAMLParser.h - Simple YAML parser --------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This is a YAML 1.2 parser.
11//
12//  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
13//
14//  This currently does not implement the following:
15//    * Multi-line literal folding.
16//    * Tag resolution.
17//    * UTF-16.
18//    * BOMs anywhere other than the first Unicode scalar value in the file.
19//
20//  The most important class here is Stream. This represents a YAML stream with
21//  0, 1, or many documents.
22//
23//  SourceMgr sm;
24//  StringRef input = getInput();
25//  yaml::Stream stream(input, sm);
26//
27//  for (yaml::document_iterator di = stream.begin(), de = stream.end();
28//       di != de; ++di) {
29//    yaml::Node *n = di->getRoot();
30//    if (n) {
31//      // Do something with n...
32//    } else
33//      break;
34//  }
35//
36//===----------------------------------------------------------------------===//
37
38#ifndef LLVM_SUPPORT_YAMLPARSER_H
39#define LLVM_SUPPORT_YAMLPARSER_H
40
41#include "llvm/ADT/StringRef.h"
42#include "llvm/Support/Allocator.h"
43#include "llvm/Support/SMLoc.h"
44#include <limits>
45#include <map>
46#include <utility>
47
48namespace llvm {
49class MemoryBufferRef;
50class SourceMgr;
51class Twine;
52class raw_ostream;
53
54namespace yaml {
55
56class document_iterator;
57class Document;
58class Node;
59class Scanner;
60struct Token;
61
62/// \brief Dump all the tokens in this stream to OS.
63/// \returns true if there was an error, false otherwise.
64bool dumpTokens(StringRef Input, raw_ostream &);
65
66/// \brief Scans all tokens in input without outputting anything. This is used
67///        for benchmarking the tokenizer.
68/// \returns true if there was an error, false otherwise.
69bool scanTokens(StringRef Input);
70
71/// \brief Escape \a Input for a double quoted scalar.
72std::string escape(StringRef Input);
73
74/// \brief This class represents a YAML stream potentially containing multiple
75///        documents.
76class Stream {
77public:
78  /// \brief This keeps a reference to the string referenced by \p Input.
79  Stream(StringRef Input, SourceMgr &);
80
81  Stream(MemoryBufferRef InputBuffer, SourceMgr &);
82  ~Stream();
83
84  document_iterator begin();
85  document_iterator end();
86  void skip();
87  bool failed();
88  bool validate() {
89    skip();
90    return !failed();
91  }
92
93  void printError(Node *N, const Twine &Msg);
94
95private:
96  std::unique_ptr<Scanner> scanner;
97  std::unique_ptr<Document> CurrentDoc;
98
99  friend class Document;
100};
101
102/// \brief Abstract base class for all Nodes.
103class Node {
104  virtual void anchor();
105
106public:
107  enum NodeKind {
108    NK_Null,
109    NK_Scalar,
110    NK_KeyValue,
111    NK_Mapping,
112    NK_Sequence,
113    NK_Alias
114  };
115
116  Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
117       StringRef Tag);
118
119  /// \brief Get the value of the anchor attached to this node. If it does not
120  ///        have one, getAnchor().size() will be 0.
121  StringRef getAnchor() const { return Anchor; }
122
123  /// \brief Get the tag as it was written in the document. This does not
124  ///   perform tag resolution.
125  StringRef getRawTag() const { return Tag; }
126
127  /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
128  ///   and substitution.
129  std::string getVerbatimTag() const;
130
131  SMRange getSourceRange() const { return SourceRange; }
132  void setSourceRange(SMRange SR) { SourceRange = SR; }
133
134  // These functions forward to Document and Scanner.
135  Token &peekNext();
136  Token getNext();
137  Node *parseBlockNode();
138  BumpPtrAllocator &getAllocator();
139  void setError(const Twine &Message, Token &Location) const;
140  bool failed() const;
141
142  virtual void skip() {}
143
144  unsigned int getType() const { return TypeID; }
145
146  void *operator new(size_t Size, BumpPtrAllocator &Alloc,
147                     size_t Alignment = 16) throw() {
148    return Alloc.Allocate(Size, Alignment);
149  }
150
151  void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t Size) throw() {
152    Alloc.Deallocate(Ptr, Size);
153  }
154
155protected:
156  std::unique_ptr<Document> &Doc;
157  SMRange SourceRange;
158
159  void operator delete(void *) throw() {}
160
161  virtual ~Node() {}
162
163private:
164  unsigned int TypeID;
165  StringRef Anchor;
166  /// \brief The tag as typed in the document.
167  StringRef Tag;
168};
169
170/// \brief A null value.
171///
172/// Example:
173///   !!null null
174class NullNode : public Node {
175  void anchor() override;
176
177public:
178  NullNode(std::unique_ptr<Document> &D)
179      : Node(NK_Null, D, StringRef(), StringRef()) {}
180
181  static inline bool classof(const Node *N) { return N->getType() == NK_Null; }
182};
183
184/// \brief A scalar node is an opaque datum that can be presented as a
185///        series of zero or more Unicode scalar values.
186///
187/// Example:
188///   Adena
189class ScalarNode : public Node {
190  void anchor() override;
191
192public:
193  ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
194             StringRef Val)
195      : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
196    SMLoc Start = SMLoc::getFromPointer(Val.begin());
197    SMLoc End = SMLoc::getFromPointer(Val.end());
198    SourceRange = SMRange(Start, End);
199  }
200
201  // Return Value without any escaping or folding or other fun YAML stuff. This
202  // is the exact bytes that are contained in the file (after conversion to
203  // utf8).
204  StringRef getRawValue() const { return Value; }
205
206  /// \brief Gets the value of this node as a StringRef.
207  ///
208  /// \param Storage is used to store the content of the returned StringRef iff
209  ///        it requires any modification from how it appeared in the source.
210  ///        This happens with escaped characters and multi-line literals.
211  StringRef getValue(SmallVectorImpl<char> &Storage) const;
212
213  static inline bool classof(const Node *N) {
214    return N->getType() == NK_Scalar;
215  }
216
217private:
218  StringRef Value;
219
220  StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
221                                 StringRef::size_type Start,
222                                 SmallVectorImpl<char> &Storage) const;
223};
224
225/// \brief A key and value pair. While not technically a Node under the YAML
226///        representation graph, it is easier to treat them this way.
227///
228/// TODO: Consider making this not a child of Node.
229///
230/// Example:
231///   Section: .text
232class KeyValueNode : public Node {
233  void anchor() override;
234
235public:
236  KeyValueNode(std::unique_ptr<Document> &D)
237      : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr),
238        Value(nullptr) {}
239
240  /// \brief Parse and return the key.
241  ///
242  /// This may be called multiple times.
243  ///
244  /// \returns The key, or nullptr if failed() == true.
245  Node *getKey();
246
247  /// \brief Parse and return the value.
248  ///
249  /// This may be called multiple times.
250  ///
251  /// \returns The value, or nullptr if failed() == true.
252  Node *getValue();
253
254  void skip() override {
255    getKey()->skip();
256    getValue()->skip();
257  }
258
259  static inline bool classof(const Node *N) {
260    return N->getType() == NK_KeyValue;
261  }
262
263private:
264  Node *Key;
265  Node *Value;
266};
267
268/// \brief This is an iterator abstraction over YAML collections shared by both
269///        sequences and maps.
270///
271/// BaseT must have a ValueT* member named CurrentEntry and a member function
272/// increment() which must set CurrentEntry to 0 to create an end iterator.
273template <class BaseT, class ValueT>
274class basic_collection_iterator
275    : public std::iterator<std::forward_iterator_tag, ValueT> {
276public:
277  basic_collection_iterator() : Base(nullptr) {}
278  basic_collection_iterator(BaseT *B) : Base(B) {}
279
280  ValueT *operator->() const {
281    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
282    return Base->CurrentEntry;
283  }
284
285  ValueT &operator*() const {
286    assert(Base && Base->CurrentEntry &&
287           "Attempted to dereference end iterator!");
288    return *Base->CurrentEntry;
289  }
290
291  operator ValueT *() const {
292    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
293    return Base->CurrentEntry;
294  }
295
296  bool operator!=(const basic_collection_iterator &Other) const {
297    if (Base != Other.Base)
298      return true;
299    return (Base && Other.Base) &&
300           Base->CurrentEntry != Other.Base->CurrentEntry;
301  }
302
303  basic_collection_iterator &operator++() {
304    assert(Base && "Attempted to advance iterator past end!");
305    Base->increment();
306    // Create an end iterator.
307    if (!Base->CurrentEntry)
308      Base = nullptr;
309    return *this;
310  }
311
312private:
313  BaseT *Base;
314};
315
316// The following two templates are used for both MappingNode and Sequence Node.
317template <class CollectionType>
318typename CollectionType::iterator begin(CollectionType &C) {
319  assert(C.IsAtBeginning && "You may only iterate over a collection once!");
320  C.IsAtBeginning = false;
321  typename CollectionType::iterator ret(&C);
322  ++ret;
323  return ret;
324}
325
326template <class CollectionType> void skip(CollectionType &C) {
327  // TODO: support skipping from the middle of a parsed collection ;/
328  assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
329  if (C.IsAtBeginning)
330    for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
331         ++i)
332      i->skip();
333}
334
335/// \brief Represents a YAML map created from either a block map for a flow map.
336///
337/// This parses the YAML stream as increment() is called.
338///
339/// Example:
340///   Name: _main
341///   Scope: Global
342class MappingNode : public Node {
343  void anchor() override;
344
345public:
346  enum MappingType {
347    MT_Block,
348    MT_Flow,
349    MT_Inline ///< An inline mapping node is used for "[key: value]".
350  };
351
352  MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
353              MappingType MT)
354      : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true),
355        IsAtEnd(false), CurrentEntry(nullptr) {}
356
357  friend class basic_collection_iterator<MappingNode, KeyValueNode>;
358  typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
359  template <class T> friend typename T::iterator yaml::begin(T &);
360  template <class T> friend void yaml::skip(T &);
361
362  iterator begin() { return yaml::begin(*this); }
363
364  iterator end() { return iterator(); }
365
366  void skip() override { yaml::skip(*this); }
367
368  static inline bool classof(const Node *N) {
369    return N->getType() == NK_Mapping;
370  }
371
372private:
373  MappingType Type;
374  bool IsAtBeginning;
375  bool IsAtEnd;
376  KeyValueNode *CurrentEntry;
377
378  void increment();
379};
380
381/// \brief Represents a YAML sequence created from either a block sequence for a
382///        flow sequence.
383///
384/// This parses the YAML stream as increment() is called.
385///
386/// Example:
387///   - Hello
388///   - World
389class SequenceNode : public Node {
390  void anchor() override;
391
392public:
393  enum SequenceType {
394    ST_Block,
395    ST_Flow,
396    // Use for:
397    //
398    // key:
399    // - val1
400    // - val2
401    //
402    // As a BlockMappingEntry and BlockEnd are not created in this case.
403    ST_Indentless
404  };
405
406  SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
407               SequenceType ST)
408      : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true),
409        IsAtEnd(false),
410        WasPreviousTokenFlowEntry(true), // Start with an imaginary ','.
411        CurrentEntry(nullptr) {}
412
413  friend class basic_collection_iterator<SequenceNode, Node>;
414  typedef basic_collection_iterator<SequenceNode, Node> iterator;
415  template <class T> friend typename T::iterator yaml::begin(T &);
416  template <class T> friend void yaml::skip(T &);
417
418  void increment();
419
420  iterator begin() { return yaml::begin(*this); }
421
422  iterator end() { return iterator(); }
423
424  void skip() override { yaml::skip(*this); }
425
426  static inline bool classof(const Node *N) {
427    return N->getType() == NK_Sequence;
428  }
429
430private:
431  SequenceType SeqType;
432  bool IsAtBeginning;
433  bool IsAtEnd;
434  bool WasPreviousTokenFlowEntry;
435  Node *CurrentEntry;
436};
437
438/// \brief Represents an alias to a Node with an anchor.
439///
440/// Example:
441///   *AnchorName
442class AliasNode : public Node {
443  void anchor() override;
444
445public:
446  AliasNode(std::unique_ptr<Document> &D, StringRef Val)
447      : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
448
449  StringRef getName() const { return Name; }
450  Node *getTarget();
451
452  static inline bool classof(const Node *N) { return N->getType() == NK_Alias; }
453
454private:
455  StringRef Name;
456};
457
458/// \brief A YAML Stream is a sequence of Documents. A document contains a root
459///        node.
460class Document {
461public:
462  /// \brief Root for parsing a node. Returns a single node.
463  Node *parseBlockNode();
464
465  Document(Stream &ParentStream);
466
467  /// \brief Finish parsing the current document and return true if there are
468  ///        more. Return false otherwise.
469  bool skip();
470
471  /// \brief Parse and return the root level node.
472  Node *getRoot() {
473    if (Root)
474      return Root;
475    return Root = parseBlockNode();
476  }
477
478  const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
479
480private:
481  friend class Node;
482  friend class document_iterator;
483
484  /// \brief Stream to read tokens from.
485  Stream &stream;
486
487  /// \brief Used to allocate nodes to. All are destroyed without calling their
488  ///        destructor when the document is destroyed.
489  BumpPtrAllocator NodeAllocator;
490
491  /// \brief The root node. Used to support skipping a partially parsed
492  ///        document.
493  Node *Root;
494
495  /// \brief Maps tag prefixes to their expansion.
496  std::map<StringRef, StringRef> TagMap;
497
498  Token &peekNext();
499  Token getNext();
500  void setError(const Twine &Message, Token &Location) const;
501  bool failed() const;
502
503  /// \brief Parse %BLAH directives and return true if any were encountered.
504  bool parseDirectives();
505
506  /// \brief Parse %YAML
507  void parseYAMLDirective();
508
509  /// \brief Parse %TAG
510  void parseTAGDirective();
511
512  /// \brief Consume the next token and error if it is not \a TK.
513  bool expectToken(int TK);
514};
515
516/// \brief Iterator abstraction for Documents over a Stream.
517class document_iterator {
518public:
519  document_iterator() : Doc(nullptr) {}
520  document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
521
522  bool operator==(const document_iterator &Other) {
523    if (isAtEnd() || Other.isAtEnd())
524      return isAtEnd() && Other.isAtEnd();
525
526    return Doc == Other.Doc;
527  }
528  bool operator!=(const document_iterator &Other) { return !(*this == Other); }
529
530  document_iterator operator++() {
531    assert(Doc && "incrementing iterator past the end.");
532    if (!(*Doc)->skip()) {
533      Doc->reset(nullptr);
534    } else {
535      Stream &S = (*Doc)->stream;
536      Doc->reset(new Document(S));
537    }
538    return *this;
539  }
540
541  Document &operator*() { return *Doc->get(); }
542
543  std::unique_ptr<Document> &operator->() { return *Doc; }
544
545private:
546  bool isAtEnd() const { return !Doc || !*Doc; }
547
548  std::unique_ptr<Document> *Doc;
549};
550
551} // End namespace yaml.
552
553} // End namespace llvm.
554
555#endif
556