1//===--- YAMLParser.h - Simple YAML parser --------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This is a YAML 1.2 parser.
11//
12//  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
13//
14//  This currently does not implement the following:
15//    * Multi-line literal folding.
16//    * Tag resolution.
17//    * UTF-16.
18//    * BOMs anywhere other than the first Unicode scalar value in the file.
19//
20//  The most important class here is Stream. This represents a YAML stream with
21//  0, 1, or many documents.
22//
23//  SourceMgr sm;
24//  StringRef input = getInput();
25//  yaml::Stream stream(input, sm);
26//
27//  for (yaml::document_iterator di = stream.begin(), de = stream.end();
28//       di != de; ++di) {
29//    yaml::Node *n = di->getRoot();
30//    if (n) {
31//      // Do something with n...
32//    } else
33//      break;
34//  }
35//
36//===----------------------------------------------------------------------===//
37
38#ifndef LLVM_SUPPORT_YAMLPARSER_H
39#define LLVM_SUPPORT_YAMLPARSER_H
40
41#include "llvm/ADT/SmallString.h"
42#include "llvm/ADT/StringRef.h"
43#include "llvm/Support/Allocator.h"
44#include "llvm/Support/SMLoc.h"
45#include <limits>
46#include <map>
47#include <utility>
48
49namespace llvm {
50class MemoryBuffer;
51class SourceMgr;
52class raw_ostream;
53class Twine;
54
55namespace yaml {
56
57class document_iterator;
58class Document;
59class Node;
60class Scanner;
61struct Token;
62
63/// \brief Dump all the tokens in this stream to OS.
64/// \returns true if there was an error, false otherwise.
65bool dumpTokens(StringRef Input, raw_ostream &);
66
67/// \brief Scans all tokens in input without outputting anything. This is used
68///        for benchmarking the tokenizer.
69/// \returns true if there was an error, false otherwise.
70bool scanTokens(StringRef Input);
71
72/// \brief Escape \a Input for a double quoted scalar.
73std::string escape(StringRef Input);
74
75/// \brief This class represents a YAML stream potentially containing multiple
76///        documents.
77class Stream {
78public:
79  /// \brief This keeps a reference to the string referenced by \p Input.
80  Stream(StringRef Input, SourceMgr &);
81
82  /// \brief This takes ownership of \p InputBuffer.
83  Stream(MemoryBuffer *InputBuffer, SourceMgr &);
84  ~Stream();
85
86  document_iterator begin();
87  document_iterator end();
88  void skip();
89  bool failed();
90  bool validate() {
91    skip();
92    return !failed();
93  }
94
95  void printError(Node *N, const Twine &Msg);
96
97private:
98  std::unique_ptr<Scanner> scanner;
99  std::unique_ptr<Document> CurrentDoc;
100
101  friend class Document;
102};
103
104/// \brief Abstract base class for all Nodes.
105class Node {
106  virtual void anchor();
107
108public:
109  enum NodeKind {
110    NK_Null,
111    NK_Scalar,
112    NK_KeyValue,
113    NK_Mapping,
114    NK_Sequence,
115    NK_Alias
116  };
117
118  Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
119       StringRef Tag);
120
121  /// \brief Get the value of the anchor attached to this node. If it does not
122  ///        have one, getAnchor().size() will be 0.
123  StringRef getAnchor() const { return Anchor; }
124
125  /// \brief Get the tag as it was written in the document. This does not
126  ///   perform tag resolution.
127  StringRef getRawTag() const { return Tag; }
128
129  /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
130  ///   and substitution.
131  std::string getVerbatimTag() const;
132
133  SMRange getSourceRange() const { return SourceRange; }
134  void setSourceRange(SMRange SR) { SourceRange = SR; }
135
136  // These functions forward to Document and Scanner.
137  Token &peekNext();
138  Token getNext();
139  Node *parseBlockNode();
140  BumpPtrAllocator &getAllocator();
141  void setError(const Twine &Message, Token &Location) const;
142  bool failed() const;
143
144  virtual void skip() {}
145
146  unsigned int getType() const { return TypeID; }
147
148  void *operator new(size_t Size, BumpPtrAllocator &Alloc,
149                     size_t Alignment = 16) throw() {
150    return Alloc.Allocate(Size, Alignment);
151  }
152
153  void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t Size) throw() {
154    Alloc.Deallocate(Ptr, Size);
155  }
156
157protected:
158  std::unique_ptr<Document> &Doc;
159  SMRange SourceRange;
160
161  void operator delete(void *) throw() {}
162
163  virtual ~Node() {}
164
165private:
166  unsigned int TypeID;
167  StringRef Anchor;
168  /// \brief The tag as typed in the document.
169  StringRef Tag;
170};
171
172/// \brief A null value.
173///
174/// Example:
175///   !!null null
176class NullNode : public Node {
177  void anchor() override;
178
179public:
180  NullNode(std::unique_ptr<Document> &D)
181      : Node(NK_Null, D, StringRef(), StringRef()) {}
182
183  static inline bool classof(const Node *N) { return N->getType() == NK_Null; }
184};
185
186/// \brief A scalar node is an opaque datum that can be presented as a
187///        series of zero or more Unicode scalar values.
188///
189/// Example:
190///   Adena
191class ScalarNode : public Node {
192  void anchor() override;
193
194public:
195  ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
196             StringRef Val)
197      : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
198    SMLoc Start = SMLoc::getFromPointer(Val.begin());
199    SMLoc End = SMLoc::getFromPointer(Val.end());
200    SourceRange = SMRange(Start, End);
201  }
202
203  // Return Value without any escaping or folding or other fun YAML stuff. This
204  // is the exact bytes that are contained in the file (after conversion to
205  // utf8).
206  StringRef getRawValue() const { return Value; }
207
208  /// \brief Gets the value of this node as a StringRef.
209  ///
210  /// \param Storage is used to store the content of the returned StringRef iff
211  ///        it requires any modification from how it appeared in the source.
212  ///        This happens with escaped characters and multi-line literals.
213  StringRef getValue(SmallVectorImpl<char> &Storage) const;
214
215  static inline bool classof(const Node *N) {
216    return N->getType() == NK_Scalar;
217  }
218
219private:
220  StringRef Value;
221
222  StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
223                                 StringRef::size_type Start,
224                                 SmallVectorImpl<char> &Storage) const;
225};
226
227/// \brief A key and value pair. While not technically a Node under the YAML
228///        representation graph, it is easier to treat them this way.
229///
230/// TODO: Consider making this not a child of Node.
231///
232/// Example:
233///   Section: .text
234class KeyValueNode : public Node {
235  void anchor() override;
236
237public:
238  KeyValueNode(std::unique_ptr<Document> &D)
239      : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr),
240        Value(nullptr) {}
241
242  /// \brief Parse and return the key.
243  ///
244  /// This may be called multiple times.
245  ///
246  /// \returns The key, or nullptr if failed() == true.
247  Node *getKey();
248
249  /// \brief Parse and return the value.
250  ///
251  /// This may be called multiple times.
252  ///
253  /// \returns The value, or nullptr if failed() == true.
254  Node *getValue();
255
256  void skip() override {
257    getKey()->skip();
258    getValue()->skip();
259  }
260
261  static inline bool classof(const Node *N) {
262    return N->getType() == NK_KeyValue;
263  }
264
265private:
266  Node *Key;
267  Node *Value;
268};
269
270/// \brief This is an iterator abstraction over YAML collections shared by both
271///        sequences and maps.
272///
273/// BaseT must have a ValueT* member named CurrentEntry and a member function
274/// increment() which must set CurrentEntry to 0 to create an end iterator.
275template <class BaseT, class ValueT>
276class basic_collection_iterator
277    : public std::iterator<std::forward_iterator_tag, ValueT> {
278public:
279  basic_collection_iterator() : Base(nullptr) {}
280  basic_collection_iterator(BaseT *B) : Base(B) {}
281
282  ValueT *operator->() const {
283    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
284    return Base->CurrentEntry;
285  }
286
287  ValueT &operator*() const {
288    assert(Base && Base->CurrentEntry &&
289           "Attempted to dereference end iterator!");
290    return *Base->CurrentEntry;
291  }
292
293  operator ValueT *() const {
294    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
295    return Base->CurrentEntry;
296  }
297
298  bool operator!=(const basic_collection_iterator &Other) const {
299    if (Base != Other.Base)
300      return true;
301    return (Base && Other.Base) &&
302           Base->CurrentEntry != Other.Base->CurrentEntry;
303  }
304
305  basic_collection_iterator &operator++() {
306    assert(Base && "Attempted to advance iterator past end!");
307    Base->increment();
308    // Create an end iterator.
309    if (!Base->CurrentEntry)
310      Base = nullptr;
311    return *this;
312  }
313
314private:
315  BaseT *Base;
316};
317
318// The following two templates are used for both MappingNode and Sequence Node.
319template <class CollectionType>
320typename CollectionType::iterator begin(CollectionType &C) {
321  assert(C.IsAtBeginning && "You may only iterate over a collection once!");
322  C.IsAtBeginning = false;
323  typename CollectionType::iterator ret(&C);
324  ++ret;
325  return ret;
326}
327
328template <class CollectionType> void skip(CollectionType &C) {
329  // TODO: support skipping from the middle of a parsed collection ;/
330  assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
331  if (C.IsAtBeginning)
332    for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
333         ++i)
334      i->skip();
335}
336
337/// \brief Represents a YAML map created from either a block map for a flow map.
338///
339/// This parses the YAML stream as increment() is called.
340///
341/// Example:
342///   Name: _main
343///   Scope: Global
344class MappingNode : public Node {
345  void anchor() override;
346
347public:
348  enum MappingType {
349    MT_Block,
350    MT_Flow,
351    MT_Inline ///< An inline mapping node is used for "[key: value]".
352  };
353
354  MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
355              MappingType MT)
356      : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true),
357        IsAtEnd(false), CurrentEntry(nullptr) {}
358
359  friend class basic_collection_iterator<MappingNode, KeyValueNode>;
360  typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
361  template <class T> friend typename T::iterator yaml::begin(T &);
362  template <class T> friend void yaml::skip(T &);
363
364  iterator begin() { return yaml::begin(*this); }
365
366  iterator end() { return iterator(); }
367
368  void skip() override { yaml::skip(*this); }
369
370  static inline bool classof(const Node *N) {
371    return N->getType() == NK_Mapping;
372  }
373
374private:
375  MappingType Type;
376  bool IsAtBeginning;
377  bool IsAtEnd;
378  KeyValueNode *CurrentEntry;
379
380  void increment();
381};
382
383/// \brief Represents a YAML sequence created from either a block sequence for a
384///        flow sequence.
385///
386/// This parses the YAML stream as increment() is called.
387///
388/// Example:
389///   - Hello
390///   - World
391class SequenceNode : public Node {
392  void anchor() override;
393
394public:
395  enum SequenceType {
396    ST_Block,
397    ST_Flow,
398    // Use for:
399    //
400    // key:
401    // - val1
402    // - val2
403    //
404    // As a BlockMappingEntry and BlockEnd are not created in this case.
405    ST_Indentless
406  };
407
408  SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
409               SequenceType ST)
410      : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true),
411        IsAtEnd(false),
412        WasPreviousTokenFlowEntry(true), // Start with an imaginary ','.
413        CurrentEntry(nullptr) {}
414
415  friend class basic_collection_iterator<SequenceNode, Node>;
416  typedef basic_collection_iterator<SequenceNode, Node> iterator;
417  template <class T> friend typename T::iterator yaml::begin(T &);
418  template <class T> friend void yaml::skip(T &);
419
420  void increment();
421
422  iterator begin() { return yaml::begin(*this); }
423
424  iterator end() { return iterator(); }
425
426  void skip() override { yaml::skip(*this); }
427
428  static inline bool classof(const Node *N) {
429    return N->getType() == NK_Sequence;
430  }
431
432private:
433  SequenceType SeqType;
434  bool IsAtBeginning;
435  bool IsAtEnd;
436  bool WasPreviousTokenFlowEntry;
437  Node *CurrentEntry;
438};
439
440/// \brief Represents an alias to a Node with an anchor.
441///
442/// Example:
443///   *AnchorName
444class AliasNode : public Node {
445  void anchor() override;
446
447public:
448  AliasNode(std::unique_ptr<Document> &D, StringRef Val)
449      : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
450
451  StringRef getName() const { return Name; }
452  Node *getTarget();
453
454  static inline bool classof(const Node *N) { return N->getType() == NK_Alias; }
455
456private:
457  StringRef Name;
458};
459
460/// \brief A YAML Stream is a sequence of Documents. A document contains a root
461///        node.
462class Document {
463public:
464  /// \brief Root for parsing a node. Returns a single node.
465  Node *parseBlockNode();
466
467  Document(Stream &ParentStream);
468
469  /// \brief Finish parsing the current document and return true if there are
470  ///        more. Return false otherwise.
471  bool skip();
472
473  /// \brief Parse and return the root level node.
474  Node *getRoot() {
475    if (Root)
476      return Root;
477    return Root = parseBlockNode();
478  }
479
480  const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
481
482private:
483  friend class Node;
484  friend class document_iterator;
485
486  /// \brief Stream to read tokens from.
487  Stream &stream;
488
489  /// \brief Used to allocate nodes to. All are destroyed without calling their
490  ///        destructor when the document is destroyed.
491  BumpPtrAllocator NodeAllocator;
492
493  /// \brief The root node. Used to support skipping a partially parsed
494  ///        document.
495  Node *Root;
496
497  /// \brief Maps tag prefixes to their expansion.
498  std::map<StringRef, StringRef> TagMap;
499
500  Token &peekNext();
501  Token getNext();
502  void setError(const Twine &Message, Token &Location) const;
503  bool failed() const;
504
505  /// \brief Parse %BLAH directives and return true if any were encountered.
506  bool parseDirectives();
507
508  /// \brief Parse %YAML
509  void parseYAMLDirective();
510
511  /// \brief Parse %TAG
512  void parseTAGDirective();
513
514  /// \brief Consume the next token and error if it is not \a TK.
515  bool expectToken(int TK);
516};
517
518/// \brief Iterator abstraction for Documents over a Stream.
519class document_iterator {
520public:
521  document_iterator() : Doc(nullptr) {}
522  document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
523
524  bool operator==(const document_iterator &Other) {
525    if (isAtEnd() || Other.isAtEnd())
526      return isAtEnd() && Other.isAtEnd();
527
528    return Doc == Other.Doc;
529  }
530  bool operator!=(const document_iterator &Other) { return !(*this == Other); }
531
532  document_iterator operator++() {
533    assert(Doc && "incrementing iterator past the end.");
534    if (!(*Doc)->skip()) {
535      Doc->reset(nullptr);
536    } else {
537      Stream &S = (*Doc)->stream;
538      Doc->reset(new Document(S));
539    }
540    return *this;
541  }
542
543  Document &operator*() { return *Doc->get(); }
544
545  std::unique_ptr<Document> &operator->() { return *Doc; }
546
547private:
548  bool isAtEnd() const { return !Doc || !*Doc; }
549
550  std::unique_ptr<Document> *Doc;
551};
552
553} // End namespace yaml.
554
555} // End namespace llvm.
556
557#endif
558