1//===--- YAMLParser.h - Simple YAML parser --------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This is a YAML 1.2 parser.
11//
12//  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
13//
14//  This currently does not implement the following:
15//    * Multi-line literal folding.
16//    * Tag resolution.
17//    * UTF-16.
18//    * BOMs anywhere other than the first Unicode scalar value in the file.
19//
20//  The most important class here is Stream. This represents a YAML stream with
21//  0, 1, or many documents.
22//
23//  SourceMgr sm;
24//  StringRef input = getInput();
25//  yaml::Stream stream(input, sm);
26//
27//  for (yaml::document_iterator di = stream.begin(), de = stream.end();
28//       di != de; ++di) {
29//    yaml::Node *n = di->getRoot();
30//    if (n) {
31//      // Do something with n...
32//    } else
33//      break;
34//  }
35//
36//===----------------------------------------------------------------------===//
37
38#ifndef LLVM_SUPPORT_YAML_PARSER_H
39#define LLVM_SUPPORT_YAML_PARSER_H
40
41#include "llvm/ADT/OwningPtr.h"
42#include "llvm/ADT/SmallString.h"
43#include "llvm/ADT/StringRef.h"
44#include "llvm/Support/Allocator.h"
45#include "llvm/Support/SMLoc.h"
46
47#include <limits>
48#include <utility>
49
50namespace llvm {
51class MemoryBuffer;
52class SourceMgr;
53class raw_ostream;
54class Twine;
55
56namespace yaml {
57
58class document_iterator;
59class Document;
60class Node;
61class Scanner;
62struct Token;
63
64/// @brief Dump all the tokens in this stream to OS.
65/// @returns true if there was an error, false otherwise.
66bool dumpTokens(StringRef Input, raw_ostream &);
67
68/// @brief Scans all tokens in input without outputting anything. This is used
69///        for benchmarking the tokenizer.
70/// @returns true if there was an error, false otherwise.
71bool scanTokens(StringRef Input);
72
73/// @brief Escape \a Input for a double quoted scalar.
74std::string escape(StringRef Input);
75
76/// @brief This class represents a YAML stream potentially containing multiple
77///        documents.
78class Stream {
79public:
80  Stream(StringRef Input, SourceMgr &);
81  ~Stream();
82
83  document_iterator begin();
84  document_iterator end();
85  void skip();
86  bool failed();
87  bool validate() {
88    skip();
89    return !failed();
90  }
91
92  void printError(Node *N, const Twine &Msg);
93
94private:
95  OwningPtr<Scanner> scanner;
96  OwningPtr<Document> CurrentDoc;
97
98  friend class Document;
99
100  /// @brief Validate a %YAML x.x directive.
101  void handleYAMLDirective(const Token &);
102};
103
104/// @brief Abstract base class for all Nodes.
105class Node {
106public:
107  enum NodeKind {
108    NK_Null,
109    NK_Scalar,
110    NK_KeyValue,
111    NK_Mapping,
112    NK_Sequence,
113    NK_Alias
114  };
115
116  Node(unsigned int Type, OwningPtr<Document>&, StringRef Anchor);
117
118  /// @brief Get the value of the anchor attached to this node. If it does not
119  ///        have one, getAnchor().size() will be 0.
120  StringRef getAnchor() const { return Anchor; }
121
122  SMRange getSourceRange() const { return SourceRange; }
123  void setSourceRange(SMRange SR) { SourceRange = SR; }
124
125  // These functions forward to Document and Scanner.
126  Token &peekNext();
127  Token getNext();
128  Node *parseBlockNode();
129  BumpPtrAllocator &getAllocator();
130  void setError(const Twine &Message, Token &Location) const;
131  bool failed() const;
132
133  virtual void skip() {}
134
135  unsigned int getType() const { return TypeID; }
136  static inline bool classof(const Node *) { return true; }
137
138  void *operator new ( size_t Size
139                     , BumpPtrAllocator &Alloc
140                     , size_t Alignment = 16) throw() {
141    return Alloc.Allocate(Size, Alignment);
142  }
143
144  void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t) throw() {
145    Alloc.Deallocate(Ptr);
146  }
147
148protected:
149  OwningPtr<Document> &Doc;
150  SMRange SourceRange;
151
152  void operator delete(void *) throw() {}
153
154  virtual ~Node() {}
155
156private:
157  unsigned int TypeID;
158  StringRef Anchor;
159};
160
161/// @brief A null value.
162///
163/// Example:
164///   !!null null
165class NullNode : public Node {
166public:
167  NullNode(OwningPtr<Document> &D) : Node(NK_Null, D, StringRef()) {}
168
169  static inline bool classof(const NullNode *) { return true; }
170  static inline bool classof(const Node *N) {
171    return N->getType() == NK_Null;
172  }
173};
174
175/// @brief A scalar node is an opaque datum that can be presented as a
176///        series of zero or more Unicode scalar values.
177///
178/// Example:
179///   Adena
180class ScalarNode : public Node {
181public:
182  ScalarNode(OwningPtr<Document> &D, StringRef Anchor, StringRef Val)
183    : Node(NK_Scalar, D, Anchor)
184    , Value(Val) {
185    SMLoc Start = SMLoc::getFromPointer(Val.begin());
186    SMLoc End = SMLoc::getFromPointer(Val.end() - 1);
187    SourceRange = SMRange(Start, End);
188  }
189
190  // Return Value without any escaping or folding or other fun YAML stuff. This
191  // is the exact bytes that are contained in the file (after conversion to
192  // utf8).
193  StringRef getRawValue() const { return Value; }
194
195  /// @brief Gets the value of this node as a StringRef.
196  ///
197  /// @param Storage is used to store the content of the returned StringRef iff
198  ///        it requires any modification from how it appeared in the source.
199  ///        This happens with escaped characters and multi-line literals.
200  StringRef getValue(SmallVectorImpl<char> &Storage) const;
201
202  static inline bool classof(const ScalarNode *) { return true; }
203  static inline bool classof(const Node *N) {
204    return N->getType() == NK_Scalar;
205  }
206
207private:
208  StringRef Value;
209
210  StringRef unescapeDoubleQuoted( StringRef UnquotedValue
211                                , StringRef::size_type Start
212                                , SmallVectorImpl<char> &Storage) const;
213};
214
215/// @brief A key and value pair. While not technically a Node under the YAML
216///        representation graph, it is easier to treat them this way.
217///
218/// TODO: Consider making this not a child of Node.
219///
220/// Example:
221///   Section: .text
222class KeyValueNode : public Node {
223public:
224  KeyValueNode(OwningPtr<Document> &D)
225    : Node(NK_KeyValue, D, StringRef())
226    , Key(0)
227    , Value(0)
228  {}
229
230  /// @brief Parse and return the key.
231  ///
232  /// This may be called multiple times.
233  ///
234  /// @returns The key, or nullptr if failed() == true.
235  Node *getKey();
236
237  /// @brief Parse and return the value.
238  ///
239  /// This may be called multiple times.
240  ///
241  /// @returns The value, or nullptr if failed() == true.
242  Node *getValue();
243
244  virtual void skip() {
245    getKey()->skip();
246    getValue()->skip();
247  }
248
249  static inline bool classof(const KeyValueNode *) { return true; }
250  static inline bool classof(const Node *N) {
251    return N->getType() == NK_KeyValue;
252  }
253
254private:
255  Node *Key;
256  Node *Value;
257};
258
259/// @brief This is an iterator abstraction over YAML collections shared by both
260///        sequences and maps.
261///
262/// BaseT must have a ValueT* member named CurrentEntry and a member function
263/// increment() which must set CurrentEntry to 0 to create an end iterator.
264template <class BaseT, class ValueT>
265class basic_collection_iterator
266  : public std::iterator<std::forward_iterator_tag, ValueT> {
267public:
268  basic_collection_iterator() : Base(0) {}
269  basic_collection_iterator(BaseT *B) : Base(B) {}
270
271  ValueT *operator ->() const {
272    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
273    return Base->CurrentEntry;
274  }
275
276  ValueT &operator *() const {
277    assert(Base && Base->CurrentEntry &&
278           "Attempted to dereference end iterator!");
279    return *Base->CurrentEntry;
280  }
281
282  operator ValueT*() const {
283    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
284    return Base->CurrentEntry;
285  }
286
287  bool operator !=(const basic_collection_iterator &Other) const {
288    if(Base != Other.Base)
289      return true;
290    return (Base && Other.Base) && Base->CurrentEntry
291                                   != Other.Base->CurrentEntry;
292  }
293
294  basic_collection_iterator &operator++() {
295    assert(Base && "Attempted to advance iterator past end!");
296    Base->increment();
297    // Create an end iterator.
298    if (Base->CurrentEntry == 0)
299      Base = 0;
300    return *this;
301  }
302
303private:
304  BaseT *Base;
305};
306
307// The following two templates are used for both MappingNode and Sequence Node.
308template <class CollectionType>
309typename CollectionType::iterator begin(CollectionType &C) {
310  assert(C.IsAtBeginning && "You may only iterate over a collection once!");
311  C.IsAtBeginning = false;
312  typename CollectionType::iterator ret(&C);
313  ++ret;
314  return ret;
315}
316
317template <class CollectionType>
318void skip(CollectionType &C) {
319  // TODO: support skipping from the middle of a parsed collection ;/
320  assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
321  if (C.IsAtBeginning)
322    for (typename CollectionType::iterator i = begin(C), e = C.end();
323                                           i != e; ++i)
324      i->skip();
325}
326
327/// @brief Represents a YAML map created from either a block map for a flow map.
328///
329/// This parses the YAML stream as increment() is called.
330///
331/// Example:
332///   Name: _main
333///   Scope: Global
334class MappingNode : public Node {
335public:
336  enum MappingType {
337    MT_Block,
338    MT_Flow,
339    MT_Inline ///< An inline mapping node is used for "[key: value]".
340  };
341
342  MappingNode(OwningPtr<Document> &D, StringRef Anchor, MappingType MT)
343    : Node(NK_Mapping, D, Anchor)
344    , Type(MT)
345    , IsAtBeginning(true)
346    , IsAtEnd(false)
347    , CurrentEntry(0)
348  {}
349
350  friend class basic_collection_iterator<MappingNode, KeyValueNode>;
351  typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
352  template <class T> friend typename T::iterator yaml::begin(T &);
353  template <class T> friend void yaml::skip(T &);
354
355  iterator begin() {
356    return yaml::begin(*this);
357  }
358
359  iterator end() { return iterator(); }
360
361  virtual void skip() {
362    yaml::skip(*this);
363  }
364
365  static inline bool classof(const MappingNode *) { return true; }
366  static inline bool classof(const Node *N) {
367    return N->getType() == NK_Mapping;
368  }
369
370private:
371  MappingType Type;
372  bool IsAtBeginning;
373  bool IsAtEnd;
374  KeyValueNode *CurrentEntry;
375
376  void increment();
377};
378
379/// @brief Represents a YAML sequence created from either a block sequence for a
380///        flow sequence.
381///
382/// This parses the YAML stream as increment() is called.
383///
384/// Example:
385///   - Hello
386///   - World
387class SequenceNode : public Node {
388public:
389  enum SequenceType {
390    ST_Block,
391    ST_Flow,
392    // Use for:
393    //
394    // key:
395    // - val1
396    // - val2
397    //
398    // As a BlockMappingEntry and BlockEnd are not created in this case.
399    ST_Indentless
400  };
401
402  SequenceNode(OwningPtr<Document> &D, StringRef Anchor, SequenceType ST)
403    : Node(NK_Sequence, D, Anchor)
404    , SeqType(ST)
405    , IsAtBeginning(true)
406    , IsAtEnd(false)
407    , WasPreviousTokenFlowEntry(true) // Start with an imaginary ','.
408    , CurrentEntry(0)
409  {}
410
411  friend class basic_collection_iterator<SequenceNode, Node>;
412  typedef basic_collection_iterator<SequenceNode, Node> iterator;
413  template <class T> friend typename T::iterator yaml::begin(T &);
414  template <class T> friend void yaml::skip(T &);
415
416  void increment();
417
418  iterator begin() {
419    return yaml::begin(*this);
420  }
421
422  iterator end() { return iterator(); }
423
424  virtual void skip() {
425    yaml::skip(*this);
426  }
427
428  static inline bool classof(const SequenceNode *) { return true; }
429  static inline bool classof(const Node *N) {
430    return N->getType() == NK_Sequence;
431  }
432
433private:
434  SequenceType SeqType;
435  bool IsAtBeginning;
436  bool IsAtEnd;
437  bool WasPreviousTokenFlowEntry;
438  Node *CurrentEntry;
439};
440
441/// @brief Represents an alias to a Node with an anchor.
442///
443/// Example:
444///   *AnchorName
445class AliasNode : public Node {
446public:
447  AliasNode(OwningPtr<Document> &D, StringRef Val)
448    : Node(NK_Alias, D, StringRef()), Name(Val) {}
449
450  StringRef getName() const { return Name; }
451  Node *getTarget();
452
453  static inline bool classof(const ScalarNode *) { return true; }
454  static inline bool classof(const Node *N) {
455    return N->getType() == NK_Alias;
456  }
457
458private:
459  StringRef Name;
460};
461
462/// @brief A YAML Stream is a sequence of Documents. A document contains a root
463///        node.
464class Document {
465public:
466  /// @brief Root for parsing a node. Returns a single node.
467  Node *parseBlockNode();
468
469  Document(Stream &ParentStream);
470
471  /// @brief Finish parsing the current document and return true if there are
472  ///        more. Return false otherwise.
473  bool skip();
474
475  /// @brief Parse and return the root level node.
476  Node *getRoot() {
477    if (Root)
478      return Root;
479    return Root = parseBlockNode();
480  }
481
482private:
483  friend class Node;
484  friend class document_iterator;
485
486  /// @brief Stream to read tokens from.
487  Stream &stream;
488
489  /// @brief Used to allocate nodes to. All are destroyed without calling their
490  ///        destructor when the document is destroyed.
491  BumpPtrAllocator NodeAllocator;
492
493  /// @brief The root node. Used to support skipping a partially parsed
494  ///        document.
495  Node *Root;
496
497  Token &peekNext();
498  Token getNext();
499  void setError(const Twine &Message, Token &Location) const;
500  bool failed() const;
501
502  void handleTagDirective(const Token &Tag) {
503    // TODO: Track tags.
504  }
505
506  /// @brief Parse %BLAH directives and return true if any were encountered.
507  bool parseDirectives();
508
509  /// @brief Consume the next token and error if it is not \a TK.
510  bool expectToken(int TK);
511};
512
513/// @brief Iterator abstraction for Documents over a Stream.
514class document_iterator {
515public:
516  document_iterator() : Doc(0) {}
517  document_iterator(OwningPtr<Document> &D) : Doc(&D) {}
518
519  bool operator ==(const document_iterator &Other) {
520    if (isAtEnd() || Other.isAtEnd())
521      return isAtEnd() && Other.isAtEnd();
522
523    return *Doc == *Other.Doc;
524  }
525  bool operator !=(const document_iterator &Other) {
526    return !(*this == Other);
527  }
528
529  document_iterator operator ++() {
530    assert(Doc != 0 && "incrementing iterator past the end.");
531    if (!(*Doc)->skip()) {
532      Doc->reset(0);
533    } else {
534      Stream &S = (*Doc)->stream;
535      Doc->reset(new Document(S));
536    }
537    return *this;
538  }
539
540  Document &operator *() {
541    return *Doc->get();
542  }
543
544  OwningPtr<Document> &operator ->() {
545    return *Doc;
546  }
547
548private:
549  bool isAtEnd() const {
550    return Doc == 0 || *Doc == 0;
551  }
552
553  OwningPtr<Document> *Doc;
554};
555
556}
557}
558
559#endif
560