YAMLParser.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
1//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file implements a YAML parser.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Support/YAMLParser.h"
15#include "llvm/ADT/SmallVector.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/Twine.h"
18#include "llvm/ADT/ilist.h"
19#include "llvm/ADT/ilist_node.h"
20#include "llvm/Support/ErrorHandling.h"
21#include "llvm/Support/MemoryBuffer.h"
22#include "llvm/Support/SourceMgr.h"
23#include "llvm/Support/raw_ostream.h"
24
25using namespace llvm;
26using namespace yaml;
27
28enum UnicodeEncodingForm {
29  UEF_UTF32_LE, ///< UTF-32 Little Endian
30  UEF_UTF32_BE, ///< UTF-32 Big Endian
31  UEF_UTF16_LE, ///< UTF-16 Little Endian
32  UEF_UTF16_BE, ///< UTF-16 Big Endian
33  UEF_UTF8,     ///< UTF-8 or ascii.
34  UEF_Unknown   ///< Not a valid Unicode encoding.
35};
36
37/// EncodingInfo - Holds the encoding type and length of the byte order mark if
38///                it exists. Length is in {0, 2, 3, 4}.
39typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
40
41/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
42///                      encoding form of \a Input.
43///
44/// @param Input A string of length 0 or more.
45/// @returns An EncodingInfo indicating the Unicode encoding form of the input
46///          and how long the byte order mark is if one exists.
47static EncodingInfo getUnicodeEncoding(StringRef Input) {
48  if (Input.size() == 0)
49    return std::make_pair(UEF_Unknown, 0);
50
51  switch (uint8_t(Input[0])) {
52  case 0x00:
53    if (Input.size() >= 4) {
54      if (  Input[1] == 0
55         && uint8_t(Input[2]) == 0xFE
56         && uint8_t(Input[3]) == 0xFF)
57        return std::make_pair(UEF_UTF32_BE, 4);
58      if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
59        return std::make_pair(UEF_UTF32_BE, 0);
60    }
61
62    if (Input.size() >= 2 && Input[1] != 0)
63      return std::make_pair(UEF_UTF16_BE, 0);
64    return std::make_pair(UEF_Unknown, 0);
65  case 0xFF:
66    if (  Input.size() >= 4
67       && uint8_t(Input[1]) == 0xFE
68       && Input[2] == 0
69       && Input[3] == 0)
70      return std::make_pair(UEF_UTF32_LE, 4);
71
72    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
73      return std::make_pair(UEF_UTF16_LE, 2);
74    return std::make_pair(UEF_Unknown, 0);
75  case 0xFE:
76    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
77      return std::make_pair(UEF_UTF16_BE, 2);
78    return std::make_pair(UEF_Unknown, 0);
79  case 0xEF:
80    if (  Input.size() >= 3
81       && uint8_t(Input[1]) == 0xBB
82       && uint8_t(Input[2]) == 0xBF)
83      return std::make_pair(UEF_UTF8, 3);
84    return std::make_pair(UEF_Unknown, 0);
85  }
86
87  // It could still be utf-32 or utf-16.
88  if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
89    return std::make_pair(UEF_UTF32_LE, 0);
90
91  if (Input.size() >= 2 && Input[1] == 0)
92    return std::make_pair(UEF_UTF16_LE, 0);
93
94  return std::make_pair(UEF_UTF8, 0);
95}
96
97namespace llvm {
98namespace yaml {
99/// Pin the vtables to this file.
100void Node::anchor() {}
101void NullNode::anchor() {}
102void ScalarNode::anchor() {}
103void KeyValueNode::anchor() {}
104void MappingNode::anchor() {}
105void SequenceNode::anchor() {}
106void AliasNode::anchor() {}
107
108/// Token - A single YAML token.
109struct Token : ilist_node<Token> {
110  enum TokenKind {
111    TK_Error, // Uninitialized token.
112    TK_StreamStart,
113    TK_StreamEnd,
114    TK_VersionDirective,
115    TK_TagDirective,
116    TK_DocumentStart,
117    TK_DocumentEnd,
118    TK_BlockEntry,
119    TK_BlockEnd,
120    TK_BlockSequenceStart,
121    TK_BlockMappingStart,
122    TK_FlowEntry,
123    TK_FlowSequenceStart,
124    TK_FlowSequenceEnd,
125    TK_FlowMappingStart,
126    TK_FlowMappingEnd,
127    TK_Key,
128    TK_Value,
129    TK_Scalar,
130    TK_Alias,
131    TK_Anchor,
132    TK_Tag
133  } Kind;
134
135  /// A string of length 0 or more whose begin() points to the logical location
136  /// of the token in the input.
137  StringRef Range;
138
139  Token() : Kind(TK_Error) {}
140};
141}
142}
143
144namespace llvm {
145template<>
146struct ilist_sentinel_traits<Token> {
147  Token *createSentinel() const {
148    return &Sentinel;
149  }
150  static void destroySentinel(Token*) {}
151
152  Token *provideInitialHead() const { return createSentinel(); }
153  Token *ensureHead(Token*) const { return createSentinel(); }
154  static void noteHead(Token*, Token*) {}
155
156private:
157  mutable Token Sentinel;
158};
159
160template<>
161struct ilist_node_traits<Token> {
162  Token *createNode(const Token &V) {
163    return new (Alloc.Allocate<Token>()) Token(V);
164  }
165  static void deleteNode(Token *V) {}
166
167  void addNodeToList(Token *) {}
168  void removeNodeFromList(Token *) {}
169  void transferNodesFromList(ilist_node_traits &    /*SrcTraits*/,
170                             ilist_iterator<Token> /*first*/,
171                             ilist_iterator<Token> /*last*/) {}
172
173  BumpPtrAllocator Alloc;
174};
175}
176
177typedef ilist<Token> TokenQueueT;
178
179namespace {
180/// @brief This struct is used to track simple keys.
181///
182/// Simple keys are handled by creating an entry in SimpleKeys for each Token
183/// which could legally be the start of a simple key. When peekNext is called,
184/// if the Token To be returned is referenced by a SimpleKey, we continue
185/// tokenizing until that potential simple key has either been found to not be
186/// a simple key (we moved on to the next line or went further than 1024 chars).
187/// Or when we run into a Value, and then insert a Key token (and possibly
188/// others) before the SimpleKey's Tok.
189struct SimpleKey {
190  TokenQueueT::iterator Tok;
191  unsigned Column;
192  unsigned Line;
193  unsigned FlowLevel;
194  bool IsRequired;
195
196  bool operator ==(const SimpleKey &Other) {
197    return Tok == Other.Tok;
198  }
199};
200}
201
202/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
203///        subsequence and the subsequence's length in code units (uint8_t).
204///        A length of 0 represents an error.
205typedef std::pair<uint32_t, unsigned> UTF8Decoded;
206
207static UTF8Decoded decodeUTF8(StringRef Range) {
208  StringRef::iterator Position= Range.begin();
209  StringRef::iterator End = Range.end();
210  // 1 byte: [0x00, 0x7f]
211  // Bit pattern: 0xxxxxxx
212  if ((*Position & 0x80) == 0) {
213     return std::make_pair(*Position, 1);
214  }
215  // 2 bytes: [0x80, 0x7ff]
216  // Bit pattern: 110xxxxx 10xxxxxx
217  if (Position + 1 != End &&
218      ((*Position & 0xE0) == 0xC0) &&
219      ((*(Position + 1) & 0xC0) == 0x80)) {
220    uint32_t codepoint = ((*Position & 0x1F) << 6) |
221                          (*(Position + 1) & 0x3F);
222    if (codepoint >= 0x80)
223      return std::make_pair(codepoint, 2);
224  }
225  // 3 bytes: [0x8000, 0xffff]
226  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
227  if (Position + 2 != End &&
228      ((*Position & 0xF0) == 0xE0) &&
229      ((*(Position + 1) & 0xC0) == 0x80) &&
230      ((*(Position + 2) & 0xC0) == 0x80)) {
231    uint32_t codepoint = ((*Position & 0x0F) << 12) |
232                         ((*(Position + 1) & 0x3F) << 6) |
233                          (*(Position + 2) & 0x3F);
234    // Codepoints between 0xD800 and 0xDFFF are invalid, as
235    // they are high / low surrogate halves used by UTF-16.
236    if (codepoint >= 0x800 &&
237        (codepoint < 0xD800 || codepoint > 0xDFFF))
238      return std::make_pair(codepoint, 3);
239  }
240  // 4 bytes: [0x10000, 0x10FFFF]
241  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
242  if (Position + 3 != End &&
243      ((*Position & 0xF8) == 0xF0) &&
244      ((*(Position + 1) & 0xC0) == 0x80) &&
245      ((*(Position + 2) & 0xC0) == 0x80) &&
246      ((*(Position + 3) & 0xC0) == 0x80)) {
247    uint32_t codepoint = ((*Position & 0x07) << 18) |
248                         ((*(Position + 1) & 0x3F) << 12) |
249                         ((*(Position + 2) & 0x3F) << 6) |
250                          (*(Position + 3) & 0x3F);
251    if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
252      return std::make_pair(codepoint, 4);
253  }
254  return std::make_pair(0, 0);
255}
256
257namespace llvm {
258namespace yaml {
259/// @brief Scans YAML tokens from a MemoryBuffer.
260class Scanner {
261public:
262  Scanner(const StringRef Input, SourceMgr &SM);
263  Scanner(MemoryBuffer *Buffer, SourceMgr &SM_);
264
265  /// @brief Parse the next token and return it without popping it.
266  Token &peekNext();
267
268  /// @brief Parse the next token and pop it from the queue.
269  Token getNext();
270
271  void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
272                  ArrayRef<SMRange> Ranges = None) {
273    SM.PrintMessage(Loc, Kind, Message, Ranges);
274  }
275
276  void setError(const Twine &Message, StringRef::iterator Position) {
277    if (Current >= End)
278      Current = End - 1;
279
280    // Don't print out more errors after the first one we encounter. The rest
281    // are just the result of the first, and have no meaning.
282    if (!Failed)
283      printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
284    Failed = true;
285  }
286
287  void setError(const Twine &Message) {
288    setError(Message, Current);
289  }
290
291  /// @brief Returns true if an error occurred while parsing.
292  bool failed() {
293    return Failed;
294  }
295
296private:
297  StringRef currentInput() {
298    return StringRef(Current, End - Current);
299  }
300
301  /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
302  ///        at \a Position.
303  ///
304  /// If the UTF-8 code units starting at Position do not form a well-formed
305  /// code unit subsequence, then the Unicode scalar value is 0, and the length
306  /// is 0.
307  UTF8Decoded decodeUTF8(StringRef::iterator Position) {
308    return ::decodeUTF8(StringRef(Position, End - Position));
309  }
310
311  // The following functions are based on the gramar rules in the YAML spec. The
312  // style of the function names it meant to closely match how they are written
313  // in the spec. The number within the [] is the number of the grammar rule in
314  // the spec.
315  //
316  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
317  //
318  // c-
319  //   A production starting and ending with a special character.
320  // b-
321  //   A production matching a single line break.
322  // nb-
323  //   A production starting and ending with a non-break character.
324  // s-
325  //   A production starting and ending with a white space character.
326  // ns-
327  //   A production starting and ending with a non-space character.
328  // l-
329  //   A production matching complete line(s).
330
331  /// @brief Skip a single nb-char[27] starting at Position.
332  ///
333  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
334  ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
335  ///
336  /// @returns The code unit after the nb-char, or Position if it's not an
337  ///          nb-char.
338  StringRef::iterator skip_nb_char(StringRef::iterator Position);
339
340  /// @brief Skip a single b-break[28] starting at Position.
341  ///
342  /// A b-break is 0xD 0xA | 0xD | 0xA
343  ///
344  /// @returns The code unit after the b-break, or Position if it's not a
345  ///          b-break.
346  StringRef::iterator skip_b_break(StringRef::iterator Position);
347
348  /// @brief Skip a single s-white[33] starting at Position.
349  ///
350  /// A s-white is 0x20 | 0x9
351  ///
352  /// @returns The code unit after the s-white, or Position if it's not a
353  ///          s-white.
354  StringRef::iterator skip_s_white(StringRef::iterator Position);
355
356  /// @brief Skip a single ns-char[34] starting at Position.
357  ///
358  /// A ns-char is nb-char - s-white
359  ///
360  /// @returns The code unit after the ns-char, or Position if it's not a
361  ///          ns-char.
362  StringRef::iterator skip_ns_char(StringRef::iterator Position);
363
364  typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
365  /// @brief Skip minimal well-formed code unit subsequences until Func
366  ///        returns its input.
367  ///
368  /// @returns The code unit after the last minimal well-formed code unit
369  ///          subsequence that Func accepted.
370  StringRef::iterator skip_while( SkipWhileFunc Func
371                                , StringRef::iterator Position);
372
373  /// @brief Scan ns-uri-char[39]s starting at Cur.
374  ///
375  /// This updates Cur and Column while scanning.
376  ///
377  /// @returns A StringRef starting at Cur which covers the longest contiguous
378  ///          sequence of ns-uri-char.
379  StringRef scan_ns_uri_char();
380
381  /// @brief Consume a minimal well-formed code unit subsequence starting at
382  ///        \a Cur. Return false if it is not the same Unicode scalar value as
383  ///        \a Expected. This updates \a Column.
384  bool consume(uint32_t Expected);
385
386  /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
387  void skip(uint32_t Distance);
388
389  /// @brief Return true if the minimal well-formed code unit subsequence at
390  ///        Pos is whitespace or a new line
391  bool isBlankOrBreak(StringRef::iterator Position);
392
393  /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
394  void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
395                             , unsigned AtColumn
396                             , bool IsRequired);
397
398  /// @brief Remove simple keys that can no longer be valid simple keys.
399  ///
400  /// Invalid simple keys are not on the current line or are further than 1024
401  /// columns back.
402  void removeStaleSimpleKeyCandidates();
403
404  /// @brief Remove all simple keys on FlowLevel \a Level.
405  void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
406
407  /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
408  ///        tokens if needed.
409  bool unrollIndent(int ToColumn);
410
411  /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
412  ///        if needed.
413  bool rollIndent( int ToColumn
414                 , Token::TokenKind Kind
415                 , TokenQueueT::iterator InsertPoint);
416
417  /// @brief Skip whitespace and comments until the start of the next token.
418  void scanToNextToken();
419
420  /// @brief Must be the first token generated.
421  bool scanStreamStart();
422
423  /// @brief Generate tokens needed to close out the stream.
424  bool scanStreamEnd();
425
426  /// @brief Scan a %BLAH directive.
427  bool scanDirective();
428
429  /// @brief Scan a ... or ---.
430  bool scanDocumentIndicator(bool IsStart);
431
432  /// @brief Scan a [ or { and generate the proper flow collection start token.
433  bool scanFlowCollectionStart(bool IsSequence);
434
435  /// @brief Scan a ] or } and generate the proper flow collection end token.
436  bool scanFlowCollectionEnd(bool IsSequence);
437
438  /// @brief Scan the , that separates entries in a flow collection.
439  bool scanFlowEntry();
440
441  /// @brief Scan the - that starts block sequence entries.
442  bool scanBlockEntry();
443
444  /// @brief Scan an explicit ? indicating a key.
445  bool scanKey();
446
447  /// @brief Scan an explicit : indicating a value.
448  bool scanValue();
449
450  /// @brief Scan a quoted scalar.
451  bool scanFlowScalar(bool IsDoubleQuoted);
452
453  /// @brief Scan an unquoted scalar.
454  bool scanPlainScalar();
455
456  /// @brief Scan an Alias or Anchor starting with * or &.
457  bool scanAliasOrAnchor(bool IsAlias);
458
459  /// @brief Scan a block scalar starting with | or >.
460  bool scanBlockScalar(bool IsLiteral);
461
462  /// @brief Scan a tag of the form !stuff.
463  bool scanTag();
464
465  /// @brief Dispatch to the next scanning function based on \a *Cur.
466  bool fetchMoreTokens();
467
468  /// @brief The SourceMgr used for diagnostics and buffer management.
469  SourceMgr &SM;
470
471  /// @brief The original input.
472  MemoryBuffer *InputBuffer;
473
474  /// @brief The current position of the scanner.
475  StringRef::iterator Current;
476
477  /// @brief The end of the input (one past the last character).
478  StringRef::iterator End;
479
480  /// @brief Current YAML indentation level in spaces.
481  int Indent;
482
483  /// @brief Current column number in Unicode code points.
484  unsigned Column;
485
486  /// @brief Current line number.
487  unsigned Line;
488
489  /// @brief How deep we are in flow style containers. 0 Means at block level.
490  unsigned FlowLevel;
491
492  /// @brief Are we at the start of the stream?
493  bool IsStartOfStream;
494
495  /// @brief Can the next token be the start of a simple key?
496  bool IsSimpleKeyAllowed;
497
498  /// @brief True if an error has occurred.
499  bool Failed;
500
501  /// @brief Queue of tokens. This is required to queue up tokens while looking
502  ///        for the end of a simple key. And for cases where a single character
503  ///        can produce multiple tokens (e.g. BlockEnd).
504  TokenQueueT TokenQueue;
505
506  /// @brief Indentation levels.
507  SmallVector<int, 4> Indents;
508
509  /// @brief Potential simple keys.
510  SmallVector<SimpleKey, 4> SimpleKeys;
511};
512
513} // end namespace yaml
514} // end namespace llvm
515
516/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
517static void encodeUTF8( uint32_t UnicodeScalarValue
518                      , SmallVectorImpl<char> &Result) {
519  if (UnicodeScalarValue <= 0x7F) {
520    Result.push_back(UnicodeScalarValue & 0x7F);
521  } else if (UnicodeScalarValue <= 0x7FF) {
522    uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
523    uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
524    Result.push_back(FirstByte);
525    Result.push_back(SecondByte);
526  } else if (UnicodeScalarValue <= 0xFFFF) {
527    uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
528    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
529    uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
530    Result.push_back(FirstByte);
531    Result.push_back(SecondByte);
532    Result.push_back(ThirdByte);
533  } else if (UnicodeScalarValue <= 0x10FFFF) {
534    uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
535    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
536    uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
537    uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
538    Result.push_back(FirstByte);
539    Result.push_back(SecondByte);
540    Result.push_back(ThirdByte);
541    Result.push_back(FourthByte);
542  }
543}
544
545bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
546  SourceMgr SM;
547  Scanner scanner(Input, SM);
548  while (true) {
549    Token T = scanner.getNext();
550    switch (T.Kind) {
551    case Token::TK_StreamStart:
552      OS << "Stream-Start: ";
553      break;
554    case Token::TK_StreamEnd:
555      OS << "Stream-End: ";
556      break;
557    case Token::TK_VersionDirective:
558      OS << "Version-Directive: ";
559      break;
560    case Token::TK_TagDirective:
561      OS << "Tag-Directive: ";
562      break;
563    case Token::TK_DocumentStart:
564      OS << "Document-Start: ";
565      break;
566    case Token::TK_DocumentEnd:
567      OS << "Document-End: ";
568      break;
569    case Token::TK_BlockEntry:
570      OS << "Block-Entry: ";
571      break;
572    case Token::TK_BlockEnd:
573      OS << "Block-End: ";
574      break;
575    case Token::TK_BlockSequenceStart:
576      OS << "Block-Sequence-Start: ";
577      break;
578    case Token::TK_BlockMappingStart:
579      OS << "Block-Mapping-Start: ";
580      break;
581    case Token::TK_FlowEntry:
582      OS << "Flow-Entry: ";
583      break;
584    case Token::TK_FlowSequenceStart:
585      OS << "Flow-Sequence-Start: ";
586      break;
587    case Token::TK_FlowSequenceEnd:
588      OS << "Flow-Sequence-End: ";
589      break;
590    case Token::TK_FlowMappingStart:
591      OS << "Flow-Mapping-Start: ";
592      break;
593    case Token::TK_FlowMappingEnd:
594      OS << "Flow-Mapping-End: ";
595      break;
596    case Token::TK_Key:
597      OS << "Key: ";
598      break;
599    case Token::TK_Value:
600      OS << "Value: ";
601      break;
602    case Token::TK_Scalar:
603      OS << "Scalar: ";
604      break;
605    case Token::TK_Alias:
606      OS << "Alias: ";
607      break;
608    case Token::TK_Anchor:
609      OS << "Anchor: ";
610      break;
611    case Token::TK_Tag:
612      OS << "Tag: ";
613      break;
614    case Token::TK_Error:
615      break;
616    }
617    OS << T.Range << "\n";
618    if (T.Kind == Token::TK_StreamEnd)
619      break;
620    else if (T.Kind == Token::TK_Error)
621      return false;
622  }
623  return true;
624}
625
626bool yaml::scanTokens(StringRef Input) {
627  llvm::SourceMgr SM;
628  llvm::yaml::Scanner scanner(Input, SM);
629  for (;;) {
630    llvm::yaml::Token T = scanner.getNext();
631    if (T.Kind == Token::TK_StreamEnd)
632      break;
633    else if (T.Kind == Token::TK_Error)
634      return false;
635  }
636  return true;
637}
638
639std::string yaml::escape(StringRef Input) {
640  std::string EscapedInput;
641  for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
642    if (*i == '\\')
643      EscapedInput += "\\\\";
644    else if (*i == '"')
645      EscapedInput += "\\\"";
646    else if (*i == 0)
647      EscapedInput += "\\0";
648    else if (*i == 0x07)
649      EscapedInput += "\\a";
650    else if (*i == 0x08)
651      EscapedInput += "\\b";
652    else if (*i == 0x09)
653      EscapedInput += "\\t";
654    else if (*i == 0x0A)
655      EscapedInput += "\\n";
656    else if (*i == 0x0B)
657      EscapedInput += "\\v";
658    else if (*i == 0x0C)
659      EscapedInput += "\\f";
660    else if (*i == 0x0D)
661      EscapedInput += "\\r";
662    else if (*i == 0x1B)
663      EscapedInput += "\\e";
664    else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
665      std::string HexStr = utohexstr(*i);
666      EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
667    } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
668      UTF8Decoded UnicodeScalarValue
669        = decodeUTF8(StringRef(i, Input.end() - i));
670      if (UnicodeScalarValue.second == 0) {
671        // Found invalid char.
672        SmallString<4> Val;
673        encodeUTF8(0xFFFD, Val);
674        EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
675        // FIXME: Error reporting.
676        return EscapedInput;
677      }
678      if (UnicodeScalarValue.first == 0x85)
679        EscapedInput += "\\N";
680      else if (UnicodeScalarValue.first == 0xA0)
681        EscapedInput += "\\_";
682      else if (UnicodeScalarValue.first == 0x2028)
683        EscapedInput += "\\L";
684      else if (UnicodeScalarValue.first == 0x2029)
685        EscapedInput += "\\P";
686      else {
687        std::string HexStr = utohexstr(UnicodeScalarValue.first);
688        if (HexStr.size() <= 2)
689          EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
690        else if (HexStr.size() <= 4)
691          EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
692        else if (HexStr.size() <= 8)
693          EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
694      }
695      i += UnicodeScalarValue.second - 1;
696    } else
697      EscapedInput.push_back(*i);
698  }
699  return EscapedInput;
700}
701
702Scanner::Scanner(StringRef Input, SourceMgr &sm)
703  : SM(sm)
704  , Indent(-1)
705  , Column(0)
706  , Line(0)
707  , FlowLevel(0)
708  , IsStartOfStream(true)
709  , IsSimpleKeyAllowed(true)
710  , Failed(false) {
711  InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
712  SM.AddNewSourceBuffer(InputBuffer, SMLoc());
713  Current = InputBuffer->getBufferStart();
714  End = InputBuffer->getBufferEnd();
715}
716
717Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_)
718  : SM(SM_)
719  , InputBuffer(Buffer)
720  , Current(InputBuffer->getBufferStart())
721  , End(InputBuffer->getBufferEnd())
722  , Indent(-1)
723  , Column(0)
724  , Line(0)
725  , FlowLevel(0)
726  , IsStartOfStream(true)
727  , IsSimpleKeyAllowed(true)
728  , Failed(false) {
729    SM.AddNewSourceBuffer(InputBuffer, SMLoc());
730}
731
732Token &Scanner::peekNext() {
733  // If the current token is a possible simple key, keep parsing until we
734  // can confirm.
735  bool NeedMore = false;
736  while (true) {
737    if (TokenQueue.empty() || NeedMore) {
738      if (!fetchMoreTokens()) {
739        TokenQueue.clear();
740        TokenQueue.push_back(Token());
741        return TokenQueue.front();
742      }
743    }
744    assert(!TokenQueue.empty() &&
745            "fetchMoreTokens lied about getting tokens!");
746
747    removeStaleSimpleKeyCandidates();
748    SimpleKey SK;
749    SK.Tok = TokenQueue.front();
750    if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
751        == SimpleKeys.end())
752      break;
753    else
754      NeedMore = true;
755  }
756  return TokenQueue.front();
757}
758
759Token Scanner::getNext() {
760  Token Ret = peekNext();
761  // TokenQueue can be empty if there was an error getting the next token.
762  if (!TokenQueue.empty())
763    TokenQueue.pop_front();
764
765  // There cannot be any referenced Token's if the TokenQueue is empty. So do a
766  // quick deallocation of them all.
767  if (TokenQueue.empty()) {
768    TokenQueue.Alloc.Reset();
769  }
770
771  return Ret;
772}
773
774StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
775  if (Position == End)
776    return Position;
777  // Check 7 bit c-printable - b-char.
778  if (   *Position == 0x09
779      || (*Position >= 0x20 && *Position <= 0x7E))
780    return Position + 1;
781
782  // Check for valid UTF-8.
783  if (uint8_t(*Position) & 0x80) {
784    UTF8Decoded u8d = decodeUTF8(Position);
785    if (   u8d.second != 0
786        && u8d.first != 0xFEFF
787        && ( u8d.first == 0x85
788          || ( u8d.first >= 0xA0
789            && u8d.first <= 0xD7FF)
790          || ( u8d.first >= 0xE000
791            && u8d.first <= 0xFFFD)
792          || ( u8d.first >= 0x10000
793            && u8d.first <= 0x10FFFF)))
794      return Position + u8d.second;
795  }
796  return Position;
797}
798
799StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
800  if (Position == End)
801    return Position;
802  if (*Position == 0x0D) {
803    if (Position + 1 != End && *(Position + 1) == 0x0A)
804      return Position + 2;
805    return Position + 1;
806  }
807
808  if (*Position == 0x0A)
809    return Position + 1;
810  return Position;
811}
812
813
814StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
815  if (Position == End)
816    return Position;
817  if (*Position == ' ' || *Position == '\t')
818    return Position + 1;
819  return Position;
820}
821
822StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
823  if (Position == End)
824    return Position;
825  if (*Position == ' ' || *Position == '\t')
826    return Position;
827  return skip_nb_char(Position);
828}
829
830StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
831                                       , StringRef::iterator Position) {
832  while (true) {
833    StringRef::iterator i = (this->*Func)(Position);
834    if (i == Position)
835      break;
836    Position = i;
837  }
838  return Position;
839}
840
841static bool is_ns_hex_digit(const char C) {
842  return    (C >= '0' && C <= '9')
843         || (C >= 'a' && C <= 'z')
844         || (C >= 'A' && C <= 'Z');
845}
846
847static bool is_ns_word_char(const char C) {
848  return    C == '-'
849         || (C >= 'a' && C <= 'z')
850         || (C >= 'A' && C <= 'Z');
851}
852
853StringRef Scanner::scan_ns_uri_char() {
854  StringRef::iterator Start = Current;
855  while (true) {
856    if (Current == End)
857      break;
858    if ((   *Current == '%'
859          && Current + 2 < End
860          && is_ns_hex_digit(*(Current + 1))
861          && is_ns_hex_digit(*(Current + 2)))
862        || is_ns_word_char(*Current)
863        || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
864          != StringRef::npos) {
865      ++Current;
866      ++Column;
867    } else
868      break;
869  }
870  return StringRef(Start, Current - Start);
871}
872
873bool Scanner::consume(uint32_t Expected) {
874  if (Expected >= 0x80)
875    report_fatal_error("Not dealing with this yet");
876  if (Current == End)
877    return false;
878  if (uint8_t(*Current) >= 0x80)
879    report_fatal_error("Not dealing with this yet");
880  if (uint8_t(*Current) == Expected) {
881    ++Current;
882    ++Column;
883    return true;
884  }
885  return false;
886}
887
888void Scanner::skip(uint32_t Distance) {
889  Current += Distance;
890  Column += Distance;
891  assert(Current <= End && "Skipped past the end");
892}
893
894bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
895  if (Position == End)
896    return false;
897  if (   *Position == ' ' || *Position == '\t'
898      || *Position == '\r' || *Position == '\n')
899    return true;
900  return false;
901}
902
903void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
904                                    , unsigned AtColumn
905                                    , bool IsRequired) {
906  if (IsSimpleKeyAllowed) {
907    SimpleKey SK;
908    SK.Tok = Tok;
909    SK.Line = Line;
910    SK.Column = AtColumn;
911    SK.IsRequired = IsRequired;
912    SK.FlowLevel = FlowLevel;
913    SimpleKeys.push_back(SK);
914  }
915}
916
917void Scanner::removeStaleSimpleKeyCandidates() {
918  for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
919                                            i != SimpleKeys.end();) {
920    if (i->Line != Line || i->Column + 1024 < Column) {
921      if (i->IsRequired)
922        setError( "Could not find expected : for simple key"
923                , i->Tok->Range.begin());
924      i = SimpleKeys.erase(i);
925    } else
926      ++i;
927  }
928}
929
930void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
931  if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
932    SimpleKeys.pop_back();
933}
934
935bool Scanner::unrollIndent(int ToColumn) {
936  Token T;
937  // Indentation is ignored in flow.
938  if (FlowLevel != 0)
939    return true;
940
941  while (Indent > ToColumn) {
942    T.Kind = Token::TK_BlockEnd;
943    T.Range = StringRef(Current, 1);
944    TokenQueue.push_back(T);
945    Indent = Indents.pop_back_val();
946  }
947
948  return true;
949}
950
951bool Scanner::rollIndent( int ToColumn
952                        , Token::TokenKind Kind
953                        , TokenQueueT::iterator InsertPoint) {
954  if (FlowLevel)
955    return true;
956  if (Indent < ToColumn) {
957    Indents.push_back(Indent);
958    Indent = ToColumn;
959
960    Token T;
961    T.Kind = Kind;
962    T.Range = StringRef(Current, 0);
963    TokenQueue.insert(InsertPoint, T);
964  }
965  return true;
966}
967
968void Scanner::scanToNextToken() {
969  while (true) {
970    while (*Current == ' ' || *Current == '\t') {
971      skip(1);
972    }
973
974    // Skip comment.
975    if (*Current == '#') {
976      while (true) {
977        // This may skip more than one byte, thus Column is only incremented
978        // for code points.
979        StringRef::iterator i = skip_nb_char(Current);
980        if (i == Current)
981          break;
982        Current = i;
983        ++Column;
984      }
985    }
986
987    // Skip EOL.
988    StringRef::iterator i = skip_b_break(Current);
989    if (i == Current)
990      break;
991    Current = i;
992    ++Line;
993    Column = 0;
994    // New lines may start a simple key.
995    if (!FlowLevel)
996      IsSimpleKeyAllowed = true;
997  }
998}
999
1000bool Scanner::scanStreamStart() {
1001  IsStartOfStream = false;
1002
1003  EncodingInfo EI = getUnicodeEncoding(currentInput());
1004
1005  Token T;
1006  T.Kind = Token::TK_StreamStart;
1007  T.Range = StringRef(Current, EI.second);
1008  TokenQueue.push_back(T);
1009  Current += EI.second;
1010  return true;
1011}
1012
1013bool Scanner::scanStreamEnd() {
1014  // Force an ending new line if one isn't present.
1015  if (Column != 0) {
1016    Column = 0;
1017    ++Line;
1018  }
1019
1020  unrollIndent(-1);
1021  SimpleKeys.clear();
1022  IsSimpleKeyAllowed = false;
1023
1024  Token T;
1025  T.Kind = Token::TK_StreamEnd;
1026  T.Range = StringRef(Current, 0);
1027  TokenQueue.push_back(T);
1028  return true;
1029}
1030
1031bool Scanner::scanDirective() {
1032  // Reset the indentation level.
1033  unrollIndent(-1);
1034  SimpleKeys.clear();
1035  IsSimpleKeyAllowed = false;
1036
1037  StringRef::iterator Start = Current;
1038  consume('%');
1039  StringRef::iterator NameStart = Current;
1040  Current = skip_while(&Scanner::skip_ns_char, Current);
1041  StringRef Name(NameStart, Current - NameStart);
1042  Current = skip_while(&Scanner::skip_s_white, Current);
1043
1044  Token T;
1045  if (Name == "YAML") {
1046    Current = skip_while(&Scanner::skip_ns_char, Current);
1047    T.Kind = Token::TK_VersionDirective;
1048    T.Range = StringRef(Start, Current - Start);
1049    TokenQueue.push_back(T);
1050    return true;
1051  } else if(Name == "TAG") {
1052    Current = skip_while(&Scanner::skip_ns_char, Current);
1053    Current = skip_while(&Scanner::skip_s_white, Current);
1054    Current = skip_while(&Scanner::skip_ns_char, Current);
1055    T.Kind = Token::TK_TagDirective;
1056    T.Range = StringRef(Start, Current - Start);
1057    TokenQueue.push_back(T);
1058    return true;
1059  }
1060  return false;
1061}
1062
1063bool Scanner::scanDocumentIndicator(bool IsStart) {
1064  unrollIndent(-1);
1065  SimpleKeys.clear();
1066  IsSimpleKeyAllowed = false;
1067
1068  Token T;
1069  T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
1070  T.Range = StringRef(Current, 3);
1071  skip(3);
1072  TokenQueue.push_back(T);
1073  return true;
1074}
1075
1076bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1077  Token T;
1078  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1079                      : Token::TK_FlowMappingStart;
1080  T.Range = StringRef(Current, 1);
1081  skip(1);
1082  TokenQueue.push_back(T);
1083
1084  // [ and { may begin a simple key.
1085  saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false);
1086
1087  // And may also be followed by a simple key.
1088  IsSimpleKeyAllowed = true;
1089  ++FlowLevel;
1090  return true;
1091}
1092
1093bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1094  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1095  IsSimpleKeyAllowed = false;
1096  Token T;
1097  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1098                      : Token::TK_FlowMappingEnd;
1099  T.Range = StringRef(Current, 1);
1100  skip(1);
1101  TokenQueue.push_back(T);
1102  if (FlowLevel)
1103    --FlowLevel;
1104  return true;
1105}
1106
1107bool Scanner::scanFlowEntry() {
1108  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1109  IsSimpleKeyAllowed = true;
1110  Token T;
1111  T.Kind = Token::TK_FlowEntry;
1112  T.Range = StringRef(Current, 1);
1113  skip(1);
1114  TokenQueue.push_back(T);
1115  return true;
1116}
1117
1118bool Scanner::scanBlockEntry() {
1119  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1120  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1121  IsSimpleKeyAllowed = true;
1122  Token T;
1123  T.Kind = Token::TK_BlockEntry;
1124  T.Range = StringRef(Current, 1);
1125  skip(1);
1126  TokenQueue.push_back(T);
1127  return true;
1128}
1129
1130bool Scanner::scanKey() {
1131  if (!FlowLevel)
1132    rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1133
1134  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1135  IsSimpleKeyAllowed = !FlowLevel;
1136
1137  Token T;
1138  T.Kind = Token::TK_Key;
1139  T.Range = StringRef(Current, 1);
1140  skip(1);
1141  TokenQueue.push_back(T);
1142  return true;
1143}
1144
1145bool Scanner::scanValue() {
1146  // If the previous token could have been a simple key, insert the key token
1147  // into the token queue.
1148  if (!SimpleKeys.empty()) {
1149    SimpleKey SK = SimpleKeys.pop_back_val();
1150    Token T;
1151    T.Kind = Token::TK_Key;
1152    T.Range = SK.Tok->Range;
1153    TokenQueueT::iterator i, e;
1154    for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1155      if (i == SK.Tok)
1156        break;
1157    }
1158    assert(i != e && "SimpleKey not in token queue!");
1159    i = TokenQueue.insert(i, T);
1160
1161    // We may also need to add a Block-Mapping-Start token.
1162    rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1163
1164    IsSimpleKeyAllowed = false;
1165  } else {
1166    if (!FlowLevel)
1167      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1168    IsSimpleKeyAllowed = !FlowLevel;
1169  }
1170
1171  Token T;
1172  T.Kind = Token::TK_Value;
1173  T.Range = StringRef(Current, 1);
1174  skip(1);
1175  TokenQueue.push_back(T);
1176  return true;
1177}
1178
1179// Forbidding inlining improves performance by roughly 20%.
1180// FIXME: Remove once llvm optimizes this to the faster version without hints.
1181LLVM_ATTRIBUTE_NOINLINE static bool
1182wasEscaped(StringRef::iterator First, StringRef::iterator Position);
1183
1184// Returns whether a character at 'Position' was escaped with a leading '\'.
1185// 'First' specifies the position of the first character in the string.
1186static bool wasEscaped(StringRef::iterator First,
1187                       StringRef::iterator Position) {
1188  assert(Position - 1 >= First);
1189  StringRef::iterator I = Position - 1;
1190  // We calculate the number of consecutive '\'s before the current position
1191  // by iterating backwards through our string.
1192  while (I >= First && *I == '\\') --I;
1193  // (Position - 1 - I) now contains the number of '\'s before the current
1194  // position. If it is odd, the character at 'Position' was escaped.
1195  return (Position - 1 - I) % 2 == 1;
1196}
1197
1198bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1199  StringRef::iterator Start = Current;
1200  unsigned ColStart = Column;
1201  if (IsDoubleQuoted) {
1202    do {
1203      ++Current;
1204      while (Current != End && *Current != '"')
1205        ++Current;
1206      // Repeat until the previous character was not a '\' or was an escaped
1207      // backslash.
1208    } while (   Current != End
1209             && *(Current - 1) == '\\'
1210             && wasEscaped(Start + 1, Current));
1211  } else {
1212    skip(1);
1213    while (true) {
1214      // Skip a ' followed by another '.
1215      if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1216        skip(2);
1217        continue;
1218      } else if (*Current == '\'')
1219        break;
1220      StringRef::iterator i = skip_nb_char(Current);
1221      if (i == Current) {
1222        i = skip_b_break(Current);
1223        if (i == Current)
1224          break;
1225        Current = i;
1226        Column = 0;
1227        ++Line;
1228      } else {
1229        if (i == End)
1230          break;
1231        Current = i;
1232        ++Column;
1233      }
1234    }
1235  }
1236
1237  if (Current == End) {
1238    setError("Expected quote at end of scalar", Current);
1239    return false;
1240  }
1241
1242  skip(1); // Skip ending quote.
1243  Token T;
1244  T.Kind = Token::TK_Scalar;
1245  T.Range = StringRef(Start, Current - Start);
1246  TokenQueue.push_back(T);
1247
1248  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1249
1250  IsSimpleKeyAllowed = false;
1251
1252  return true;
1253}
1254
1255bool Scanner::scanPlainScalar() {
1256  StringRef::iterator Start = Current;
1257  unsigned ColStart = Column;
1258  unsigned LeadingBlanks = 0;
1259  assert(Indent >= -1 && "Indent must be >= -1 !");
1260  unsigned indent = static_cast<unsigned>(Indent + 1);
1261  while (true) {
1262    if (*Current == '#')
1263      break;
1264
1265    while (!isBlankOrBreak(Current)) {
1266      if (  FlowLevel && *Current == ':'
1267          && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
1268        setError("Found unexpected ':' while scanning a plain scalar", Current);
1269        return false;
1270      }
1271
1272      // Check for the end of the plain scalar.
1273      if (  (*Current == ':' && isBlankOrBreak(Current + 1))
1274          || (  FlowLevel
1275          && (StringRef(Current, 1).find_first_of(",:?[]{}")
1276              != StringRef::npos)))
1277        break;
1278
1279      StringRef::iterator i = skip_nb_char(Current);
1280      if (i == Current)
1281        break;
1282      Current = i;
1283      ++Column;
1284    }
1285
1286    // Are we at the end?
1287    if (!isBlankOrBreak(Current))
1288      break;
1289
1290    // Eat blanks.
1291    StringRef::iterator Tmp = Current;
1292    while (isBlankOrBreak(Tmp)) {
1293      StringRef::iterator i = skip_s_white(Tmp);
1294      if (i != Tmp) {
1295        if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1296          setError("Found invalid tab character in indentation", Tmp);
1297          return false;
1298        }
1299        Tmp = i;
1300        ++Column;
1301      } else {
1302        i = skip_b_break(Tmp);
1303        if (!LeadingBlanks)
1304          LeadingBlanks = 1;
1305        Tmp = i;
1306        Column = 0;
1307        ++Line;
1308      }
1309    }
1310
1311    if (!FlowLevel && Column < indent)
1312      break;
1313
1314    Current = Tmp;
1315  }
1316  if (Start == Current) {
1317    setError("Got empty plain scalar", Start);
1318    return false;
1319  }
1320  Token T;
1321  T.Kind = Token::TK_Scalar;
1322  T.Range = StringRef(Start, Current - Start);
1323  TokenQueue.push_back(T);
1324
1325  // Plain scalars can be simple keys.
1326  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1327
1328  IsSimpleKeyAllowed = false;
1329
1330  return true;
1331}
1332
1333bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1334  StringRef::iterator Start = Current;
1335  unsigned ColStart = Column;
1336  skip(1);
1337  while(true) {
1338    if (   *Current == '[' || *Current == ']'
1339        || *Current == '{' || *Current == '}'
1340        || *Current == ','
1341        || *Current == ':')
1342      break;
1343    StringRef::iterator i = skip_ns_char(Current);
1344    if (i == Current)
1345      break;
1346    Current = i;
1347    ++Column;
1348  }
1349
1350  if (Start == Current) {
1351    setError("Got empty alias or anchor", Start);
1352    return false;
1353  }
1354
1355  Token T;
1356  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1357  T.Range = StringRef(Start, Current - Start);
1358  TokenQueue.push_back(T);
1359
1360  // Alias and anchors can be simple keys.
1361  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1362
1363  IsSimpleKeyAllowed = false;
1364
1365  return true;
1366}
1367
1368bool Scanner::scanBlockScalar(bool IsLiteral) {
1369  StringRef::iterator Start = Current;
1370  skip(1); // Eat | or >
1371  while(true) {
1372    StringRef::iterator i = skip_nb_char(Current);
1373    if (i == Current) {
1374      if (Column == 0)
1375        break;
1376      i = skip_b_break(Current);
1377      if (i != Current) {
1378        // We got a line break.
1379        Column = 0;
1380        ++Line;
1381        Current = i;
1382        continue;
1383      } else {
1384        // There was an error, which should already have been printed out.
1385        return false;
1386      }
1387    }
1388    Current = i;
1389    ++Column;
1390  }
1391
1392  if (Start == Current) {
1393    setError("Got empty block scalar", Start);
1394    return false;
1395  }
1396
1397  Token T;
1398  T.Kind = Token::TK_Scalar;
1399  T.Range = StringRef(Start, Current - Start);
1400  TokenQueue.push_back(T);
1401  return true;
1402}
1403
1404bool Scanner::scanTag() {
1405  StringRef::iterator Start = Current;
1406  unsigned ColStart = Column;
1407  skip(1); // Eat !.
1408  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1409  else if (*Current == '<') {
1410    skip(1);
1411    scan_ns_uri_char();
1412    if (!consume('>'))
1413      return false;
1414  } else {
1415    // FIXME: Actually parse the c-ns-shorthand-tag rule.
1416    Current = skip_while(&Scanner::skip_ns_char, Current);
1417  }
1418
1419  Token T;
1420  T.Kind = Token::TK_Tag;
1421  T.Range = StringRef(Start, Current - Start);
1422  TokenQueue.push_back(T);
1423
1424  // Tags can be simple keys.
1425  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1426
1427  IsSimpleKeyAllowed = false;
1428
1429  return true;
1430}
1431
1432bool Scanner::fetchMoreTokens() {
1433  if (IsStartOfStream)
1434    return scanStreamStart();
1435
1436  scanToNextToken();
1437
1438  if (Current == End)
1439    return scanStreamEnd();
1440
1441  removeStaleSimpleKeyCandidates();
1442
1443  unrollIndent(Column);
1444
1445  if (Column == 0 && *Current == '%')
1446    return scanDirective();
1447
1448  if (Column == 0 && Current + 4 <= End
1449      && *Current == '-'
1450      && *(Current + 1) == '-'
1451      && *(Current + 2) == '-'
1452      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1453    return scanDocumentIndicator(true);
1454
1455  if (Column == 0 && Current + 4 <= End
1456      && *Current == '.'
1457      && *(Current + 1) == '.'
1458      && *(Current + 2) == '.'
1459      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1460    return scanDocumentIndicator(false);
1461
1462  if (*Current == '[')
1463    return scanFlowCollectionStart(true);
1464
1465  if (*Current == '{')
1466    return scanFlowCollectionStart(false);
1467
1468  if (*Current == ']')
1469    return scanFlowCollectionEnd(true);
1470
1471  if (*Current == '}')
1472    return scanFlowCollectionEnd(false);
1473
1474  if (*Current == ',')
1475    return scanFlowEntry();
1476
1477  if (*Current == '-' && isBlankOrBreak(Current + 1))
1478    return scanBlockEntry();
1479
1480  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1481    return scanKey();
1482
1483  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1484    return scanValue();
1485
1486  if (*Current == '*')
1487    return scanAliasOrAnchor(true);
1488
1489  if (*Current == '&')
1490    return scanAliasOrAnchor(false);
1491
1492  if (*Current == '!')
1493    return scanTag();
1494
1495  if (*Current == '|' && !FlowLevel)
1496    return scanBlockScalar(true);
1497
1498  if (*Current == '>' && !FlowLevel)
1499    return scanBlockScalar(false);
1500
1501  if (*Current == '\'')
1502    return scanFlowScalar(false);
1503
1504  if (*Current == '"')
1505    return scanFlowScalar(true);
1506
1507  // Get a plain scalar.
1508  StringRef FirstChar(Current, 1);
1509  if (!(isBlankOrBreak(Current)
1510        || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1511      || (*Current == '-' && !isBlankOrBreak(Current + 1))
1512      || (!FlowLevel && (*Current == '?' || *Current == ':')
1513          && isBlankOrBreak(Current + 1))
1514      || (!FlowLevel && *Current == ':'
1515                      && Current + 2 < End
1516                      && *(Current + 1) == ':'
1517                      && !isBlankOrBreak(Current + 2)))
1518    return scanPlainScalar();
1519
1520  setError("Unrecognized character while tokenizing.");
1521  return false;
1522}
1523
1524Stream::Stream(StringRef Input, SourceMgr &SM)
1525    : scanner(new Scanner(Input, SM)), CurrentDoc() {}
1526
1527Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM)
1528    : scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {}
1529
1530Stream::~Stream() {}
1531
1532bool Stream::failed() { return scanner->failed(); }
1533
1534void Stream::printError(Node *N, const Twine &Msg) {
1535  SmallVector<SMRange, 1> Ranges;
1536  Ranges.push_back(N->getSourceRange());
1537  scanner->printError( N->getSourceRange().Start
1538                     , SourceMgr::DK_Error
1539                     , Msg
1540                     , Ranges);
1541}
1542
1543document_iterator Stream::begin() {
1544  if (CurrentDoc)
1545    report_fatal_error("Can only iterate over the stream once");
1546
1547  // Skip Stream-Start.
1548  scanner->getNext();
1549
1550  CurrentDoc.reset(new Document(*this));
1551  return document_iterator(CurrentDoc);
1552}
1553
1554document_iterator Stream::end() {
1555  return document_iterator();
1556}
1557
1558void Stream::skip() {
1559  for (document_iterator i = begin(), e = end(); i != e; ++i)
1560    i->skip();
1561}
1562
1563Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
1564           StringRef T)
1565    : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
1566  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1567  SourceRange = SMRange(Start, Start);
1568}
1569
1570std::string Node::getVerbatimTag() const {
1571  StringRef Raw = getRawTag();
1572  if (!Raw.empty() && Raw != "!") {
1573    std::string Ret;
1574    if (Raw.find_last_of('!') == 0) {
1575      Ret = Doc->getTagMap().find("!")->second;
1576      Ret += Raw.substr(1);
1577      return std::move(Ret);
1578    } else if (Raw.startswith("!!")) {
1579      Ret = Doc->getTagMap().find("!!")->second;
1580      Ret += Raw.substr(2);
1581      return std::move(Ret);
1582    } else {
1583      StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1584      std::map<StringRef, StringRef>::const_iterator It =
1585          Doc->getTagMap().find(TagHandle);
1586      if (It != Doc->getTagMap().end())
1587        Ret = It->second;
1588      else {
1589        Token T;
1590        T.Kind = Token::TK_Tag;
1591        T.Range = TagHandle;
1592        setError(Twine("Unknown tag handle ") + TagHandle, T);
1593      }
1594      Ret += Raw.substr(Raw.find_last_of('!') + 1);
1595      return std::move(Ret);
1596    }
1597  }
1598
1599  switch (getType()) {
1600  case NK_Null:
1601    return "tag:yaml.org,2002:null";
1602  case NK_Scalar:
1603    // TODO: Tag resolution.
1604    return "tag:yaml.org,2002:str";
1605  case NK_Mapping:
1606    return "tag:yaml.org,2002:map";
1607  case NK_Sequence:
1608    return "tag:yaml.org,2002:seq";
1609  }
1610
1611  return "";
1612}
1613
1614Token &Node::peekNext() {
1615  return Doc->peekNext();
1616}
1617
1618Token Node::getNext() {
1619  return Doc->getNext();
1620}
1621
1622Node *Node::parseBlockNode() {
1623  return Doc->parseBlockNode();
1624}
1625
1626BumpPtrAllocator &Node::getAllocator() {
1627  return Doc->NodeAllocator;
1628}
1629
1630void Node::setError(const Twine &Msg, Token &Tok) const {
1631  Doc->setError(Msg, Tok);
1632}
1633
1634bool Node::failed() const {
1635  return Doc->failed();
1636}
1637
1638
1639
1640StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
1641  // TODO: Handle newlines properly. We need to remove leading whitespace.
1642  if (Value[0] == '"') { // Double quoted.
1643    // Pull off the leading and trailing "s.
1644    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1645    // Search for characters that would require unescaping the value.
1646    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
1647    if (i != StringRef::npos)
1648      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
1649    return UnquotedValue;
1650  } else if (Value[0] == '\'') { // Single quoted.
1651    // Pull off the leading and trailing 's.
1652    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1653    StringRef::size_type i = UnquotedValue.find('\'');
1654    if (i != StringRef::npos) {
1655      // We're going to need Storage.
1656      Storage.clear();
1657      Storage.reserve(UnquotedValue.size());
1658      for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
1659        StringRef Valid(UnquotedValue.begin(), i);
1660        Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1661        Storage.push_back('\'');
1662        UnquotedValue = UnquotedValue.substr(i + 2);
1663      }
1664      Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1665      return StringRef(Storage.begin(), Storage.size());
1666    }
1667    return UnquotedValue;
1668  }
1669  // Plain or block.
1670  return Value.rtrim(" ");
1671}
1672
1673StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
1674                                          , StringRef::size_type i
1675                                          , SmallVectorImpl<char> &Storage)
1676                                          const {
1677  // Use Storage to build proper value.
1678  Storage.clear();
1679  Storage.reserve(UnquotedValue.size());
1680  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
1681    // Insert all previous chars into Storage.
1682    StringRef Valid(UnquotedValue.begin(), i);
1683    Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1684    // Chop off inserted chars.
1685    UnquotedValue = UnquotedValue.substr(i);
1686
1687    assert(!UnquotedValue.empty() && "Can't be empty!");
1688
1689    // Parse escape or line break.
1690    switch (UnquotedValue[0]) {
1691    case '\r':
1692    case '\n':
1693      Storage.push_back('\n');
1694      if (   UnquotedValue.size() > 1
1695          && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1696        UnquotedValue = UnquotedValue.substr(1);
1697      UnquotedValue = UnquotedValue.substr(1);
1698      break;
1699    default:
1700      if (UnquotedValue.size() == 1)
1701        // TODO: Report error.
1702        break;
1703      UnquotedValue = UnquotedValue.substr(1);
1704      switch (UnquotedValue[0]) {
1705      default: {
1706          Token T;
1707          T.Range = StringRef(UnquotedValue.begin(), 1);
1708          setError("Unrecognized escape code!", T);
1709          return "";
1710        }
1711      case '\r':
1712      case '\n':
1713        // Remove the new line.
1714        if (   UnquotedValue.size() > 1
1715            && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1716          UnquotedValue = UnquotedValue.substr(1);
1717        // If this was just a single byte newline, it will get skipped
1718        // below.
1719        break;
1720      case '0':
1721        Storage.push_back(0x00);
1722        break;
1723      case 'a':
1724        Storage.push_back(0x07);
1725        break;
1726      case 'b':
1727        Storage.push_back(0x08);
1728        break;
1729      case 't':
1730      case 0x09:
1731        Storage.push_back(0x09);
1732        break;
1733      case 'n':
1734        Storage.push_back(0x0A);
1735        break;
1736      case 'v':
1737        Storage.push_back(0x0B);
1738        break;
1739      case 'f':
1740        Storage.push_back(0x0C);
1741        break;
1742      case 'r':
1743        Storage.push_back(0x0D);
1744        break;
1745      case 'e':
1746        Storage.push_back(0x1B);
1747        break;
1748      case ' ':
1749        Storage.push_back(0x20);
1750        break;
1751      case '"':
1752        Storage.push_back(0x22);
1753        break;
1754      case '/':
1755        Storage.push_back(0x2F);
1756        break;
1757      case '\\':
1758        Storage.push_back(0x5C);
1759        break;
1760      case 'N':
1761        encodeUTF8(0x85, Storage);
1762        break;
1763      case '_':
1764        encodeUTF8(0xA0, Storage);
1765        break;
1766      case 'L':
1767        encodeUTF8(0x2028, Storage);
1768        break;
1769      case 'P':
1770        encodeUTF8(0x2029, Storage);
1771        break;
1772      case 'x': {
1773          if (UnquotedValue.size() < 3)
1774            // TODO: Report error.
1775            break;
1776          unsigned int UnicodeScalarValue;
1777          if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
1778            // TODO: Report error.
1779            UnicodeScalarValue = 0xFFFD;
1780          encodeUTF8(UnicodeScalarValue, Storage);
1781          UnquotedValue = UnquotedValue.substr(2);
1782          break;
1783        }
1784      case 'u': {
1785          if (UnquotedValue.size() < 5)
1786            // TODO: Report error.
1787            break;
1788          unsigned int UnicodeScalarValue;
1789          if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
1790            // TODO: Report error.
1791            UnicodeScalarValue = 0xFFFD;
1792          encodeUTF8(UnicodeScalarValue, Storage);
1793          UnquotedValue = UnquotedValue.substr(4);
1794          break;
1795        }
1796      case 'U': {
1797          if (UnquotedValue.size() < 9)
1798            // TODO: Report error.
1799            break;
1800          unsigned int UnicodeScalarValue;
1801          if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
1802            // TODO: Report error.
1803            UnicodeScalarValue = 0xFFFD;
1804          encodeUTF8(UnicodeScalarValue, Storage);
1805          UnquotedValue = UnquotedValue.substr(8);
1806          break;
1807        }
1808      }
1809      UnquotedValue = UnquotedValue.substr(1);
1810    }
1811  }
1812  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1813  return StringRef(Storage.begin(), Storage.size());
1814}
1815
1816Node *KeyValueNode::getKey() {
1817  if (Key)
1818    return Key;
1819  // Handle implicit null keys.
1820  {
1821    Token &t = peekNext();
1822    if (   t.Kind == Token::TK_BlockEnd
1823        || t.Kind == Token::TK_Value
1824        || t.Kind == Token::TK_Error) {
1825      return Key = new (getAllocator()) NullNode(Doc);
1826    }
1827    if (t.Kind == Token::TK_Key)
1828      getNext(); // skip TK_Key.
1829  }
1830
1831  // Handle explicit null keys.
1832  Token &t = peekNext();
1833  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
1834    return Key = new (getAllocator()) NullNode(Doc);
1835  }
1836
1837  // We've got a normal key.
1838  return Key = parseBlockNode();
1839}
1840
1841Node *KeyValueNode::getValue() {
1842  if (Value)
1843    return Value;
1844  getKey()->skip();
1845  if (failed())
1846    return Value = new (getAllocator()) NullNode(Doc);
1847
1848  // Handle implicit null values.
1849  {
1850    Token &t = peekNext();
1851    if (   t.Kind == Token::TK_BlockEnd
1852        || t.Kind == Token::TK_FlowMappingEnd
1853        || t.Kind == Token::TK_Key
1854        || t.Kind == Token::TK_FlowEntry
1855        || t.Kind == Token::TK_Error) {
1856      return Value = new (getAllocator()) NullNode(Doc);
1857    }
1858
1859    if (t.Kind != Token::TK_Value) {
1860      setError("Unexpected token in Key Value.", t);
1861      return Value = new (getAllocator()) NullNode(Doc);
1862    }
1863    getNext(); // skip TK_Value.
1864  }
1865
1866  // Handle explicit null values.
1867  Token &t = peekNext();
1868  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
1869    return Value = new (getAllocator()) NullNode(Doc);
1870  }
1871
1872  // We got a normal value.
1873  return Value = parseBlockNode();
1874}
1875
1876void MappingNode::increment() {
1877  if (failed()) {
1878    IsAtEnd = true;
1879    CurrentEntry = 0;
1880    return;
1881  }
1882  if (CurrentEntry) {
1883    CurrentEntry->skip();
1884    if (Type == MT_Inline) {
1885      IsAtEnd = true;
1886      CurrentEntry = 0;
1887      return;
1888    }
1889  }
1890  Token T = peekNext();
1891  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
1892    // KeyValueNode eats the TK_Key. That way it can detect null keys.
1893    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
1894  } else if (Type == MT_Block) {
1895    switch (T.Kind) {
1896    case Token::TK_BlockEnd:
1897      getNext();
1898      IsAtEnd = true;
1899      CurrentEntry = 0;
1900      break;
1901    default:
1902      setError("Unexpected token. Expected Key or Block End", T);
1903    case Token::TK_Error:
1904      IsAtEnd = true;
1905      CurrentEntry = 0;
1906    }
1907  } else {
1908    switch (T.Kind) {
1909    case Token::TK_FlowEntry:
1910      // Eat the flow entry and recurse.
1911      getNext();
1912      return increment();
1913    case Token::TK_FlowMappingEnd:
1914      getNext();
1915    case Token::TK_Error:
1916      // Set this to end iterator.
1917      IsAtEnd = true;
1918      CurrentEntry = 0;
1919      break;
1920    default:
1921      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
1922                "Mapping End."
1923              , T);
1924      IsAtEnd = true;
1925      CurrentEntry = 0;
1926    }
1927  }
1928}
1929
1930void SequenceNode::increment() {
1931  if (failed()) {
1932    IsAtEnd = true;
1933    CurrentEntry = 0;
1934    return;
1935  }
1936  if (CurrentEntry)
1937    CurrentEntry->skip();
1938  Token T = peekNext();
1939  if (SeqType == ST_Block) {
1940    switch (T.Kind) {
1941    case Token::TK_BlockEntry:
1942      getNext();
1943      CurrentEntry = parseBlockNode();
1944      if (CurrentEntry == 0) { // An error occurred.
1945        IsAtEnd = true;
1946        CurrentEntry = 0;
1947      }
1948      break;
1949    case Token::TK_BlockEnd:
1950      getNext();
1951      IsAtEnd = true;
1952      CurrentEntry = 0;
1953      break;
1954    default:
1955      setError( "Unexpected token. Expected Block Entry or Block End."
1956              , T);
1957    case Token::TK_Error:
1958      IsAtEnd = true;
1959      CurrentEntry = 0;
1960    }
1961  } else if (SeqType == ST_Indentless) {
1962    switch (T.Kind) {
1963    case Token::TK_BlockEntry:
1964      getNext();
1965      CurrentEntry = parseBlockNode();
1966      if (CurrentEntry == 0) { // An error occurred.
1967        IsAtEnd = true;
1968        CurrentEntry = 0;
1969      }
1970      break;
1971    default:
1972    case Token::TK_Error:
1973      IsAtEnd = true;
1974      CurrentEntry = 0;
1975    }
1976  } else if (SeqType == ST_Flow) {
1977    switch (T.Kind) {
1978    case Token::TK_FlowEntry:
1979      // Eat the flow entry and recurse.
1980      getNext();
1981      WasPreviousTokenFlowEntry = true;
1982      return increment();
1983    case Token::TK_FlowSequenceEnd:
1984      getNext();
1985    case Token::TK_Error:
1986      // Set this to end iterator.
1987      IsAtEnd = true;
1988      CurrentEntry = 0;
1989      break;
1990    case Token::TK_StreamEnd:
1991    case Token::TK_DocumentEnd:
1992    case Token::TK_DocumentStart:
1993      setError("Could not find closing ]!", T);
1994      // Set this to end iterator.
1995      IsAtEnd = true;
1996      CurrentEntry = 0;
1997      break;
1998    default:
1999      if (!WasPreviousTokenFlowEntry) {
2000        setError("Expected , between entries!", T);
2001        IsAtEnd = true;
2002        CurrentEntry = 0;
2003        break;
2004      }
2005      // Otherwise it must be a flow entry.
2006      CurrentEntry = parseBlockNode();
2007      if (!CurrentEntry) {
2008        IsAtEnd = true;
2009      }
2010      WasPreviousTokenFlowEntry = false;
2011      break;
2012    }
2013  }
2014}
2015
2016Document::Document(Stream &S) : stream(S), Root(0) {
2017  // Tag maps starts with two default mappings.
2018  TagMap["!"] = "!";
2019  TagMap["!!"] = "tag:yaml.org,2002:";
2020
2021  if (parseDirectives())
2022    expectToken(Token::TK_DocumentStart);
2023  Token &T = peekNext();
2024  if (T.Kind == Token::TK_DocumentStart)
2025    getNext();
2026}
2027
2028bool Document::skip()  {
2029  if (stream.scanner->failed())
2030    return false;
2031  if (!Root)
2032    getRoot();
2033  Root->skip();
2034  Token &T = peekNext();
2035  if (T.Kind == Token::TK_StreamEnd)
2036    return false;
2037  if (T.Kind == Token::TK_DocumentEnd) {
2038    getNext();
2039    return skip();
2040  }
2041  return true;
2042}
2043
2044Token &Document::peekNext() {
2045  return stream.scanner->peekNext();
2046}
2047
2048Token Document::getNext() {
2049  return stream.scanner->getNext();
2050}
2051
2052void Document::setError(const Twine &Message, Token &Location) const {
2053  stream.scanner->setError(Message, Location.Range.begin());
2054}
2055
2056bool Document::failed() const {
2057  return stream.scanner->failed();
2058}
2059
2060Node *Document::parseBlockNode() {
2061  Token T = peekNext();
2062  // Handle properties.
2063  Token AnchorInfo;
2064  Token TagInfo;
2065parse_property:
2066  switch (T.Kind) {
2067  case Token::TK_Alias:
2068    getNext();
2069    return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2070  case Token::TK_Anchor:
2071    if (AnchorInfo.Kind == Token::TK_Anchor) {
2072      setError("Already encountered an anchor for this node!", T);
2073      return 0;
2074    }
2075    AnchorInfo = getNext(); // Consume TK_Anchor.
2076    T = peekNext();
2077    goto parse_property;
2078  case Token::TK_Tag:
2079    if (TagInfo.Kind == Token::TK_Tag) {
2080      setError("Already encountered a tag for this node!", T);
2081      return 0;
2082    }
2083    TagInfo = getNext(); // Consume TK_Tag.
2084    T = peekNext();
2085    goto parse_property;
2086  default:
2087    break;
2088  }
2089
2090  switch (T.Kind) {
2091  case Token::TK_BlockEntry:
2092    // We got an unindented BlockEntry sequence. This is not terminated with
2093    // a BlockEnd.
2094    // Don't eat the TK_BlockEntry, SequenceNode needs it.
2095    return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2096                                           , AnchorInfo.Range.substr(1)
2097                                           , TagInfo.Range
2098                                           , SequenceNode::ST_Indentless);
2099  case Token::TK_BlockSequenceStart:
2100    getNext();
2101    return new (NodeAllocator)
2102      SequenceNode( stream.CurrentDoc
2103                  , AnchorInfo.Range.substr(1)
2104                  , TagInfo.Range
2105                  , SequenceNode::ST_Block);
2106  case Token::TK_BlockMappingStart:
2107    getNext();
2108    return new (NodeAllocator)
2109      MappingNode( stream.CurrentDoc
2110                 , AnchorInfo.Range.substr(1)
2111                 , TagInfo.Range
2112                 , MappingNode::MT_Block);
2113  case Token::TK_FlowSequenceStart:
2114    getNext();
2115    return new (NodeAllocator)
2116      SequenceNode( stream.CurrentDoc
2117                  , AnchorInfo.Range.substr(1)
2118                  , TagInfo.Range
2119                  , SequenceNode::ST_Flow);
2120  case Token::TK_FlowMappingStart:
2121    getNext();
2122    return new (NodeAllocator)
2123      MappingNode( stream.CurrentDoc
2124                 , AnchorInfo.Range.substr(1)
2125                 , TagInfo.Range
2126                 , MappingNode::MT_Flow);
2127  case Token::TK_Scalar:
2128    getNext();
2129    return new (NodeAllocator)
2130      ScalarNode( stream.CurrentDoc
2131                , AnchorInfo.Range.substr(1)
2132                , TagInfo.Range
2133                , T.Range);
2134  case Token::TK_Key:
2135    // Don't eat the TK_Key, KeyValueNode expects it.
2136    return new (NodeAllocator)
2137      MappingNode( stream.CurrentDoc
2138                 , AnchorInfo.Range.substr(1)
2139                 , TagInfo.Range
2140                 , MappingNode::MT_Inline);
2141  case Token::TK_DocumentStart:
2142  case Token::TK_DocumentEnd:
2143  case Token::TK_StreamEnd:
2144  default:
2145    // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2146    //       !!null null.
2147    return new (NodeAllocator) NullNode(stream.CurrentDoc);
2148  case Token::TK_Error:
2149    return 0;
2150  }
2151  llvm_unreachable("Control flow shouldn't reach here.");
2152  return 0;
2153}
2154
2155bool Document::parseDirectives() {
2156  bool isDirective = false;
2157  while (true) {
2158    Token T = peekNext();
2159    if (T.Kind == Token::TK_TagDirective) {
2160      parseTAGDirective();
2161      isDirective = true;
2162    } else if (T.Kind == Token::TK_VersionDirective) {
2163      parseYAMLDirective();
2164      isDirective = true;
2165    } else
2166      break;
2167  }
2168  return isDirective;
2169}
2170
2171void Document::parseYAMLDirective() {
2172  getNext(); // Eat %YAML <version>
2173}
2174
2175void Document::parseTAGDirective() {
2176  Token Tag = getNext(); // %TAG <handle> <prefix>
2177  StringRef T = Tag.Range;
2178  // Strip %TAG
2179  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2180  std::size_t HandleEnd = T.find_first_of(" \t");
2181  StringRef TagHandle = T.substr(0, HandleEnd);
2182  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2183  TagMap[TagHandle] = TagPrefix;
2184}
2185
2186bool Document::expectToken(int TK) {
2187  Token T = getNext();
2188  if (T.Kind != TK) {
2189    setError("Unexpected token", T);
2190    return false;
2191  }
2192  return true;
2193}
2194