json_reader.cpp revision f59fb0e83fd0a4b41700d3f5eebdc8d21b173c2e
1// Copyright 2007-2011 Baptiste Lepilleur
2// Distributed under MIT license, or public domain if desired and
3// recognized in your jurisdiction.
4// See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE
5
6#if !defined(JSON_IS_AMALGAMATION)
7# include <json/assertions.h>
8# include <json/reader.h>
9# include <json/value.h>
10# include "json_tool.h"
11#endif // if !defined(JSON_IS_AMALGAMATION)
12#include <utility>
13#include <cstdio>
14#include <cassert>
15#include <cstring>
16#include <stdexcept>
17
18#if _MSC_VER >= 1400 // VC++ 8.0
19#pragma warning( disable : 4996 )   // disable warning about strdup being deprecated.
20#endif
21
22namespace Json {
23
24// Implementation of class Features
25// ////////////////////////////////
26
27Features::Features()
28   : allowComments_( true )
29   , strictRoot_( false )
30{
31}
32
33
34Features
35Features::all()
36{
37   return Features();
38}
39
40
41Features
42Features::strictMode()
43{
44   Features features;
45   features.allowComments_ = false;
46   features.strictRoot_ = true;
47   return features;
48}
49
50// Implementation of class Reader
51// ////////////////////////////////
52
53
54static inline bool
55in( Reader::Char c, Reader::Char c1, Reader::Char c2, Reader::Char c3, Reader::Char c4 )
56{
57   return c == c1  ||  c == c2  ||  c == c3  ||  c == c4;
58}
59
60static inline bool
61in( Reader::Char c, Reader::Char c1, Reader::Char c2, Reader::Char c3, Reader::Char c4, Reader::Char c5 )
62{
63   return c == c1  ||  c == c2  ||  c == c3  ||  c == c4  ||  c == c5;
64}
65
66
67static bool
68containsNewLine( Reader::Location begin,
69                 Reader::Location end )
70{
71   for ( ;begin < end; ++begin )
72      if ( *begin == '\n'  ||  *begin == '\r' )
73         return true;
74   return false;
75}
76
77
78// Class Reader
79// //////////////////////////////////////////////////////////////////
80
81Reader::Reader()
82    : errors_(),
83      document_(),
84      begin_(),
85      end_(),
86      current_(),
87      lastValueEnd_(),
88      lastValue_(),
89      commentsBefore_(),
90      features_( Features::all() ),
91      collectComments_()
92{
93}
94
95
96Reader::Reader( const Features &features )
97    : errors_(),
98      document_(),
99      begin_(),
100      end_(),
101      current_(),
102      lastValueEnd_(),
103      lastValue_(),
104      commentsBefore_(),
105      features_( features ),
106      collectComments_()
107{
108}
109
110
111bool
112Reader::parse( const std::string &document,
113               Value &root,
114               bool collectComments )
115{
116   document_ = document;
117   const char *begin = document_.c_str();
118   const char *end = begin + document_.length();
119   return parse( begin, end, root, collectComments );
120}
121
122
123bool
124Reader::parse( std::istream& sin,
125               Value &root,
126               bool collectComments )
127{
128   //std::istream_iterator<char> begin(sin);
129   //std::istream_iterator<char> end;
130   // Those would allow streamed input from a file, if parse() were a
131   // template function.
132
133   // Since std::string is reference-counted, this at least does not
134   // create an extra copy.
135   std::string doc;
136   std::getline(sin, doc, (char)EOF);
137   return parse( doc, root, collectComments );
138}
139
140bool
141Reader::parse( const char *beginDoc, const char *endDoc,
142               Value &root,
143               bool collectComments )
144{
145   if ( !features_.allowComments_ )
146   {
147      collectComments = false;
148   }
149
150   begin_ = beginDoc;
151   end_ = endDoc;
152   collectComments_ = collectComments;
153   current_ = begin_;
154   lastValueEnd_ = 0;
155   lastValue_ = 0;
156   commentsBefore_ = "";
157   errors_.clear();
158   while ( !nodes_.empty() )
159      nodes_.pop();
160   nodes_.push( &root );
161
162   bool successful = readValue();
163   Token token;
164   skipCommentTokens( token );
165   if ( collectComments_  &&  !commentsBefore_.empty() )
166      root.setComment( commentsBefore_, commentAfter );
167   if ( features_.strictRoot_ )
168   {
169      if ( !root.isArray()  &&  !root.isObject() )
170      {
171         // Set error location to start of doc, ideally should be first token found in doc
172         token.type_ = tokenError;
173         token.start_ = beginDoc;
174         token.end_ = endDoc;
175         addError( "A valid JSON document must be either an array or an object value.",
176                   token );
177         return false;
178      }
179   }
180   return successful;
181}
182
183
184bool
185Reader::readValue()
186{
187   Token token;
188   skipCommentTokens( token );
189   bool successful = true;
190
191   if ( collectComments_  &&  !commentsBefore_.empty() )
192   {
193      currentValue().setComment( commentsBefore_, commentBefore );
194      commentsBefore_ = "";
195   }
196
197
198   switch ( token.type_ )
199   {
200   case tokenObjectBegin:
201      successful = readObject( token );
202      break;
203   case tokenArrayBegin:
204      successful = readArray( token );
205      break;
206   case tokenNumber:
207      successful = decodeNumber( token );
208      break;
209   case tokenString:
210      successful = decodeString( token );
211      break;
212   case tokenTrue:
213      currentValue() = true;
214      break;
215   case tokenFalse:
216      currentValue() = false;
217      break;
218   case tokenNull:
219      currentValue() = Value();
220      break;
221   default:
222      return addError( "Syntax error: value, object or array expected.", token );
223   }
224
225   if ( collectComments_ )
226   {
227      lastValueEnd_ = current_;
228      lastValue_ = &currentValue();
229   }
230
231   return successful;
232}
233
234
235void
236Reader::skipCommentTokens( Token &token )
237{
238   if ( features_.allowComments_ )
239   {
240      do
241      {
242         readToken( token );
243      }
244      while ( token.type_ == tokenComment );
245   }
246   else
247   {
248      readToken( token );
249   }
250}
251
252
253bool
254Reader::expectToken( TokenType type, Token &token, const char *message )
255{
256   readToken( token );
257   if ( token.type_ != type )
258      return addError( message, token );
259   return true;
260}
261
262
263bool
264Reader::readToken( Token &token )
265{
266   skipSpaces();
267   token.start_ = current_;
268   Char c = getNextChar();
269   bool ok = true;
270   switch ( c )
271   {
272   case '{':
273      token.type_ = tokenObjectBegin;
274      break;
275   case '}':
276      token.type_ = tokenObjectEnd;
277      break;
278   case '[':
279      token.type_ = tokenArrayBegin;
280      break;
281   case ']':
282      token.type_ = tokenArrayEnd;
283      break;
284   case '"':
285      token.type_ = tokenString;
286      ok = readString();
287      break;
288   case '/':
289      token.type_ = tokenComment;
290      ok = readComment();
291      break;
292   case '0':
293   case '1':
294   case '2':
295   case '3':
296   case '4':
297   case '5':
298   case '6':
299   case '7':
300   case '8':
301   case '9':
302   case '-':
303      token.type_ = tokenNumber;
304      readNumber();
305      break;
306   case 't':
307      token.type_ = tokenTrue;
308      ok = match( "rue", 3 );
309      break;
310   case 'f':
311      token.type_ = tokenFalse;
312      ok = match( "alse", 4 );
313      break;
314   case 'n':
315      token.type_ = tokenNull;
316      ok = match( "ull", 3 );
317      break;
318   case ',':
319      token.type_ = tokenArraySeparator;
320      break;
321   case ':':
322      token.type_ = tokenMemberSeparator;
323      break;
324   case 0:
325      token.type_ = tokenEndOfStream;
326      break;
327   default:
328      ok = false;
329      break;
330   }
331   if ( !ok )
332      token.type_ = tokenError;
333   token.end_ = current_;
334   return true;
335}
336
337
338void
339Reader::skipSpaces()
340{
341   while ( current_ != end_ )
342   {
343      Char c = *current_;
344      if ( c == ' '  ||  c == '\t'  ||  c == '\r'  ||  c == '\n' )
345         ++current_;
346      else
347         break;
348   }
349}
350
351
352bool
353Reader::match( Location pattern,
354               int patternLength )
355{
356   if ( end_ - current_ < patternLength )
357      return false;
358   int index = patternLength;
359   while ( index-- )
360      if ( current_[index] != pattern[index] )
361         return false;
362   current_ += patternLength;
363   return true;
364}
365
366
367bool
368Reader::readComment()
369{
370   Location commentBegin = current_ - 1;
371   Char c = getNextChar();
372   bool successful = false;
373   if ( c == '*' )
374      successful = readCStyleComment();
375   else if ( c == '/' )
376      successful = readCppStyleComment();
377   if ( !successful )
378      return false;
379
380   if ( collectComments_ )
381   {
382      CommentPlacement placement = commentBefore;
383      if ( lastValueEnd_  &&  !containsNewLine( lastValueEnd_, commentBegin ) )
384      {
385         if ( c != '*'  ||  !containsNewLine( commentBegin, current_ ) )
386            placement = commentAfterOnSameLine;
387      }
388
389      addComment( commentBegin, current_, placement );
390   }
391   return true;
392}
393
394
395void
396Reader::addComment( Location begin,
397                    Location end,
398                    CommentPlacement placement )
399{
400   assert( collectComments_ );
401   if ( placement == commentAfterOnSameLine )
402   {
403      assert( lastValue_ != 0 );
404      lastValue_->setComment( std::string( begin, end ), placement );
405   }
406   else
407   {
408      if ( !commentsBefore_.empty() )
409         commentsBefore_ += "\n";
410      commentsBefore_ += std::string( begin, end );
411   }
412}
413
414
415bool
416Reader::readCStyleComment()
417{
418   while ( current_ != end_ )
419   {
420      Char c = getNextChar();
421      if ( c == '*'  &&  *current_ == '/' )
422         break;
423   }
424   return getNextChar() == '/';
425}
426
427
428bool
429Reader::readCppStyleComment()
430{
431   while ( current_ != end_ )
432   {
433      Char c = getNextChar();
434      if (  c == '\r'  ||  c == '\n' )
435         break;
436   }
437   return true;
438}
439
440
441void
442Reader::readNumber()
443{
444   while ( current_ != end_ )
445   {
446      if ( !(*current_ >= '0'  &&  *current_ <= '9')  &&
447           !in( *current_, '.', 'e', 'E', '+', '-' ) )
448         break;
449      ++current_;
450   }
451}
452
453bool
454Reader::readString()
455{
456   Char c = 0;
457   while ( current_ != end_ )
458   {
459      c = getNextChar();
460      if ( c == '\\' )
461         getNextChar();
462      else if ( c == '"' )
463         break;
464   }
465   return c == '"';
466}
467
468
469bool
470Reader::readObject( Token &/*tokenStart*/ )
471{
472   Token tokenName;
473   std::string name;
474   currentValue() = Value( objectValue );
475   while ( readToken( tokenName ) )
476   {
477      bool initialTokenOk = true;
478      while ( tokenName.type_ == tokenComment  &&  initialTokenOk )
479         initialTokenOk = readToken( tokenName );
480      if  ( !initialTokenOk )
481         break;
482      if ( tokenName.type_ == tokenObjectEnd  &&  name.empty() )  // empty object
483         return true;
484      if ( tokenName.type_ != tokenString )
485         break;
486
487      name = "";
488      if ( !decodeString( tokenName, name ) )
489         return recoverFromError( tokenObjectEnd );
490
491      Token colon;
492      if ( !readToken( colon ) ||  colon.type_ != tokenMemberSeparator )
493      {
494         return addErrorAndRecover( "Missing ':' after object member name",
495                                    colon,
496                                    tokenObjectEnd );
497      }
498      Value &value = currentValue()[ name ];
499      nodes_.push( &value );
500      bool ok = readValue();
501      nodes_.pop();
502      if ( !ok ) // error already set
503         return recoverFromError( tokenObjectEnd );
504
505      Token comma;
506      if ( !readToken( comma )
507            ||  ( comma.type_ != tokenObjectEnd  &&
508                  comma.type_ != tokenArraySeparator &&
509                  comma.type_ != tokenComment ) )
510      {
511         return addErrorAndRecover( "Missing ',' or '}' in object declaration",
512                                    comma,
513                                    tokenObjectEnd );
514      }
515      bool finalizeTokenOk = true;
516      while ( comma.type_ == tokenComment &&
517              finalizeTokenOk )
518         finalizeTokenOk = readToken( comma );
519      if ( comma.type_ == tokenObjectEnd )
520         return true;
521   }
522   return addErrorAndRecover( "Missing '}' or object member name",
523                              tokenName,
524                              tokenObjectEnd );
525}
526
527
528bool
529Reader::readArray( Token &/*tokenStart*/ )
530{
531   currentValue() = Value( arrayValue );
532   skipSpaces();
533   if ( *current_ == ']' ) // empty array
534   {
535      Token endArray;
536      readToken( endArray );
537      return true;
538   }
539   int index = 0;
540   for (;;)
541   {
542      Value &value = currentValue()[ index++ ];
543      nodes_.push( &value );
544      bool ok = readValue();
545      nodes_.pop();
546      if ( !ok ) // error already set
547         return recoverFromError( tokenArrayEnd );
548
549      Token token;
550      // Accept Comment after last item in the array.
551      ok = readToken( token );
552      while ( token.type_ == tokenComment  &&  ok )
553      {
554         ok = readToken( token );
555      }
556      bool badTokenType = ( token.type_ != tokenArraySeparator  &&
557                            token.type_ != tokenArrayEnd );
558      if ( !ok  ||  badTokenType )
559      {
560         return addErrorAndRecover( "Missing ',' or ']' in array declaration",
561                                    token,
562                                    tokenArrayEnd );
563      }
564      if ( token.type_ == tokenArrayEnd )
565         break;
566   }
567   return true;
568}
569
570
571bool
572Reader::decodeNumber( Token &token )
573{
574   bool isDouble = false;
575   for ( Location inspect = token.start_; inspect != token.end_; ++inspect )
576   {
577      isDouble = isDouble
578                 ||  in( *inspect, '.', 'e', 'E', '+' )
579                 ||  ( *inspect == '-'  &&  inspect != token.start_ );
580   }
581   if ( isDouble )
582      return decodeDouble( token );
583   // Attempts to parse the number as an integer. If the number is
584   // larger than the maximum supported value of an integer then
585   // we decode the number as a double.
586   Location current = token.start_;
587   bool isNegative = *current == '-';
588   if ( isNegative )
589      ++current;
590   Value::LargestUInt maxIntegerValue = isNegative ? Value::LargestUInt(-Value::minLargestInt)
591                                                   : Value::maxLargestUInt;
592   Value::LargestUInt threshold = maxIntegerValue / 10;
593   Value::LargestUInt value = 0;
594   while ( current < token.end_ )
595   {
596      Char c = *current++;
597      if ( c < '0'  ||  c > '9' )
598         return addError( "'" + std::string( token.start_, token.end_ ) + "' is not a number.", token );
599      Value::UInt digit(c - '0');
600      if ( value >= threshold )
601      {
602         // We've hit or exceeded the max value divided by 10 (rounded down). If
603         // a) we've only just touched the limit, b) this is the last digit, and
604         // c) it's small enough to fit in that rounding delta, we're okay.
605         // Otherwise treat this number as a double to avoid overflow.
606         if (value > threshold ||
607             current != token.end_ ||
608             digit > maxIntegerValue % 10)
609         {
610            return decodeDouble( token );
611         }
612      }
613      value = value * 10 + digit;
614   }
615   if ( isNegative )
616      currentValue() = -Value::LargestInt( value );
617   else if ( value <= Value::LargestUInt(Value::maxInt) )
618      currentValue() = Value::LargestInt( value );
619   else
620      currentValue() = value;
621   return true;
622}
623
624
625bool
626Reader::decodeDouble( Token &token )
627{
628   double value = 0;
629   const int bufferSize = 32;
630   int count;
631   int length = int(token.end_ - token.start_);
632
633   // Sanity check to avoid buffer overflow exploits.
634   if (length < 0) {
635      return addError( "Unable to parse token length", token );
636   }
637
638   // Avoid using a string constant for the format control string given to
639   // sscanf, as this can cause hard to debug crashes on OS X. See here for more
640   // info:
641   //
642   //     http://developer.apple.com/library/mac/#DOCUMENTATION/DeveloperTools/gcc-4.0.1/gcc/Incompatibilities.html
643   char format[] = "%lf";
644
645   if ( length <= bufferSize )
646   {
647      Char buffer[bufferSize+1];
648      memcpy( buffer, token.start_, length );
649      buffer[length] = 0;
650      count = sscanf( buffer, format, &value );
651   }
652   else
653   {
654      std::string buffer( token.start_, token.end_ );
655      count = sscanf( buffer.c_str(), format, &value );
656   }
657
658   if ( count != 1 )
659      return addError( "'" + std::string( token.start_, token.end_ ) + "' is not a number.", token );
660   currentValue() = value;
661   return true;
662}
663
664
665bool
666Reader::decodeString( Token &token )
667{
668   std::string decoded;
669   if ( !decodeString( token, decoded ) )
670      return false;
671   currentValue() = decoded;
672   return true;
673}
674
675
676bool
677Reader::decodeString( Token &token, std::string &decoded )
678{
679   decoded.reserve( token.end_ - token.start_ - 2 );
680   Location current = token.start_ + 1; // skip '"'
681   Location end = token.end_ - 1;      // do not include '"'
682   while ( current != end )
683   {
684      Char c = *current++;
685      if ( c == '"' )
686         break;
687      else if ( c == '\\' )
688      {
689         if ( current == end )
690            return addError( "Empty escape sequence in string", token, current );
691         Char escape = *current++;
692         switch ( escape )
693         {
694         case '"': decoded += '"'; break;
695         case '/': decoded += '/'; break;
696         case '\\': decoded += '\\'; break;
697         case 'b': decoded += '\b'; break;
698         case 'f': decoded += '\f'; break;
699         case 'n': decoded += '\n'; break;
700         case 'r': decoded += '\r'; break;
701         case 't': decoded += '\t'; break;
702         case 'u':
703            {
704               unsigned int unicode;
705               if ( !decodeUnicodeCodePoint( token, current, end, unicode ) )
706                  return false;
707               decoded += codePointToUTF8(unicode);
708            }
709            break;
710         default:
711            return addError( "Bad escape sequence in string", token, current );
712         }
713      }
714      else
715      {
716         decoded += c;
717      }
718   }
719   return true;
720}
721
722bool
723Reader::decodeUnicodeCodePoint( Token &token,
724                                     Location &current,
725                                     Location end,
726                                     unsigned int &unicode )
727{
728
729   if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
730      return false;
731   if (unicode >= 0xD800 && unicode <= 0xDBFF)
732   {
733      // surrogate pairs
734      if (end - current < 6)
735         return addError( "additional six characters expected to parse unicode surrogate pair.", token, current );
736      unsigned int surrogatePair;
737      if (*(current++) == '\\' && *(current++)== 'u')
738      {
739         if (decodeUnicodeEscapeSequence( token, current, end, surrogatePair ))
740         {
741            unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
742         }
743         else
744            return false;
745      }
746      else
747         return addError( "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current );
748   }
749   return true;
750}
751
752bool
753Reader::decodeUnicodeEscapeSequence( Token &token,
754                                     Location &current,
755                                     Location end,
756                                     unsigned int &unicode )
757{
758   if ( end - current < 4 )
759      return addError( "Bad unicode escape sequence in string: four digits expected.", token, current );
760   unicode = 0;
761   for ( int index =0; index < 4; ++index )
762   {
763      Char c = *current++;
764      unicode *= 16;
765      if ( c >= '0'  &&  c <= '9' )
766         unicode += c - '0';
767      else if ( c >= 'a'  &&  c <= 'f' )
768         unicode += c - 'a' + 10;
769      else if ( c >= 'A'  &&  c <= 'F' )
770         unicode += c - 'A' + 10;
771      else
772         return addError( "Bad unicode escape sequence in string: hexadecimal digit expected.", token, current );
773   }
774   return true;
775}
776
777
778bool
779Reader::addError( const std::string &message,
780                  Token &token,
781                  Location extra )
782{
783   ErrorInfo info;
784   info.token_ = token;
785   info.message_ = message;
786   info.extra_ = extra;
787   errors_.push_back( info );
788   return false;
789}
790
791
792bool
793Reader::recoverFromError( TokenType skipUntilToken )
794{
795   int errorCount = int(errors_.size());
796   Token skip;
797   for (;;)
798   {
799      if ( !readToken(skip) )
800         errors_.resize( errorCount ); // discard errors caused by recovery
801      if ( skip.type_ == skipUntilToken  ||  skip.type_ == tokenEndOfStream )
802         break;
803   }
804   errors_.resize( errorCount );
805   return false;
806}
807
808
809bool
810Reader::addErrorAndRecover( const std::string &message,
811                            Token &token,
812                            TokenType skipUntilToken )
813{
814   addError( message, token );
815   return recoverFromError( skipUntilToken );
816}
817
818
819Value &
820Reader::currentValue()
821{
822   return *(nodes_.top());
823}
824
825
826Reader::Char
827Reader::getNextChar()
828{
829   if ( current_ == end_ )
830      return 0;
831   return *current_++;
832}
833
834
835void
836Reader::getLocationLineAndColumn( Location location,
837                                  int &line,
838                                  int &column ) const
839{
840   Location current = begin_;
841   Location lastLineStart = current;
842   line = 0;
843   while ( current < location  &&  current != end_ )
844   {
845      Char c = *current++;
846      if ( c == '\r' )
847      {
848         if ( *current == '\n' )
849            ++current;
850         lastLineStart = current;
851         ++line;
852      }
853      else if ( c == '\n' )
854      {
855         lastLineStart = current;
856         ++line;
857      }
858   }
859   // column & line start at 1
860   column = int(location - lastLineStart) + 1;
861   ++line;
862}
863
864
865std::string
866Reader::getLocationLineAndColumn( Location location ) const
867{
868   int line, column;
869   getLocationLineAndColumn( location, line, column );
870   char buffer[18+16+16+1];
871   sprintf( buffer, "Line %d, Column %d", line, column );
872   return buffer;
873}
874
875
876// Deprecated. Preserved for backward compatibility
877std::string
878Reader::getFormatedErrorMessages() const
879{
880    return getFormattedErrorMessages();
881}
882
883
884std::string
885Reader::getFormattedErrorMessages() const
886{
887   std::string formattedMessage;
888   for ( Errors::const_iterator itError = errors_.begin();
889         itError != errors_.end();
890         ++itError )
891   {
892      const ErrorInfo &error = *itError;
893      formattedMessage += "* " + getLocationLineAndColumn( error.token_.start_ ) + "\n";
894      formattedMessage += "  " + error.message_ + "\n";
895      if ( error.extra_ )
896         formattedMessage += "See " + getLocationLineAndColumn( error.extra_ ) + " for detail.\n";
897   }
898   return formattedMessage;
899}
900
901
902std::istream& operator>>( std::istream &sin, Value &root )
903{
904    Json::Reader reader;
905    bool ok = reader.parse(sin, root, true);
906    if (!ok) {
907      fprintf(
908          stderr,
909          "Error from reader: %s",
910          reader.getFormattedErrorMessages().c_str());
911
912      JSON_FAIL_MESSAGE("reader error");
913    }
914    return sin;
915}
916
917
918} // namespace Json
919