json_reader.cpp revision a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7
1// Copyright 2007-2011 Baptiste Lepilleur
2// Distributed under MIT license, or public domain if desired and
3// recognized in your jurisdiction.
4// See file LICENSE for detail or copy at http://jsoncpp.sourceforge.net/LICENSE
5
6#if !defined(JSON_IS_AMALGAMATION)
7# include <json/assertions.h>
8# include <json/reader.h>
9# include <json/value.h>
10# include "json_tool.h"
11#endif // if !defined(JSON_IS_AMALGAMATION)
12#include <utility>
13#include <cstdio>
14#include <cassert>
15#include <cstring>
16#include <stdexcept>
17#ifdef __pnacl__
18// This file uses the following headers (at least in Reader::parse), but
19// the upstream version doesn't include them because iostream pulls in
20// static initializers.  This breaks the PNaCl build because it uses
21// libc++ which declares getline in <string> (as per the C++ standard)
22// but defines it in <iostream>. The code therefore fails linking, which
23// these includes fix.
24#include <string>
25#include <iostream>
26#endif
27
28#if _MSC_VER >= 1400 // VC++ 8.0
29#pragma warning( disable : 4996 )   // disable warning about strdup being deprecated.
30#endif
31
32namespace Json {
33
34// Implementation of class Features
35// ////////////////////////////////
36
37Features::Features()
38   : allowComments_( true )
39   , strictRoot_( false )
40{
41}
42
43
44Features
45Features::all()
46{
47   return Features();
48}
49
50
51Features
52Features::strictMode()
53{
54   Features features;
55   features.allowComments_ = false;
56   features.strictRoot_ = true;
57   return features;
58}
59
60// Implementation of class Reader
61// ////////////////////////////////
62
63
64static inline bool
65in( Reader::Char c, Reader::Char c1, Reader::Char c2, Reader::Char c3, Reader::Char c4 )
66{
67   return c == c1  ||  c == c2  ||  c == c3  ||  c == c4;
68}
69
70static inline bool
71in( Reader::Char c, Reader::Char c1, Reader::Char c2, Reader::Char c3, Reader::Char c4, Reader::Char c5 )
72{
73   return c == c1  ||  c == c2  ||  c == c3  ||  c == c4  ||  c == c5;
74}
75
76
77static bool
78containsNewLine( Reader::Location begin,
79                 Reader::Location end )
80{
81   for ( ;begin < end; ++begin )
82      if ( *begin == '\n'  ||  *begin == '\r' )
83         return true;
84   return false;
85}
86
87
88// Class Reader
89// //////////////////////////////////////////////////////////////////
90
91Reader::Reader()
92    : errors_(),
93      document_(),
94      begin_(),
95      end_(),
96      current_(),
97      lastValueEnd_(),
98      lastValue_(),
99      commentsBefore_(),
100      features_( Features::all() ),
101      collectComments_()
102{
103}
104
105
106Reader::Reader( const Features &features )
107    : errors_(),
108      document_(),
109      begin_(),
110      end_(),
111      current_(),
112      lastValueEnd_(),
113      lastValue_(),
114      commentsBefore_(),
115      features_( features ),
116      collectComments_()
117{
118}
119
120
121bool
122Reader::parse( const std::string &document,
123               Value &root,
124               bool collectComments )
125{
126   document_ = document;
127   const char *begin = document_.c_str();
128   const char *end = begin + document_.length();
129   return parse( begin, end, root, collectComments );
130}
131
132
133bool
134Reader::parse( std::istream& sin,
135               Value &root,
136               bool collectComments )
137{
138   //std::istream_iterator<char> begin(sin);
139   //std::istream_iterator<char> end;
140   // Those would allow streamed input from a file, if parse() were a
141   // template function.
142
143   // Since std::string is reference-counted, this at least does not
144   // create an extra copy.
145   std::string doc;
146   std::getline(sin, doc, (char)EOF);
147   return parse( doc, root, collectComments );
148}
149
150bool
151Reader::parse( const char *beginDoc, const char *endDoc,
152               Value &root,
153               bool collectComments )
154{
155   if ( !features_.allowComments_ )
156   {
157      collectComments = false;
158   }
159
160   begin_ = beginDoc;
161   end_ = endDoc;
162   collectComments_ = collectComments;
163   current_ = begin_;
164   lastValueEnd_ = 0;
165   lastValue_ = 0;
166   commentsBefore_ = "";
167   errors_.clear();
168   while ( !nodes_.empty() )
169      nodes_.pop();
170   nodes_.push( &root );
171
172   bool successful = readValue();
173   Token token;
174   skipCommentTokens( token );
175   if ( collectComments_  &&  !commentsBefore_.empty() )
176      root.setComment( commentsBefore_, commentAfter );
177   if ( features_.strictRoot_ )
178   {
179      if ( !root.isArray()  &&  !root.isObject() )
180      {
181         // Set error location to start of doc, ideally should be first token found in doc
182         token.type_ = tokenError;
183         token.start_ = beginDoc;
184         token.end_ = endDoc;
185         addError( "A valid JSON document must be either an array or an object value.",
186                   token );
187         return false;
188      }
189   }
190   return successful;
191}
192
193
194bool
195Reader::readValue()
196{
197   Token token;
198   skipCommentTokens( token );
199   bool successful = true;
200
201   if ( collectComments_  &&  !commentsBefore_.empty() )
202   {
203      currentValue().setComment( commentsBefore_, commentBefore );
204      commentsBefore_ = "";
205   }
206
207
208   switch ( token.type_ )
209   {
210   case tokenObjectBegin:
211      successful = readObject( token );
212      break;
213   case tokenArrayBegin:
214      successful = readArray( token );
215      break;
216   case tokenNumber:
217      successful = decodeNumber( token );
218      break;
219   case tokenString:
220      successful = decodeString( token );
221      break;
222   case tokenTrue:
223      currentValue() = true;
224      break;
225   case tokenFalse:
226      currentValue() = false;
227      break;
228   case tokenNull:
229      currentValue() = Value();
230      break;
231   default:
232      return addError( "Syntax error: value, object or array expected.", token );
233   }
234
235   if ( collectComments_ )
236   {
237      lastValueEnd_ = current_;
238      lastValue_ = &currentValue();
239   }
240
241   return successful;
242}
243
244
245void
246Reader::skipCommentTokens( Token &token )
247{
248   if ( features_.allowComments_ )
249   {
250      do
251      {
252         readToken( token );
253      }
254      while ( token.type_ == tokenComment );
255   }
256   else
257   {
258      readToken( token );
259   }
260}
261
262
263bool
264Reader::expectToken( TokenType type, Token &token, const char *message )
265{
266   readToken( token );
267   if ( token.type_ != type )
268      return addError( message, token );
269   return true;
270}
271
272
273bool
274Reader::readToken( Token &token )
275{
276   skipSpaces();
277   token.start_ = current_;
278   Char c = getNextChar();
279   bool ok = true;
280   switch ( c )
281   {
282   case '{':
283      token.type_ = tokenObjectBegin;
284      break;
285   case '}':
286      token.type_ = tokenObjectEnd;
287      break;
288   case '[':
289      token.type_ = tokenArrayBegin;
290      break;
291   case ']':
292      token.type_ = tokenArrayEnd;
293      break;
294   case '"':
295      token.type_ = tokenString;
296      ok = readString();
297      break;
298   case '/':
299      token.type_ = tokenComment;
300      ok = readComment();
301      break;
302   case '0':
303   case '1':
304   case '2':
305   case '3':
306   case '4':
307   case '5':
308   case '6':
309   case '7':
310   case '8':
311   case '9':
312   case '-':
313      token.type_ = tokenNumber;
314      readNumber();
315      break;
316   case 't':
317      token.type_ = tokenTrue;
318      ok = match( "rue", 3 );
319      break;
320   case 'f':
321      token.type_ = tokenFalse;
322      ok = match( "alse", 4 );
323      break;
324   case 'n':
325      token.type_ = tokenNull;
326      ok = match( "ull", 3 );
327      break;
328   case ',':
329      token.type_ = tokenArraySeparator;
330      break;
331   case ':':
332      token.type_ = tokenMemberSeparator;
333      break;
334   case 0:
335      token.type_ = tokenEndOfStream;
336      break;
337   default:
338      ok = false;
339      break;
340   }
341   if ( !ok )
342      token.type_ = tokenError;
343   token.end_ = current_;
344   return true;
345}
346
347
348void
349Reader::skipSpaces()
350{
351   while ( current_ != end_ )
352   {
353      Char c = *current_;
354      if ( c == ' '  ||  c == '\t'  ||  c == '\r'  ||  c == '\n' )
355         ++current_;
356      else
357         break;
358   }
359}
360
361
362bool
363Reader::match( Location pattern,
364               int patternLength )
365{
366   if ( end_ - current_ < patternLength )
367      return false;
368   int index = patternLength;
369   while ( index-- )
370      if ( current_[index] != pattern[index] )
371         return false;
372   current_ += patternLength;
373   return true;
374}
375
376
377bool
378Reader::readComment()
379{
380   Location commentBegin = current_ - 1;
381   Char c = getNextChar();
382   bool successful = false;
383   if ( c == '*' )
384      successful = readCStyleComment();
385   else if ( c == '/' )
386      successful = readCppStyleComment();
387   if ( !successful )
388      return false;
389
390   if ( collectComments_ )
391   {
392      CommentPlacement placement = commentBefore;
393      if ( lastValueEnd_  &&  !containsNewLine( lastValueEnd_, commentBegin ) )
394      {
395         if ( c != '*'  ||  !containsNewLine( commentBegin, current_ ) )
396            placement = commentAfterOnSameLine;
397      }
398
399      addComment( commentBegin, current_, placement );
400   }
401   return true;
402}
403
404
405void
406Reader::addComment( Location begin,
407                    Location end,
408                    CommentPlacement placement )
409{
410   assert( collectComments_ );
411   if ( placement == commentAfterOnSameLine )
412   {
413      assert( lastValue_ != 0 );
414      lastValue_->setComment( std::string( begin, end ), placement );
415   }
416   else
417   {
418      if ( !commentsBefore_.empty() )
419         commentsBefore_ += "\n";
420      commentsBefore_ += std::string( begin, end );
421   }
422}
423
424
425bool
426Reader::readCStyleComment()
427{
428   while ( current_ != end_ )
429   {
430      Char c = getNextChar();
431      if ( c == '*'  &&  *current_ == '/' )
432         break;
433   }
434   return getNextChar() == '/';
435}
436
437
438bool
439Reader::readCppStyleComment()
440{
441   while ( current_ != end_ )
442   {
443      Char c = getNextChar();
444      if (  c == '\r'  ||  c == '\n' )
445         break;
446   }
447   return true;
448}
449
450
451void
452Reader::readNumber()
453{
454   while ( current_ != end_ )
455   {
456      if ( !(*current_ >= '0'  &&  *current_ <= '9')  &&
457           !in( *current_, '.', 'e', 'E', '+', '-' ) )
458         break;
459      ++current_;
460   }
461}
462
463bool
464Reader::readString()
465{
466   Char c = 0;
467   while ( current_ != end_ )
468   {
469      c = getNextChar();
470      if ( c == '\\' )
471         getNextChar();
472      else if ( c == '"' )
473         break;
474   }
475   return c == '"';
476}
477
478
479bool
480Reader::readObject( Token &/*tokenStart*/ )
481{
482   Token tokenName;
483   std::string name;
484   currentValue() = Value( objectValue );
485   while ( readToken( tokenName ) )
486   {
487      bool initialTokenOk = true;
488      while ( tokenName.type_ == tokenComment  &&  initialTokenOk )
489         initialTokenOk = readToken( tokenName );
490      if  ( !initialTokenOk )
491         break;
492      if ( tokenName.type_ == tokenObjectEnd  &&  name.empty() )  // empty object
493         return true;
494      if ( tokenName.type_ != tokenString )
495         break;
496
497      name = "";
498      if ( !decodeString( tokenName, name ) )
499         return recoverFromError( tokenObjectEnd );
500
501      Token colon;
502      if ( !readToken( colon ) ||  colon.type_ != tokenMemberSeparator )
503      {
504         return addErrorAndRecover( "Missing ':' after object member name",
505                                    colon,
506                                    tokenObjectEnd );
507      }
508      Value &value = currentValue()[ name ];
509      nodes_.push( &value );
510      bool ok = readValue();
511      nodes_.pop();
512      if ( !ok ) // error already set
513         return recoverFromError( tokenObjectEnd );
514
515      Token comma;
516      if ( !readToken( comma )
517            ||  ( comma.type_ != tokenObjectEnd  &&
518                  comma.type_ != tokenArraySeparator &&
519                  comma.type_ != tokenComment ) )
520      {
521         return addErrorAndRecover( "Missing ',' or '}' in object declaration",
522                                    comma,
523                                    tokenObjectEnd );
524      }
525      bool finalizeTokenOk = true;
526      while ( comma.type_ == tokenComment &&
527              finalizeTokenOk )
528         finalizeTokenOk = readToken( comma );
529      if ( comma.type_ == tokenObjectEnd )
530         return true;
531   }
532   return addErrorAndRecover( "Missing '}' or object member name",
533                              tokenName,
534                              tokenObjectEnd );
535}
536
537
538bool
539Reader::readArray( Token &/*tokenStart*/ )
540{
541   currentValue() = Value( arrayValue );
542   skipSpaces();
543   if ( *current_ == ']' ) // empty array
544   {
545      Token endArray;
546      readToken( endArray );
547      return true;
548   }
549   int index = 0;
550   for (;;)
551   {
552      Value &value = currentValue()[ index++ ];
553      nodes_.push( &value );
554      bool ok = readValue();
555      nodes_.pop();
556      if ( !ok ) // error already set
557         return recoverFromError( tokenArrayEnd );
558
559      Token token;
560      // Accept Comment after last item in the array.
561      ok = readToken( token );
562      while ( token.type_ == tokenComment  &&  ok )
563      {
564         ok = readToken( token );
565      }
566      bool badTokenType = ( token.type_ != tokenArraySeparator  &&
567                            token.type_ != tokenArrayEnd );
568      if ( !ok  ||  badTokenType )
569      {
570         return addErrorAndRecover( "Missing ',' or ']' in array declaration",
571                                    token,
572                                    tokenArrayEnd );
573      }
574      if ( token.type_ == tokenArrayEnd )
575         break;
576   }
577   return true;
578}
579
580
581bool
582Reader::decodeNumber( Token &token )
583{
584   bool isDouble = false;
585   for ( Location inspect = token.start_; inspect != token.end_; ++inspect )
586   {
587      isDouble = isDouble
588                 ||  in( *inspect, '.', 'e', 'E', '+' )
589                 ||  ( *inspect == '-'  &&  inspect != token.start_ );
590   }
591   if ( isDouble )
592      return decodeDouble( token );
593   // Attempts to parse the number as an integer. If the number is
594   // larger than the maximum supported value of an integer then
595   // we decode the number as a double.
596   Location current = token.start_;
597   bool isNegative = *current == '-';
598   if ( isNegative )
599      ++current;
600   Value::LargestUInt maxIntegerValue = isNegative ? Value::LargestUInt(-Value::minLargestInt)
601                                                   : Value::maxLargestUInt;
602   Value::LargestUInt threshold = maxIntegerValue / 10;
603   Value::LargestUInt value = 0;
604   while ( current < token.end_ )
605   {
606      Char c = *current++;
607      if ( c < '0'  ||  c > '9' )
608         return addError( "'" + std::string( token.start_, token.end_ ) + "' is not a number.", token );
609      Value::UInt digit(c - '0');
610      if ( value >= threshold )
611      {
612         // We've hit or exceeded the max value divided by 10 (rounded down). If
613         // a) we've only just touched the limit, b) this is the last digit, and
614         // c) it's small enough to fit in that rounding delta, we're okay.
615         // Otherwise treat this number as a double to avoid overflow.
616         if (value > threshold ||
617             current != token.end_ ||
618             digit > maxIntegerValue % 10)
619         {
620            return decodeDouble( token );
621         }
622      }
623      value = value * 10 + digit;
624   }
625   if ( isNegative )
626      currentValue() = -Value::LargestInt( value );
627   else if ( value <= Value::LargestUInt(Value::maxInt) )
628      currentValue() = Value::LargestInt( value );
629   else
630      currentValue() = value;
631   return true;
632}
633
634
635bool
636Reader::decodeDouble( Token &token )
637{
638   double value = 0;
639   const int bufferSize = 32;
640   int count;
641   int length = int(token.end_ - token.start_);
642
643   // Sanity check to avoid buffer overflow exploits.
644   if (length < 0) {
645      return addError( "Unable to parse token length", token );
646   }
647
648   // Avoid using a string constant for the format control string given to
649   // sscanf, as this can cause hard to debug crashes on OS X. See here for more
650   // info:
651   //
652   //     http://developer.apple.com/library/mac/#DOCUMENTATION/DeveloperTools/gcc-4.0.1/gcc/Incompatibilities.html
653   char format[] = "%lf";
654
655   if ( length <= bufferSize )
656   {
657      Char buffer[bufferSize+1];
658      memcpy( buffer, token.start_, length );
659      buffer[length] = 0;
660      count = sscanf( buffer, format, &value );
661   }
662   else
663   {
664      std::string buffer( token.start_, token.end_ );
665      count = sscanf( buffer.c_str(), format, &value );
666   }
667
668   if ( count != 1 )
669      return addError( "'" + std::string( token.start_, token.end_ ) + "' is not a number.", token );
670   currentValue() = value;
671   return true;
672}
673
674
675bool
676Reader::decodeString( Token &token )
677{
678   std::string decoded;
679   if ( !decodeString( token, decoded ) )
680      return false;
681   currentValue() = decoded;
682   return true;
683}
684
685
686bool
687Reader::decodeString( Token &token, std::string &decoded )
688{
689   decoded.reserve( token.end_ - token.start_ - 2 );
690   Location current = token.start_ + 1; // skip '"'
691   Location end = token.end_ - 1;      // do not include '"'
692   while ( current != end )
693   {
694      Char c = *current++;
695      if ( c == '"' )
696         break;
697      else if ( c == '\\' )
698      {
699         if ( current == end )
700            return addError( "Empty escape sequence in string", token, current );
701         Char escape = *current++;
702         switch ( escape )
703         {
704         case '"': decoded += '"'; break;
705         case '/': decoded += '/'; break;
706         case '\\': decoded += '\\'; break;
707         case 'b': decoded += '\b'; break;
708         case 'f': decoded += '\f'; break;
709         case 'n': decoded += '\n'; break;
710         case 'r': decoded += '\r'; break;
711         case 't': decoded += '\t'; break;
712         case 'u':
713            {
714               unsigned int unicode;
715               if ( !decodeUnicodeCodePoint( token, current, end, unicode ) )
716                  return false;
717               decoded += codePointToUTF8(unicode);
718            }
719            break;
720         default:
721            return addError( "Bad escape sequence in string", token, current );
722         }
723      }
724      else
725      {
726         decoded += c;
727      }
728   }
729   return true;
730}
731
732bool
733Reader::decodeUnicodeCodePoint( Token &token,
734                                     Location &current,
735                                     Location end,
736                                     unsigned int &unicode )
737{
738
739   if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
740      return false;
741   if (unicode >= 0xD800 && unicode <= 0xDBFF)
742   {
743      // surrogate pairs
744      if (end - current < 6)
745         return addError( "additional six characters expected to parse unicode surrogate pair.", token, current );
746      unsigned int surrogatePair;
747      if (*(current++) == '\\' && *(current++)== 'u')
748      {
749         if (decodeUnicodeEscapeSequence( token, current, end, surrogatePair ))
750         {
751            unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
752         }
753         else
754            return false;
755      }
756      else
757         return addError( "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current );
758   }
759   return true;
760}
761
762bool
763Reader::decodeUnicodeEscapeSequence( Token &token,
764                                     Location &current,
765                                     Location end,
766                                     unsigned int &unicode )
767{
768   if ( end - current < 4 )
769      return addError( "Bad unicode escape sequence in string: four digits expected.", token, current );
770   unicode = 0;
771   for ( int index =0; index < 4; ++index )
772   {
773      Char c = *current++;
774      unicode *= 16;
775      if ( c >= '0'  &&  c <= '9' )
776         unicode += c - '0';
777      else if ( c >= 'a'  &&  c <= 'f' )
778         unicode += c - 'a' + 10;
779      else if ( c >= 'A'  &&  c <= 'F' )
780         unicode += c - 'A' + 10;
781      else
782         return addError( "Bad unicode escape sequence in string: hexadecimal digit expected.", token, current );
783   }
784   return true;
785}
786
787
788bool
789Reader::addError( const std::string &message,
790                  Token &token,
791                  Location extra )
792{
793   ErrorInfo info;
794   info.token_ = token;
795   info.message_ = message;
796   info.extra_ = extra;
797   errors_.push_back( info );
798   return false;
799}
800
801
802bool
803Reader::recoverFromError( TokenType skipUntilToken )
804{
805   int errorCount = int(errors_.size());
806   Token skip;
807   for (;;)
808   {
809      if ( !readToken(skip) )
810         errors_.resize( errorCount ); // discard errors caused by recovery
811      if ( skip.type_ == skipUntilToken  ||  skip.type_ == tokenEndOfStream )
812         break;
813   }
814   errors_.resize( errorCount );
815   return false;
816}
817
818
819bool
820Reader::addErrorAndRecover( const std::string &message,
821                            Token &token,
822                            TokenType skipUntilToken )
823{
824   addError( message, token );
825   return recoverFromError( skipUntilToken );
826}
827
828
829Value &
830Reader::currentValue()
831{
832   return *(nodes_.top());
833}
834
835
836Reader::Char
837Reader::getNextChar()
838{
839   if ( current_ == end_ )
840      return 0;
841   return *current_++;
842}
843
844
845void
846Reader::getLocationLineAndColumn( Location location,
847                                  int &line,
848                                  int &column ) const
849{
850   Location current = begin_;
851   Location lastLineStart = current;
852   line = 0;
853   while ( current < location  &&  current != end_ )
854   {
855      Char c = *current++;
856      if ( c == '\r' )
857      {
858         if ( *current == '\n' )
859            ++current;
860         lastLineStart = current;
861         ++line;
862      }
863      else if ( c == '\n' )
864      {
865         lastLineStart = current;
866         ++line;
867      }
868   }
869   // column & line start at 1
870   column = int(location - lastLineStart) + 1;
871   ++line;
872}
873
874
875std::string
876Reader::getLocationLineAndColumn( Location location ) const
877{
878   int line, column;
879   getLocationLineAndColumn( location, line, column );
880   char buffer[18+16+16+1];
881   sprintf( buffer, "Line %d, Column %d", line, column );
882   return buffer;
883}
884
885
886// Deprecated. Preserved for backward compatibility
887std::string
888Reader::getFormatedErrorMessages() const
889{
890    return getFormattedErrorMessages();
891}
892
893
894std::string
895Reader::getFormattedErrorMessages() const
896{
897   std::string formattedMessage;
898   for ( Errors::const_iterator itError = errors_.begin();
899         itError != errors_.end();
900         ++itError )
901   {
902      const ErrorInfo &error = *itError;
903      formattedMessage += "* " + getLocationLineAndColumn( error.token_.start_ ) + "\n";
904      formattedMessage += "  " + error.message_ + "\n";
905      if ( error.extra_ )
906         formattedMessage += "See " + getLocationLineAndColumn( error.extra_ ) + " for detail.\n";
907   }
908   return formattedMessage;
909}
910
911
912std::istream& operator>>( std::istream &sin, Value &root )
913{
914    Json::Reader reader;
915    bool ok = reader.parse(sin, root, true);
916    if (!ok) {
917      fprintf(
918          stderr,
919          "Error from reader: %s",
920          reader.getFormattedErrorMessages().c_str());
921
922      JSON_FAIL_MESSAGE("reader error");
923    }
924    return sin;
925}
926
927
928} // namespace Json
929