1/*
2www.sourceforge.net/projects/tinyxml
3Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4
5This software is provided 'as-is', without any express or implied
6warranty. In no event will the authors be held liable for any
7damages arising from the use of this software.
8
9Permission is granted to anyone to use this software for any
10purpose, including commercial applications, and to alter it and
11redistribute it freely, subject to the following restrictions:
12
131. The origin of this software must not be misrepresented; you must
14not claim that you wrote the original software. If you use this
15software in a product, an acknowledgment in the product documentation
16would be appreciated but is not required.
17
182. Altered source versions must be plainly marked as such, and
19must not be misrepresented as being the original software.
20
213. This notice may not be removed or altered from any source
22distribution.
23*/
24
25#include "tinyxml.h"
26#include <ctype.h>
27#include <stddef.h>
28
29//#define DEBUG_PARSER
30
// Note that "PutString" hardcodes the same list, so this table
// is less flexible than it appears. Changing the entries
// or their order will break PutString.
34TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
35{
36	{ "&amp;",  5, '&' },
37	{ "&lt;",   4, '<' },
38	{ "&gt;",   4, '>' },
39	{ "&quot;", 6, '\"' },
40	{ "&apos;", 6, '\'' }
41};
42
43// Bunch of unicode info at:
44//		http://www.unicode.org/faq/utf_bom.html
45// Including the basic of this table, which determines the #bytes in the
46// sequence from the lead byte. 1 placed for invalid sequences --
47// although the result will be junk, pass it through as much as possible.
48// Beware of the non-characters in UTF-8:
49//				ef bb bf (Microsoft "lead bytes")
50//				ef bf be
51//				ef bf bf
52
// The byte sequence ef bb bf (the Microsoft "lead bytes"), split into its
// three bytes so the parser can compare them one at a time.
const unsigned char TIXML_UTF_LEAD_0 = 0xEF;
const unsigned char TIXML_UTF_LEAD_1 = 0xBB;
const unsigned char TIXML_UTF_LEAD_2 = 0xBF;
56
57const int TiXmlBase::utf8ByteTable[256] =
58{
59	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
60		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00
61		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10
62		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20
63		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30
64		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40
65		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50
66		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60
67		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range
68		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid
69		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90
70		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0
71		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0
72		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte
73		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0
74		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte
75		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
76};
77
78
79void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
80{
81	const unsigned long BYTE_MASK = 0xBF;
82	const unsigned long BYTE_MARK = 0x80;
83	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
84
85	if (input < 0x80)
86		*length = 1;
87	else if ( input < 0x800 )
88		*length = 2;
89	else if ( input < 0x10000 )
90		*length = 3;
91	else if ( input < 0x200000 )
92		*length = 4;
93	else
94		{ *length = 0; return; }	// This code won't covert this correctly anyway.
95
96	output += *length;
97
98	// Scary scary fall throughs.
99	switch (*length)
100	{
101		case 4:
102			--output;
103			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
104			input >>= 6;
105		case 3:
106			--output;
107			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
108			input >>= 6;
109		case 2:
110			--output;
111			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
112			input >>= 6;
113		case 1:
114			--output;
115			*output = (char)(input | FIRST_BYTE_MARK[*length]);
116	}
117}
118
119
120/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
121{
122	// This will only work for low-ascii, everything else is assumed to be a valid
123	// letter. I'm not sure this is the best approach, but it is quite tricky trying
124	// to figure out alhabetical vs. not across encoding. So take a very
125	// conservative approach.
126
127//	if ( encoding == TIXML_ENCODING_UTF8 )
128//	{
129		if ( anyByte < 127 )
130			return isalpha( anyByte );
131		else
132			return 1;	// What else to do? The unicode set is huge...get the english ones right.
133//	}
134//	else
135//	{
136//		return isalpha( anyByte );
137//	}
138}
139
140
141/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
142{
143	// This will only work for low-ascii, everything else is assumed to be a valid
144	// letter. I'm not sure this is the best approach, but it is quite tricky trying
145	// to figure out alhabetical vs. not across encoding. So take a very
146	// conservative approach.
147
148//	if ( encoding == TIXML_ENCODING_UTF8 )
149//	{
150		if ( anyByte < 127 )
151			return isalnum( anyByte );
152		else
153			return 1;	// What else to do? The unicode set is huge...get the english ones right.
154//	}
155//	else
156//	{
157//		return isalnum( anyByte );
158//	}
159}
160
161
162class TiXmlParsingData
163{
164	friend class TiXmlDocument;
165  public:
166	void Stamp( const char* now, TiXmlEncoding encoding );
167
168	const TiXmlCursor& Cursor()	{ return cursor; }
169
170  private:
171	// Only used by the document!
172	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
173	{
174		assert( start );
175		stamp = start;
176		tabsize = _tabsize;
177		cursor.row = row;
178		cursor.col = col;
179	}
180
181	TiXmlCursor		cursor;
182	const char*		stamp;
183	int				tabsize;
184};
185
186
187void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
188{
189	assert( now );
190
191	// Do nothing if the tabsize is 0.
192	if ( tabsize < 1 )
193	{
194		return;
195	}
196
197	// Get the current row, column.
198	int row = cursor.row;
199	int col = cursor.col;
200	const char* p = stamp;
201	assert( p );
202
203	while ( p < now )
204	{
205		// Treat p as unsigned, so we have a happy compiler.
206		const unsigned char* pU = (const unsigned char*)p;
207
208		// Code contributed by Fletcher Dunn: (modified by lee)
209		switch (*pU) {
210			case 0:
211				// We *should* never get here, but in case we do, don't
212				// advance past the terminating null character, ever
213				return;
214
215			case '\r':
216				// bump down to the next line
217				++row;
218				col = 0;
219				// Eat the character
220				++p;
221
222				// Check for \r\n sequence, and treat this as a single character
223				if (*p == '\n') {
224					++p;
225				}
226				break;
227
228			case '\n':
229				// bump down to the next line
230				++row;
231				col = 0;
232
233				// Eat the character
234				++p;
235
236				// Check for \n\r sequence, and treat this as a single
237				// character.  (Yes, this bizarre thing does occur still
238				// on some arcane platforms...)
239				if (*p == '\r') {
240					++p;
241				}
242				break;
243
244			case '\t':
245				// Eat the character
246				++p;
247
248				// Skip to next tab stop
249				col = (col / tabsize + 1) * tabsize;
250				break;
251
252			case TIXML_UTF_LEAD_0:
253				if ( encoding == TIXML_ENCODING_UTF8 )
254				{
255					if ( *(p+1) && *(p+2) )
256					{
257						// In these cases, don't advance the column. These are
258						// 0-width spaces.
259						if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
260							p += 3;
261						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
262							p += 3;
263						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
264							p += 3;
265						else
266							{ p +=3; ++col; }	// A normal character.
267					}
268				}
269				else
270				{
271					++p;
272					++col;
273				}
274				break;
275
276			default:
277				if ( encoding == TIXML_ENCODING_UTF8 )
278				{
279					// Eat the 1 to 4 byte utf8 character.
280					int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)];
281					if ( step == 0 )
282						step = 1;		// Error case from bad encoding, but handle gracefully.
283					p += step;
284
285					// Just advance one column, of course.
286					++col;
287				}
288				else
289				{
290					++p;
291					++col;
292				}
293				break;
294		}
295	}
296	cursor.row = row;
297	cursor.col = col;
298	assert( cursor.row >= -1 );
299	assert( cursor.col >= -1 );
300	stamp = p;
301	assert( stamp );
302}
303
304
305const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
306{
307	if ( !p || !*p )
308	{
309		return 0;
310	}
311	if ( encoding == TIXML_ENCODING_UTF8 )
312	{
313		while ( *p )
314		{
315			const unsigned char* pU = (const unsigned char*)p;
316
317			// Skip the stupid Microsoft UTF-8 Byte order marks
318			if (	*(pU+0)==TIXML_UTF_LEAD_0
319				 && *(pU+1)==TIXML_UTF_LEAD_1
320				 && *(pU+2)==TIXML_UTF_LEAD_2 )
321			{
322				p += 3;
323				continue;
324			}
325			else if(*(pU+0)==TIXML_UTF_LEAD_0
326				 && *(pU+1)==0xbfU
327				 && *(pU+2)==0xbeU )
328			{
329				p += 3;
330				continue;
331			}
332			else if(*(pU+0)==TIXML_UTF_LEAD_0
333				 && *(pU+1)==0xbfU
334				 && *(pU+2)==0xbfU )
335			{
336				p += 3;
337				continue;
338			}
339
340			if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )		// Still using old rules for white space.
341				++p;
342			else
343				break;
344		}
345	}
346	else
347	{
348		while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )
349			++p;
350	}
351
352	return p;
353}
354
355#ifdef TIXML_USE_STL
356/*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
357{
358	for( ;; )
359	{
360		if ( !in->good() ) return false;
361
362		int c = in->peek();
363		// At this scope, we can't get to a document. So fail silently.
364		if ( !IsWhiteSpace( c ) || c <= 0 )
365			return true;
366
367		*tag += (char) in->get();
368	}
369}
370
371/*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
372{
373	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
374	while ( in->good() )
375	{
376		int c = in->peek();
377		if ( c == character )
378			return true;
379		if ( c <= 0 )		// Silent failure: can't get document at this scope
380			return false;
381
382		in->get();
383		*tag += (char) c;
384	}
385	return false;
386}
387#endif
388
389const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
390{
391	*name = "";
392	assert( p );
393
394	// Names start with letters or underscores.
395	// Of course, in unicode, tinyxml has no idea what a letter *is*. The
396	// algorithm is generous.
397	//
398	// After that, they can be letters, underscores, numbers,
399	// hyphens, or colons. (Colons are valid ony for namespaces,
400	// but tinyxml can't tell namespaces from names.)
401	if (    p && *p
402		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
403	{
404		while(		p && *p
405				&&	(		IsAlphaNum( (unsigned char ) *p, encoding )
406						 || *p == '_'
407						 || *p == '-'
408						 || *p == '.'
409						 || *p == ':' ) )
410		{
411			(*name) += *p;
412			++p;
413		}
414		return p;
415	}
416	return 0;
417}
418
419const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
420{
421	// Presume an entity, and pull it out.
422    TIXML_STRING ent;
423	int i;
424	*length = 0;
425
426	if ( *(p+1) && *(p+1) == '#' && *(p+2) )
427	{
428		unsigned long ucs = 0;
429		ptrdiff_t delta = 0;
430		unsigned mult = 1;
431
432		if ( *(p+2) == 'x' )
433		{
434			// Hexadecimal.
435			if ( !*(p+3) ) return 0;
436
437			const char* q = p+3;
438			q = strchr( q, ';' );
439
440			if ( !q || !*q ) return 0;
441
442			delta = q-p;
443			--q;
444
445			while ( *q != 'x' )
446			{
447				if ( *q >= '0' && *q <= '9' )
448					ucs += mult * (*q - '0');
449				else if ( *q >= 'a' && *q <= 'f' )
450					ucs += mult * (*q - 'a' + 10);
451				else if ( *q >= 'A' && *q <= 'F' )
452					ucs += mult * (*q - 'A' + 10 );
453				else
454					return 0;
455				mult *= 16;
456				--q;
457			}
458		}
459		else
460		{
461			// Decimal.
462			if ( !*(p+2) ) return 0;
463
464			const char* q = p+2;
465			q = strchr( q, ';' );
466
467			if ( !q || !*q ) return 0;
468
469			delta = q-p;
470			--q;
471
472			while ( *q != '#' )
473			{
474				if ( *q >= '0' && *q <= '9' )
475					ucs += mult * (*q - '0');
476				else
477					return 0;
478				mult *= 10;
479				--q;
480			}
481		}
482		if ( encoding == TIXML_ENCODING_UTF8 )
483		{
484			// convert the UCS to UTF-8
485			ConvertUTF32ToUTF8( ucs, value, length );
486		}
487		else
488		{
489			*value = (char)ucs;
490			*length = 1;
491		}
492		return p + delta + 1;
493	}
494
495	// Now try to match it.
496	for( i=0; i<NUM_ENTITY; ++i )
497	{
498		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
499		{
500			assert( strlen( entity[i].str ) == entity[i].strLength );
501			*value = entity[i].chr;
502			*length = 1;
503			return ( p + entity[i].strLength );
504		}
505	}
506
507	// So it wasn't an entity, its unrecognized, or something like that.
508	*value = *p;	// Don't put back the last one, since we return it!
509	return p+1;
510}
511
512
513bool TiXmlBase::StringEqual( const char* p,
514							 const char* tag,
515							 bool ignoreCase,
516							 TiXmlEncoding encoding )
517{
518	assert( p );
519	assert( tag );
520	if ( !p || !*p )
521	{
522		assert( 0 );
523		return false;
524	}
525
526	const char* q = p;
527
528	if ( ignoreCase )
529	{
530		while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
531		{
532			++q;
533			++tag;
534		}
535
536		if ( *tag == 0 )
537			return true;
538	}
539	else
540	{
541		while ( *q && *tag && *q == *tag )
542		{
543			++q;
544			++tag;
545		}
546
547		if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
548			return true;
549	}
550	return false;
551}
552
553const char* TiXmlBase::ReadText(	const char* p,
554									TIXML_STRING * text,
555									bool trimWhiteSpace,
556									const char* endTag,
557									bool caseInsensitive,
558									TiXmlEncoding encoding )
559{
560    *text = "";
561	if (    !trimWhiteSpace			// certain tags always keep whitespace
562		 || !condenseWhiteSpace )	// if true, whitespace is always kept
563	{
564		// Keep all the white space.
565		while (	   p && *p
566				&& !StringEqual( p, endTag, caseInsensitive, encoding )
567			  )
568		{
569			int len;
570			char cArr[4] = { 0, 0, 0, 0 };
571			p = GetChar( p, cArr, &len, encoding );
572			text->append( cArr, len );
573		}
574	}
575	else
576	{
577		bool whitespace = false;
578
579		// Remove leading white space:
580		p = SkipWhiteSpace( p, encoding );
581		while (	   p && *p
582				&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
583		{
584			if ( *p == '\r' || *p == '\n' )
585			{
586				whitespace = true;
587				++p;
588			}
589			else if ( IsWhiteSpace( *p ) )
590			{
591				whitespace = true;
592				++p;
593			}
594			else
595			{
596				// If we've found whitespace, add it before the
597				// new character. Any whitespace just becomes a space.
598				if ( whitespace )
599				{
600					(*text) += ' ';
601					whitespace = false;
602				}
603				int len;
604				char cArr[4] = { 0, 0, 0, 0 };
605				p = GetChar( p, cArr, &len, encoding );
606				if ( len == 1 )
607					(*text) += cArr[0];	// more efficient
608				else
609					text->append( cArr, len );
610			}
611		}
612	}
613	return p + strlen( endTag );
614}
615
616#ifdef TIXML_USE_STL
617
618void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
619{
620	// The basic issue with a document is that we don't know what we're
621	// streaming. Read something presumed to be a tag (and hope), then
622	// identify it, and call the appropriate stream method on the tag.
623	//
624	// This "pre-streaming" will never read the closing ">" so the
625	// sub-tag can orient itself.
626
627	if ( !StreamTo( in, '<', tag ) )
628	{
629		SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
630		return;
631	}
632
633	while ( in->good() )
634	{
635		int tagIndex = (int) tag->length();
636		while ( in->good() && in->peek() != '>' )
637		{
638			int c = in->get();
639			if ( c <= 0 )
640			{
641				SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
642				break;
643			}
644			(*tag) += (char) c;
645		}
646
647		if ( in->good() )
648		{
649			// We now have something we presume to be a node of
650			// some sort. Identify it, and call the node to
651			// continue streaming.
652			TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
653
654			if ( node )
655			{
656				node->StreamIn( in, tag );
657				bool isElement = node->ToElement() != 0;
658				delete node;
659				node = 0;
660
661				// If this is the root element, we're done. Parsing will be
662				// done by the >> operator.
663				if ( isElement )
664				{
665					return;
666				}
667			}
668			else
669			{
670				SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
671				return;
672			}
673		}
674	}
675	// We should have returned sooner.
676	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
677}
678
679#endif
680
681const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
682{
683	ClearError();
684
685	// Parse away, at the document level. Since a document
686	// contains nothing but other tags, most of what happens
687	// here is skipping white space.
688	if ( !p || !*p )
689	{
690		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
691		return 0;
692	}
693
694	// Note that, for a document, this needs to come
695	// before the while space skip, so that parsing
696	// starts from the pointer we are given.
697	location.Clear();
698	if ( prevData )
699	{
700		location.row = prevData->cursor.row;
701		location.col = prevData->cursor.col;
702	}
703	else
704	{
705		location.row = 0;
706		location.col = 0;
707	}
708	TiXmlParsingData data( p, TabSize(), location.row, location.col );
709	location = data.Cursor();
710
711	if ( encoding == TIXML_ENCODING_UNKNOWN )
712	{
713		// Check for the Microsoft UTF-8 lead bytes.
714		const unsigned char* pU = (const unsigned char*)p;
715		if (	*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
716			 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
717			 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
718		{
719			encoding = TIXML_ENCODING_UTF8;
720			useMicrosoftBOM = true;
721		}
722	}
723
724    p = SkipWhiteSpace( p, encoding );
725	if ( !p )
726	{
727		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
728		return 0;
729	}
730
731	while ( p && *p )
732	{
733		TiXmlNode* node = Identify( p, encoding );
734		if ( node )
735		{
736			p = node->Parse( p, &data, encoding );
737			LinkEndChild( node );
738		}
739		else
740		{
741			break;
742		}
743
744		// Did we get encoding info?
745		if (    encoding == TIXML_ENCODING_UNKNOWN
746			 && node->ToDeclaration() )
747		{
748			TiXmlDeclaration* dec = node->ToDeclaration();
749			const char* enc = dec->Encoding();
750			assert( enc );
751
752			if ( *enc == 0 )
753				encoding = TIXML_ENCODING_UTF8;
754			else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
755				encoding = TIXML_ENCODING_UTF8;
756			else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
757				encoding = TIXML_ENCODING_UTF8;	// incorrect, but be nice
758			else
759				encoding = TIXML_ENCODING_LEGACY;
760		}
761
762		p = SkipWhiteSpace( p, encoding );
763	}
764
765	// Was this empty?
766	if ( !firstChild ) {
767		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
768		return 0;
769	}
770
771	// All is well.
772	return p;
773}
774
775void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
776{
777	// The first error in a chain is more accurate - don't set again!
778	if ( error )
779		return;
780
781	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
782	error   = true;
783	errorId = err;
784	errorDesc = errorString[ errorId ];
785
786	errorLocation.Clear();
787	if ( pError && data )
788	{
789		data->Stamp( pError, encoding );
790		errorLocation = data->Cursor();
791	}
792}
793
794
795TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
796{
797	TiXmlNode* returnNode = 0;
798
799	p = SkipWhiteSpace( p, encoding );
800	if( !p || !*p || *p != '<' )
801	{
802		return 0;
803	}
804
805	TiXmlDocument* doc = GetDocument();
806	p = SkipWhiteSpace( p, encoding );
807
808	if ( !p || !*p )
809	{
810		return 0;
811	}
812
813	// What is this thing?
814	// - Elements start with a letter or underscore, but xml is reserved.
815	// - Comments: <!--
816	// - Decleration: <?xml
817	// - Everthing else is unknown to tinyxml.
818	//
819
820	const char* xmlHeader = { "<?xml" };
821	const char* commentHeader = { "<!--" };
822	const char* dtdHeader = { "<!" };
823	const char* cdataHeader = { "<![CDATA[" };
824
825	if ( StringEqual( p, xmlHeader, true, encoding ) )
826	{
827		#ifdef DEBUG_PARSER
828			TIXML_LOG( "XML parsing Declaration\n" );
829		#endif
830		returnNode = new TiXmlDeclaration();
831	}
832	else if ( StringEqual( p, commentHeader, false, encoding ) )
833	{
834		#ifdef DEBUG_PARSER
835			TIXML_LOG( "XML parsing Comment\n" );
836		#endif
837		returnNode = new TiXmlComment();
838	}
839	else if ( StringEqual( p, cdataHeader, false, encoding ) )
840	{
841		#ifdef DEBUG_PARSER
842			TIXML_LOG( "XML parsing CDATA\n" );
843		#endif
844		TiXmlText* text = new TiXmlText( "" );
845		text->SetCDATA( true );
846		returnNode = text;
847	}
848	else if ( StringEqual( p, dtdHeader, false, encoding ) )
849	{
850		#ifdef DEBUG_PARSER
851			TIXML_LOG( "XML parsing Unknown(1)\n" );
852		#endif
853		returnNode = new TiXmlUnknown();
854	}
855	else if (    IsAlpha( *(p+1), encoding )
856			  || *(p+1) == '_' )
857	{
858		#ifdef DEBUG_PARSER
859			TIXML_LOG( "XML parsing Element\n" );
860		#endif
861		returnNode = new TiXmlElement( "" );
862	}
863	else
864	{
865		#ifdef DEBUG_PARSER
866			TIXML_LOG( "XML parsing Unknown(2)\n" );
867		#endif
868		returnNode = new TiXmlUnknown();
869	}
870
871	if ( returnNode )
872	{
873		// Set the parent, so it can report errors
874		returnNode->parent = this;
875	}
876	else
877	{
878		if ( doc )
879			doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
880	}
881	return returnNode;
882}
883
884#ifdef TIXML_USE_STL
885
886void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
887{
888	// We're called with some amount of pre-parsing. That is, some of "this"
889	// element is in "tag". Go ahead and stream to the closing ">"
890	while( in->good() )
891	{
892		int c = in->get();
893		if ( c <= 0 )
894		{
895			TiXmlDocument* document = GetDocument();
896			if ( document )
897				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
898			return;
899		}
900		(*tag) += (char) c ;
901
902		if ( c == '>' )
903			break;
904	}
905
906	if ( tag->length() < 3 ) return;
907
908	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
909	// If not, identify and stream.
910
911	if (    tag->at( tag->length() - 1 ) == '>'
912		 && tag->at( tag->length() - 2 ) == '/' )
913	{
914		// All good!
915		return;
916	}
917	else if ( tag->at( tag->length() - 1 ) == '>' )
918	{
919		// There is more. Could be:
920		//		text
921		//		closing tag
922		//		another node.
923		for ( ;; )
924		{
925			StreamWhiteSpace( in, tag );
926
927			// Do we have text?
928			if ( in->good() && in->peek() != '<' )
929			{
930				// Yep, text.
931				TiXmlText text( "" );
932				text.StreamIn( in, tag );
933
934				// What follows text is a closing tag or another node.
935				// Go around again and figure it out.
936				continue;
937			}
938
939			// We now have either a closing tag...or another node.
940			// We should be at a "<", regardless.
941			if ( !in->good() ) return;
942			assert( in->peek() == '<' );
943			int tagIndex = (int) tag->length();
944
945			bool closingTag = false;
946			bool firstCharFound = false;
947
948			for( ;; )
949			{
950				if ( !in->good() )
951					return;
952
953				int c = in->peek();
954				if ( c <= 0 )
955				{
956					TiXmlDocument* document = GetDocument();
957					if ( document )
958						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
959					return;
960				}
961
962				if ( c == '>' )
963					break;
964
965				*tag += (char) c;
966				in->get();
967
968				if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
969				{
970					firstCharFound = true;
971					if ( c == '/' )
972						closingTag = true;
973				}
974			}
975			// If it was a closing tag, then read in the closing '>' to clean up the input stream.
976			// If it was not, the streaming will be done by the tag.
977			if ( closingTag )
978			{
979				if ( !in->good() )
980					return;
981
982				int c = in->get();
983				if ( c <= 0 )
984				{
985					TiXmlDocument* document = GetDocument();
986					if ( document )
987						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
988					return;
989				}
990				assert( c == '>' );
991				*tag += (char) c;
992
993				// We are done, once we've found our closing tag.
994				return;
995			}
996			else
997			{
998				// If not a closing tag, id it, and stream.
999				const char* tagloc = tag->c_str() + tagIndex;
1000				TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
1001				if ( !node )
1002					return;
1003				node->StreamIn( in, tag );
1004				delete node;
1005				node = 0;
1006
1007				// No return: go around from the beginning: text, closing tag, or node.
1008			}
1009		}
1010	}
1011}
1012#endif
1013
1014const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1015{
1016	p = SkipWhiteSpace( p, encoding );
1017	TiXmlDocument* document = GetDocument();
1018
1019	if ( !p || !*p )
1020	{
1021		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1022		return 0;
1023	}
1024
1025	if ( data )
1026	{
1027		data->Stamp( p, encoding );
1028		location = data->Cursor();
1029	}
1030
1031	if ( *p != '<' )
1032	{
1033		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1034		return 0;
1035	}
1036
1037	p = SkipWhiteSpace( p+1, encoding );
1038
1039	// Read the name.
1040	const char* pErr = p;
1041
1042    p = ReadName( p, &value, encoding );
1043	if ( !p || !*p )
1044	{
1045		if ( document )	document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1046		return 0;
1047	}
1048
1049    TIXML_STRING endTag ("</");
1050	endTag += value;
1051	endTag += ">";
1052
1053	// Check for and read attributes. Also look for an empty
1054	// tag or an end tag.
1055	while ( p && *p )
1056	{
1057		pErr = p;
1058		p = SkipWhiteSpace( p, encoding );
1059		if ( !p || !*p )
1060		{
1061			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1062			return 0;
1063		}
1064		if ( *p == '/' )
1065		{
1066			++p;
1067			// Empty tag.
1068			if ( *p  != '>' )
1069			{
1070				if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1071				return 0;
1072			}
1073			return (p+1);
1074		}
1075		else if ( *p == '>' )
1076		{
1077			// Done with attributes (if there were any.)
1078			// Read the value -- which can include other
1079			// elements -- read the end tag, and return.
1080			++p;
1081			p = ReadValue( p, data, encoding );		// Note this is an Element method, and will set the error if one happens.
1082			if ( !p || !*p )
1083				return 0;
1084
1085			// We should find the end tag now
1086			if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1087			{
1088				p += endTag.length();
1089				return p;
1090			}
1091			else
1092			{
1093				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1094				return 0;
1095			}
1096		}
1097		else
1098		{
1099			// Try to read an attribute:
1100			TiXmlAttribute* attrib = new TiXmlAttribute();
1101			if ( !attrib )
1102			{
1103				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1104				return 0;
1105			}
1106
1107			attrib->SetDocument( document );
1108			const char* pErr = p;
1109			p = attrib->Parse( p, data, encoding );
1110
1111			if ( !p || !*p )
1112			{
1113				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1114				delete attrib;
1115				return 0;
1116			}
1117
1118			// Handle the strange case of double attributes:
1119			TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1120			if ( node )
1121			{
1122				node->SetValue( attrib->Value() );
1123				delete attrib;
1124				return 0;
1125			}
1126
1127			attributeSet.Add( attrib );
1128		}
1129	}
1130	return p;
1131}
1132
1133
1134const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1135{
1136	TiXmlDocument* document = GetDocument();
1137
1138	// Read in text and elements in any order.
1139	const char* pWithWhiteSpace = p;
1140	p = SkipWhiteSpace( p, encoding );
1141
1142	while ( p && *p )
1143	{
1144		if ( *p != '<' )
1145		{
1146			// Take what we have, make a text element.
1147			TiXmlText* textNode = new TiXmlText( "" );
1148
1149			if ( !textNode )
1150			{
1151				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1152				    return 0;
1153			}
1154
1155			if ( TiXmlBase::IsWhiteSpaceCondensed() )
1156			{
1157				p = textNode->Parse( p, data, encoding );
1158			}
1159			else
1160			{
1161				// Special case: we want to keep the white space
1162				// so that leading spaces aren't removed.
1163				p = textNode->Parse( pWithWhiteSpace, data, encoding );
1164			}
1165
1166			if ( !textNode->Blank() )
1167				LinkEndChild( textNode );
1168			else
1169				delete textNode;
1170		}
1171		else
1172		{
1173			// We hit a '<'
1174			// Have we hit a new element or an end tag? This could also be
1175			// a TiXmlText in the "CDATA" style.
1176			if ( StringEqual( p, "</", false, encoding ) )
1177			{
1178				return p;
1179			}
1180			else
1181			{
1182				TiXmlNode* node = Identify( p, encoding );
1183				if ( node )
1184				{
1185					p = node->Parse( p, data, encoding );
1186					LinkEndChild( node );
1187				}
1188				else
1189				{
1190					return 0;
1191				}
1192			}
1193		}
1194		pWithWhiteSpace = p;
1195		p = SkipWhiteSpace( p, encoding );
1196	}
1197
1198	if ( !p )
1199	{
1200		if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1201	}
1202	return p;
1203}
1204
1205
1206#ifdef TIXML_USE_STL
1207void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1208{
1209	while ( in->good() )
1210	{
1211		int c = in->get();
1212		if ( c <= 0 )
1213		{
1214			TiXmlDocument* document = GetDocument();
1215			if ( document )
1216				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1217			return;
1218		}
1219		(*tag) += (char) c;
1220
1221		if ( c == '>' )
1222		{
1223			// All is well.
1224			return;
1225		}
1226	}
1227}
1228#endif
1229
1230
1231const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1232{
1233	TiXmlDocument* document = GetDocument();
1234	p = SkipWhiteSpace( p, encoding );
1235
1236	if ( data )
1237	{
1238		data->Stamp( p, encoding );
1239		location = data->Cursor();
1240	}
1241	if ( !p || !*p || *p != '<' )
1242	{
1243		if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1244		return 0;
1245	}
1246	++p;
1247    value = "";
1248
1249	while ( p && *p && *p != '>' )
1250	{
1251		value += *p;
1252		++p;
1253	}
1254
1255	if ( !p )
1256	{
1257		if ( document )	document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1258	}
1259	if ( *p == '>' )
1260		return p+1;
1261	return p;
1262}
1263
1264#ifdef TIXML_USE_STL
1265void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1266{
1267	while ( in->good() )
1268	{
1269		int c = in->get();
1270		if ( c <= 0 )
1271		{
1272			TiXmlDocument* document = GetDocument();
1273			if ( document )
1274				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1275			return;
1276		}
1277
1278		(*tag) += (char) c;
1279
1280		if ( c == '>'
1281			 && tag->at( tag->length() - 2 ) == '-'
1282			 && tag->at( tag->length() - 3 ) == '-' )
1283		{
1284			// All is well.
1285			return;
1286		}
1287	}
1288}
1289#endif
1290
1291
1292const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1293{
1294	TiXmlDocument* document = GetDocument();
1295	value = "";
1296
1297	p = SkipWhiteSpace( p, encoding );
1298
1299	if ( data )
1300	{
1301		data->Stamp( p, encoding );
1302		location = data->Cursor();
1303	}
1304	const char* startTag = "<!--";
1305	const char* endTag   = "-->";
1306
1307	if ( !StringEqual( p, startTag, false, encoding ) )
1308	{
1309		document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1310		return 0;
1311	}
1312	p += strlen( startTag );
1313	p = ReadText( p, &value, false, endTag, false, encoding );
1314	return p;
1315}
1316
1317
1318const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1319{
1320	p = SkipWhiteSpace( p, encoding );
1321	if ( !p || !*p ) return 0;
1322
1323	int tabsize = 4;
1324	if ( document )
1325		tabsize = document->TabSize();
1326
1327	if ( data )
1328	{
1329		data->Stamp( p, encoding );
1330		location = data->Cursor();
1331	}
1332	// Read the name, the '=' and the value.
1333	const char* pErr = p;
1334	p = ReadName( p, &name, encoding );
1335	if ( !p || !*p )
1336	{
1337		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1338		return 0;
1339	}
1340	p = SkipWhiteSpace( p, encoding );
1341	if ( !p || !*p || *p != '=' )
1342	{
1343		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1344		return 0;
1345	}
1346
1347	++p;	// skip '='
1348	p = SkipWhiteSpace( p, encoding );
1349	if ( !p || !*p )
1350	{
1351		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1352		return 0;
1353	}
1354
1355	const char* end;
1356
1357	if ( *p == '\'' )
1358	{
1359		++p;
1360		end = "\'";
1361		p = ReadText( p, &value, false, end, false, encoding );
1362	}
1363	else if ( *p == '"' )
1364	{
1365		++p;
1366		end = "\"";
1367		p = ReadText( p, &value, false, end, false, encoding );
1368	}
1369	else
1370	{
1371		// All attribute values should be in single or double quotes.
1372		// But this is such a common error that the parser will try
1373		// its best, even without them.
1374		value = "";
1375		while (    p && *p										// existence
1376				&& !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r'	// whitespace
1377				&& *p != '/' && *p != '>' )						// tag end
1378		{
1379			value += *p;
1380			++p;
1381		}
1382	}
1383	return p;
1384}
1385
1386#ifdef TIXML_USE_STL
1387void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1388{
1389	if ( cdata )
1390	{
1391		int c = in->get();
1392		if ( c <= 0 )
1393		{
1394			TiXmlDocument* document = GetDocument();
1395			if ( document )
1396				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1397			return;
1398		}
1399
1400		(*tag) += (char) c;
1401
1402		if ( c == '>'
1403			 && tag->at( tag->length() - 2 ) == ']'
1404			 && tag->at( tag->length() - 3 ) == ']' )
1405		{
1406			// All is well.
1407			return;
1408		}
1409	}
1410	else
1411	{
1412		while ( in->good() )
1413		{
1414			int c = in->peek();
1415			if ( c == '<' )
1416				return;
1417			if ( c <= 0 )
1418			{
1419				TiXmlDocument* document = GetDocument();
1420				if ( document )
1421					document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1422				return;
1423			}
1424
1425			(*tag) += (char) c;
1426			in->get();
1427		}
1428	}
1429}
1430#endif
1431
1432const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1433{
1434	value = "";
1435	TiXmlDocument* document = GetDocument();
1436
1437	if ( data )
1438	{
1439		data->Stamp( p, encoding );
1440		location = data->Cursor();
1441	}
1442
1443	const char* const startTag = "<![CDATA[";
1444	const char* const endTag   = "]]>";
1445
1446	if ( cdata || StringEqual( p, startTag, false, encoding ) )
1447	{
1448		cdata = true;
1449
1450		if ( !StringEqual( p, startTag, false, encoding ) )
1451		{
1452			document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1453			return 0;
1454		}
1455		p += strlen( startTag );
1456
1457		// Keep all the white space, ignore the encoding, etc.
1458		while (	   p && *p
1459				&& !StringEqual( p, endTag, false, encoding )
1460			  )
1461		{
1462			value += *p;
1463			++p;
1464		}
1465
1466		TIXML_STRING dummy;
1467		p = ReadText( p, &dummy, false, endTag, false, encoding );
1468		return p;
1469	}
1470	else
1471	{
1472		bool ignoreWhite = true;
1473
1474		const char* end = "<";
1475		p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1476		if ( p )
1477			return p-1;	// don't truncate the '<'
1478		return 0;
1479	}
1480}
1481
1482#ifdef TIXML_USE_STL
1483void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1484{
1485	while ( in->good() )
1486	{
1487		int c = in->get();
1488		if ( c <= 0 )
1489		{
1490			TiXmlDocument* document = GetDocument();
1491			if ( document )
1492				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1493			return;
1494		}
1495		(*tag) += (char) c;
1496
1497		if ( c == '>' )
1498		{
1499			// All is well.
1500			return;
1501		}
1502	}
1503}
1504#endif
1505
1506const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1507{
1508	p = SkipWhiteSpace( p, _encoding );
1509	// Find the beginning, find the end, and look for
1510	// the stuff in-between.
1511	TiXmlDocument* document = GetDocument();
1512	if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1513	{
1514		if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1515		return 0;
1516	}
1517	if ( data )
1518	{
1519		data->Stamp( p, _encoding );
1520		location = data->Cursor();
1521	}
1522	p += 5;
1523
1524	version = "";
1525	encoding = "";
1526	standalone = "";
1527
1528	while ( p && *p )
1529	{
1530		if ( *p == '>' )
1531		{
1532			++p;
1533			return p;
1534		}
1535
1536		p = SkipWhiteSpace( p, _encoding );
1537		if ( StringEqual( p, "version", true, _encoding ) )
1538		{
1539			TiXmlAttribute attrib;
1540			p = attrib.Parse( p, data, _encoding );
1541			version = attrib.Value();
1542		}
1543		else if ( StringEqual( p, "encoding", true, _encoding ) )
1544		{
1545			TiXmlAttribute attrib;
1546			p = attrib.Parse( p, data, _encoding );
1547			encoding = attrib.Value();
1548		}
1549		else if ( StringEqual( p, "standalone", true, _encoding ) )
1550		{
1551			TiXmlAttribute attrib;
1552			p = attrib.Parse( p, data, _encoding );
1553			standalone = attrib.Value();
1554		}
1555		else
1556		{
1557			// Read over whatever it is.
1558			while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1559				++p;
1560		}
1561	}
1562	return 0;
1563}
1564
1565bool TiXmlText::Blank() const
1566{
1567	for ( unsigned i=0; i<value.length(); i++ )
1568		if ( !IsWhiteSpace( value[i] ) )
1569			return false;
1570	return true;
1571}
1572
1573