1/*-------------------------------------------------------------------------
2 * drawElements Quality Program Test Executor
3 * ------------------------------------------
4 *
5 * Copyright 2014 The Android Open Source Project
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 *      http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 *//*!
20 * \file
21 * \brief XML Parser.
22 *//*--------------------------------------------------------------------*/
23
24#include "xeXMLParser.hpp"
25#include "deInt32.h"
26
27namespace xe
28{
29namespace xml
30{
31
//! File-local tokenizer constants.
enum
{
	//! Size handed to the token buffer when a Tokenizer is constructed (= 1024).
	TOKENIZER_INITIAL_BUFFER_SIZE = 1<<10
};
36
37static inline bool isIdentifierStartChar (int ch)
38{
39	return de::inRange<int>(ch, 'a', 'z') || de::inRange<int>(ch, 'A', 'Z');
40}
41
42static inline bool isIdentifierChar (int ch)
43{
44	return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_');
45}
46
//! Returns true for the characters treated as whitespace: space, tab, carriage return and line feed.
static inline bool isWhitespaceChar (int ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			return true;

		default:
			return false;
	}
}
51
52static int getNextBufferSize (int curSize, int minNewSize)
53{
54	return de::max(curSize*2, 1<<deLog2Ceil32(minNewSize));
55}
56
57Tokenizer::Tokenizer (void)
58	: m_curToken	(TOKEN_INCOMPLETE)
59	, m_curTokenLen	(0)
60	, m_state		(STATE_DATA)
61	, m_buf			(TOKENIZER_INITIAL_BUFFER_SIZE)
62{
63}
64
65Tokenizer::~Tokenizer (void)
66{
67}
68
69void Tokenizer::clear (void)
70{
71	m_curToken		= TOKEN_INCOMPLETE;
72	m_curTokenLen	= 0;
73	m_state			= STATE_DATA;
74	m_buf.clear();
75}
76
77void Tokenizer::error (const std::string& what)
78{
79	throw ParseError(what);
80}
81
82void Tokenizer::feed (const deUint8* bytes, int numBytes)
83{
84	// Grow buffer if necessary.
85	if (m_buf.getNumFree() < numBytes)
86	{
87		m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements()+numBytes));
88	}
89
90	// Append to front.
91	m_buf.pushFront(bytes, numBytes);
92
93	// If we haven't parsed complete token, re-try after data feed.
94	if (m_curToken == TOKEN_INCOMPLETE)
95		advance();
96}
97
98int Tokenizer::getChar (int offset) const
99{
100	DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements()));
101
102	if (offset < m_buf.getNumElements())
103		return m_buf.peekBack(offset);
104	else
105		return END_OF_BUFFER;
106}
107
108void Tokenizer::advance (void)
109{
110	if (m_curToken != TOKEN_INCOMPLETE)
111	{
112		// Parser should not try to advance beyond end of string.
113		DE_ASSERT(m_curToken != TOKEN_END_OF_STRING);
114
115		// If current token is tag end, change state to data.
116		if (m_curToken == TOKEN_TAG_END						||
117			m_curToken == TOKEN_EMPTY_ELEMENT_END			||
118			m_curToken == TOKEN_PROCESSING_INSTRUCTION_END	||
119			m_curToken == TOKEN_COMMENT						||
120			m_curToken == TOKEN_ENTITY)
121			m_state = STATE_DATA;
122
123		// Advance buffer by length of last token.
124		m_buf.popBack(m_curTokenLen);
125
126		// Reset state.
127		m_curToken		= TOKEN_INCOMPLETE;
128		m_curTokenLen	= 0;
129
130		// If we hit end of string here, report it as end of string.
131		if (getChar(0) == END_OF_STRING)
132		{
133			m_curToken		= TOKEN_END_OF_STRING;
134			m_curTokenLen	= 1;
135			return;
136		}
137	}
138
139	int curChar = getChar(m_curTokenLen);
140
141	for (;;)
142	{
143		if (m_state == STATE_DATA)
144		{
145			// Advance until we hit end of buffer or tag start and treat that as data token.
146			if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&')
147			{
148				if (curChar == '<')
149					m_state = STATE_TAG;
150				else if (curChar == '&')
151					m_state = STATE_ENTITY;
152
153				if (m_curTokenLen > 0)
154				{
155					// Report data token.
156					m_curToken = TOKEN_DATA;
157					return;
158				}
159				else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER)
160				{
161					// Just return incomplete token, no data parsed.
162					return;
163				}
164				else
165				{
166					DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY);
167					continue;
168				}
169			}
170		}
171		else
172		{
173			// Eat all whitespace if present.
174			if (m_curTokenLen == 0)
175			{
176				while (isWhitespaceChar(curChar))
177				{
178					m_buf.popBack();
179					curChar = getChar(0);
180				}
181			}
182
183			// Handle end of string / buffer.
184			if (curChar == END_OF_STRING)
185				error("Unexpected end of string");
186			else if (curChar == (int)END_OF_BUFFER)
187			{
188				DE_ASSERT(m_curToken == TOKEN_INCOMPLETE);
189				return;
190			}
191
192			if (m_curTokenLen == 0)
193			{
194				// Expect start of identifier, value or special tag token.
195				if (curChar == '\'' || curChar == '"')
196					m_state = STATE_VALUE;
197				else if (isIdentifierStartChar(curChar))
198					m_state = STATE_IDENTIFIER;
199				else if (curChar == '<' || curChar == '?' || curChar == '/')
200					m_state = STATE_TAG;
201				else if (curChar == '&')
202					DE_ASSERT(m_state == STATE_ENTITY);
203				else if (curChar == '=')
204				{
205					m_curToken		= TOKEN_EQUAL;
206					m_curTokenLen	= 1;
207					return;
208				}
209				else if (curChar == '>')
210				{
211					m_curToken		= TOKEN_TAG_END;
212					m_curTokenLen	= 1;
213					return;
214				}
215				else
216					error("Unexpected character");
217			}
218			else if (m_state == STATE_IDENTIFIER)
219			{
220				if (!isIdentifierChar(curChar))
221				{
222					m_curToken = TOKEN_IDENTIFIER;
223					return;
224				}
225			}
226			else if (m_state == STATE_VALUE)
227			{
228				// \todo [2012-06-07 pyry] Escapes.
229				if (curChar == '\'' || curChar == '"')
230				{
231					// \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)?
232					if (curChar != getChar(0))
233						error("Mismatched quote");
234					m_curToken		 = TOKEN_STRING;
235					m_curTokenLen	+= 1;
236					return;
237				}
238			}
239			else if (m_state == STATE_COMMENT)
240			{
241				DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state.
242
243				if (m_curTokenLen <= 3)
244				{
245					if (curChar != '-')
246						error("Invalid comment start");
247				}
248				else
249				{
250					int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen-2) : 0;
251					int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen-1) : 0;
252
253					if (prev2 == '-' && prev1 == '-')
254					{
255						if (curChar != '>')
256							error("Invalid comment end");
257						m_curToken		 = TOKEN_COMMENT;
258						m_curTokenLen	+= 1;
259						return;
260					}
261				}
262			}
263			else if (m_state == STATE_ENTITY)
264			{
265				if (m_curTokenLen >= 1)
266				{
267					if (curChar == ';')
268					{
269						m_curToken		 = TOKEN_ENTITY;
270						m_curTokenLen	+= 1;
271						return;
272					}
273					else if (!de::inRange<int>(curChar, '0', '9')	&&
274							 !de::inRange<int>(curChar, 'a', 'z')	&&
275							 !de::inRange<int>(curChar, 'A', 'Z'))
276						error("Invalid entity");
277				}
278			}
279			else
280			{
281				// Special tokens are at most 2 characters.
282				DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1);
283
284				int prevChar = getChar(m_curTokenLen-1);
285
286				if (prevChar == '<')
287				{
288					// Tag start.
289					if (curChar == '/')
290					{
291						m_curToken		= TOKEN_END_TAG_START;
292						m_curTokenLen	= 2;
293						return;
294					}
295					else if (curChar == '?')
296					{
297						m_curToken		= TOKEN_PROCESSING_INSTRUCTION_START;
298						m_curTokenLen	= 2;
299						return;
300					}
301					else if (curChar == '!')
302					{
303						m_state = STATE_COMMENT;
304					}
305					else
306					{
307						m_curToken		= TOKEN_TAG_START;
308						m_curTokenLen	= 1;
309						return;
310					}
311				}
312				else if (prevChar == '?')
313				{
314					if (curChar != '>')
315						error("Invalid processing instruction end");
316					m_curToken		= TOKEN_PROCESSING_INSTRUCTION_END;
317					m_curTokenLen	= 2;
318					return;
319				}
320				else if (prevChar == '/')
321				{
322					if (curChar != '>')
323						error("Invalid empty element end");
324					m_curToken		= TOKEN_EMPTY_ELEMENT_END;
325					m_curTokenLen	= 2;
326					return;
327				}
328				else
329					error("Could not parse special token");
330			}
331		}
332
333		m_curTokenLen	+= 1;
334		curChar			 = getChar(m_curTokenLen);
335	}
336}
337
338void Tokenizer::getString (std::string& dst) const
339{
340	DE_ASSERT(m_curToken == TOKEN_STRING);
341	dst.resize(m_curTokenLen-2);
342	for (int ndx = 0; ndx < m_curTokenLen-2; ndx++)
343		dst[ndx] = m_buf.peekBack(ndx+1);
344}
345
346Parser::Parser (void)
347	: m_element		(ELEMENT_INCOMPLETE)
348	, m_state		(STATE_DATA)
349{
350}
351
352Parser::~Parser (void)
353{
354}
355
356void Parser::clear (void)
357{
358	m_tokenizer.clear();
359	m_elementName.clear();
360	m_attributes.clear();
361	m_attribName.clear();
362	m_entityValue.clear();
363
364	m_element	= ELEMENT_INCOMPLETE;
365	m_state		= STATE_DATA;
366}
367
368void Parser::error (const std::string& what)
369{
370	throw ParseError(what);
371}
372
373void Parser::feed (const deUint8* bytes, int numBytes)
374{
375	m_tokenizer.feed(bytes, numBytes);
376
377	if (m_element == ELEMENT_INCOMPLETE)
378		advance();
379}
380
381void Parser::advance (void)
382{
383	if (m_element == ELEMENT_START)
384		m_attributes.clear();
385
386	// \note No token is advanced when element end is reported.
387	if (m_state == STATE_YIELD_EMPTY_ELEMENT_END)
388	{
389		DE_ASSERT(m_element == ELEMENT_START);
390		m_element	= ELEMENT_END;
391		m_state		= STATE_DATA;
392		return;
393	}
394
395	if (m_element != ELEMENT_INCOMPLETE)
396	{
397		m_tokenizer.advance();
398		m_element = ELEMENT_INCOMPLETE;
399	}
400
401	for (;;)
402	{
403		Token curToken = m_tokenizer.getToken();
404
405		// Skip comments.
406		while (curToken == TOKEN_COMMENT)
407		{
408			m_tokenizer.advance();
409			curToken = m_tokenizer.getToken();
410		}
411
412		if (curToken == TOKEN_INCOMPLETE)
413		{
414			DE_ASSERT(m_element == ELEMENT_INCOMPLETE);
415			return;
416		}
417
418		switch (m_state)
419		{
420			case STATE_ENTITY:
421				m_state = STATE_DATA;
422				// Fall-through to STATE_DATA processing.
423
424			case STATE_DATA:
425				switch (curToken)
426				{
427					case TOKEN_DATA:
428						m_element = ELEMENT_DATA;
429						return;
430
431					case TOKEN_END_OF_STRING:
432						m_element = ELEMENT_END_OF_STRING;
433						return;
434
435					case TOKEN_TAG_START:
436						m_state = STATE_START_TAG_OPEN;
437						break;
438
439					case TOKEN_END_TAG_START:
440						m_state = STATE_END_TAG_OPEN;
441						break;
442
443					case TOKEN_PROCESSING_INSTRUCTION_START:
444						m_state = STATE_IN_PROCESSING_INSTRUCTION;
445						break;
446
447					case TOKEN_ENTITY:
448						m_state		= STATE_ENTITY;
449						m_element	= ELEMENT_DATA;
450						parseEntityValue();
451						return;
452
453					default:
454						error("Unexpected token");
455				}
456				break;
457
458			case STATE_IN_PROCESSING_INSTRUCTION:
459				if (curToken == TOKEN_PROCESSING_INSTRUCTION_END)
460					m_state = STATE_DATA;
461				else
462					if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING)
463						error("Unexpected token in processing instruction");
464				break;
465
466			case STATE_START_TAG_OPEN:
467				if (curToken != TOKEN_IDENTIFIER)
468					error("Expected identifier");
469				m_tokenizer.getTokenStr(m_elementName);
470				m_state = STATE_ATTRIBUTE_LIST;
471				break;
472
473			case STATE_END_TAG_OPEN:
474				if (curToken != TOKEN_IDENTIFIER)
475					error("Expected identifier");
476				m_tokenizer.getTokenStr(m_elementName);
477				m_state = STATE_EXPECTING_END_TAG_CLOSE;
478				break;
479
480			case STATE_EXPECTING_END_TAG_CLOSE:
481				if (curToken != TOKEN_TAG_END)
482					error("Expected tag end");
483				m_state		= STATE_DATA;
484				m_element	= ELEMENT_END;
485				return;
486
487			case STATE_ATTRIBUTE_LIST:
488				if (curToken == TOKEN_IDENTIFIER)
489				{
490					m_tokenizer.getTokenStr(m_attribName);
491					m_state = STATE_EXPECTING_ATTRIBUTE_EQ;
492				}
493				else if (curToken == TOKEN_EMPTY_ELEMENT_END)
494				{
495					m_state		= STATE_YIELD_EMPTY_ELEMENT_END;
496					m_element	= ELEMENT_START;
497					return;
498				}
499				else if (curToken == TOKEN_TAG_END)
500				{
501					m_state		= STATE_DATA;
502					m_element	= ELEMENT_START;
503					return;
504				}
505				else
506					error("Unexpected token");
507				break;
508
509			case STATE_EXPECTING_ATTRIBUTE_EQ:
510				if (curToken != TOKEN_EQUAL)
511					error("Expected '='");
512				m_state = STATE_EXPECTING_ATTRIBUTE_VALUE;
513				break;
514
515			case STATE_EXPECTING_ATTRIBUTE_VALUE:
516				if (curToken != TOKEN_STRING)
517					error("Expected value");
518				if (hasAttribute(m_attribName.c_str()))
519					error("Duplicate attribute");
520
521				m_tokenizer.getString(m_attributes[m_attribName]);
522				m_state = STATE_ATTRIBUTE_LIST;
523				break;
524
525			default:
526				DE_ASSERT(false);
527		}
528
529		m_tokenizer.advance();
530	}
531}
532
533static char getEntityValue (const std::string& entity)
534{
535	static const struct
536	{
537		const char*		name;
538		char			value;
539	} s_entities[] =
540	{
541			{ "&lt;",			'<' },
542			{ "&gt;",			'>' },
543			{ "&amp;",			'&' },
544			{ "&apos;",			'\''},
545			{ "&quot;",			'"' },
546	};
547
548	for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(s_entities); ndx++)
549	{
550		if (entity == s_entities[ndx].name)
551			return s_entities[ndx].value;
552	}
553
554	return 0;
555}
556
557void Parser::parseEntityValue (void)
558{
559	DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY);
560
561	std::string entity;
562	m_tokenizer.getTokenStr(entity);
563
564	const char value = getEntityValue(entity);
565	if (value == 0)
566		error("Invalid entity '" + entity + "'");
567
568	m_entityValue.resize(1);
569	m_entityValue[0] = value;
570}
571
572} // xml
573} // xe
574