1 /*-------------------------------------------------------------------------
2  * drawElements Quality Program Test Executor
3  * ------------------------------------------
4  *
5  * Copyright 2014 The Android Open Source Project
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *      http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  *//*!
20  * \file
21  * \brief XML Parser.
22  *//*--------------------------------------------------------------------*/
23 
24 #include "xeXMLParser.hpp"
25 #include "deInt32.h"
26 
27 namespace xe
28 {
29 namespace xml
30 {
31 
// File-local constants for the tokenizer.
enum
{
	TOKENIZER_INITIAL_BUFFER_SIZE	= 1024	//!< Size passed to Tokenizer::m_buf at construction; Tokenizer::feed() resizes the buffer when it lacks free space.
};
36 
isIdentifierStartChar(int ch)37 static inline bool isIdentifierStartChar (int ch)
38 {
39 	return de::inRange<int>(ch, 'a', 'z') || de::inRange<int>(ch, 'A', 'Z');
40 }
41 
isIdentifierChar(int ch)42 static inline bool isIdentifierChar (int ch)
43 {
44 	return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_');
45 }
46 
//! Tells whether ch is one of the whitespace characters skipped by the tokenizer: space, tab, CR or LF.
static inline bool isWhitespaceChar (int ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			return true;

		default:
			return false;
	}
}
51 
getNextBufferSize(int curSize,int minNewSize)52 static int getNextBufferSize (int curSize, int minNewSize)
53 {
54 	return de::max(curSize*2, 1<<deLog2Ceil32(minNewSize));
55 }
56 
Tokenizer(void)57 Tokenizer::Tokenizer (void)
58 	: m_curToken	(TOKEN_INCOMPLETE)
59 	, m_curTokenLen	(0)
60 	, m_state		(STATE_DATA)
61 	, m_buf			(TOKENIZER_INITIAL_BUFFER_SIZE)
62 {
63 }
64 
~Tokenizer(void)65 Tokenizer::~Tokenizer (void)
66 {
67 }
68 
clear(void)69 void Tokenizer::clear (void)
70 {
71 	m_curToken		= TOKEN_INCOMPLETE;
72 	m_curTokenLen	= 0;
73 	m_state			= STATE_DATA;
74 	m_buf.clear();
75 }
76 
error(const std::string & what)77 void Tokenizer::error (const std::string& what)
78 {
79 	throw ParseError(what);
80 }
81 
feed(const deUint8 * bytes,int numBytes)82 void Tokenizer::feed (const deUint8* bytes, int numBytes)
83 {
84 	// Grow buffer if necessary.
85 	if (m_buf.getNumFree() < numBytes)
86 	{
87 		m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements()+numBytes));
88 	}
89 
90 	// Append to front.
91 	m_buf.pushFront(bytes, numBytes);
92 
93 	// If we haven't parsed complete token, re-try after data feed.
94 	if (m_curToken == TOKEN_INCOMPLETE)
95 		advance();
96 }
97 
getChar(int offset) const98 int Tokenizer::getChar (int offset) const
99 {
100 	DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements()));
101 
102 	if (offset < m_buf.getNumElements())
103 		return m_buf.peekBack(offset);
104 	else
105 		return END_OF_BUFFER;
106 }
107 
/*--------------------------------------------------------------------*//*!
 * \brief Scan the next token from the buffered input.
 *
 * When a complete token is current, it is consumed first: its
 * m_curTokenLen characters are popped from the buffer and, after tokens
 * that end a tag, processing instruction, comment or entity, the state
 * returns to STATE_DATA. When the current token is TOKEN_INCOMPLETE
 * (the call made by feed()), scanning resumes at offset m_curTokenLen.
 *
 * On return m_curToken / m_curTokenLen describe the token located at the
 * back of the buffer, or m_curToken is TOKEN_INCOMPLETE if the buffered
 * input ended before a token could be completed.
 *
 * Malformed input is reported through error(), which throws ParseError.
 *//*--------------------------------------------------------------------*/
void Tokenizer::advance (void)
{
	if (m_curToken != TOKEN_INCOMPLETE)
	{
		// Parser should not try to advance beyond end of string.
		DE_ASSERT(m_curToken != TOKEN_END_OF_STRING);

		// If current token is tag end, change state to data.
		if (m_curToken == TOKEN_TAG_END						||
			m_curToken == TOKEN_EMPTY_ELEMENT_END			||
			m_curToken == TOKEN_PROCESSING_INSTRUCTION_END	||
			m_curToken == TOKEN_COMMENT						||
			m_curToken == TOKEN_ENTITY)
			m_state = STATE_DATA;

		// Advance buffer by length of last token.
		m_buf.popBack(m_curTokenLen);

		// Reset state.
		m_curToken		= TOKEN_INCOMPLETE;
		m_curTokenLen	= 0;

		// If we hit end of string here, report it as end of string.
		if (getChar(0) == END_OF_STRING)
		{
			m_curToken		= TOKEN_END_OF_STRING;
			m_curTokenLen	= 1;
			return;
		}
	}

	// Resume at the first character that has not been examined yet: offset 0 for
	// a fresh token, or the previously reached length of an incomplete token.
	int curChar = getChar(m_curTokenLen);

	// Each iteration examines curChar; falling out of the if/else below accepts
	// it into the token (see the bottom of the loop).
	for (;;)
	{
		if (m_state == STATE_DATA)
		{
			// Advance until we hit end of buffer or tag start and treat that as data token.
			if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&')
			{
				// The state is switched before any pending data is reported, so the
				// next advance() call continues directly in tag / entity state.
				if (curChar == '<')
					m_state = STATE_TAG;
				else if (curChar == '&')
					m_state = STATE_ENTITY;

				if (m_curTokenLen > 0)
				{
					// Report data token.
					// \note This also happens at END_OF_BUFFER, so a run of character
					//		 data may be reported as several TOKEN_DATA tokens.
					m_curToken = TOKEN_DATA;
					return;
				}
				else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER)
				{
					// Just return incomplete token, no data parsed.
					// \note END_OF_STRING is not turned into TOKEN_END_OF_STRING on this
					//		 path; that token is only produced by the check made right
					//		 after the previous token has been popped.
					return;
				}
				else
				{
					DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY);
					// Re-examine the same character in the new state (token length stays 0).
					continue;
				}
			}
		}
		else
		{
			// Eat all whitespace if present.
			// Only done before the first character of a token; the whitespace is
			// removed from the buffer rather than counted into the token.
			if (m_curTokenLen == 0)
			{
				while (isWhitespaceChar(curChar))
				{
					m_buf.popBack();
					curChar = getChar(0);
				}
			}

			// Handle end of string / buffer.
			if (curChar == END_OF_STRING)
				error("Unexpected end of string");
			else if (curChar == (int)END_OF_BUFFER)
			{
				// Out of data in the middle of markup: keep m_curTokenLen so that
				// scanning resumes here after the next feed().
				DE_ASSERT(m_curToken == TOKEN_INCOMPLETE);
				return;
			}

			if (m_curTokenLen == 0)
			{
				// Expect start of identifier, value or special tag token.
				// The first character selects the sub-state used for the rest of the token.
				if (curChar == '\'' || curChar == '"')
					m_state = STATE_VALUE;
				else if (isIdentifierStartChar(curChar))
					m_state = STATE_IDENTIFIER;
				else if (curChar == '<' || curChar == '?' || curChar == '/')
					m_state = STATE_TAG; // Second character is handled by the special token branch below.
				else if (curChar == '&')
					DE_ASSERT(m_state == STATE_ENTITY);
				else if (curChar == '=')
				{
					m_curToken		= TOKEN_EQUAL;
					m_curTokenLen	= 1;
					return;
				}
				else if (curChar == '>')
				{
					m_curToken		= TOKEN_TAG_END;
					m_curTokenLen	= 1;
					return;
				}
				else
					error("Unexpected character");
			}
			else if (m_state == STATE_IDENTIFIER)
			{
				// First non-identifier character ends the identifier; it is not
				// included in the token (m_curTokenLen is not incremented).
				if (!isIdentifierChar(curChar))
				{
					m_curToken = TOKEN_IDENTIFIER;
					return;
				}
			}
			else if (m_state == STATE_VALUE)
			{
				// \todo [2012-06-07 pyry] Escapes.
				if (curChar == '\'' || curChar == '"')
				{
					// getChar(0) is the opening quote. Any quote character ends the
					// value, so the other quote type inside a value is rejected.
					// \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)?
					if (curChar != getChar(0))
						error("Mismatched quote");
					// Token includes both the opening and the closing quote.
					m_curToken		 = TOKEN_STRING;
					m_curTokenLen	+= 1;
					return;
				}
			}
			else if (m_state == STATE_COMMENT)
			{
				DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state.

				// Offsets 0-1 hold "<!"; the characters at offsets 2 and 3 must
				// complete the "<!--" opener.
				if (m_curTokenLen <= 3)
				{
					if (curChar != '-')
						error("Invalid comment start");
				}
				else
				{
					// Previous two characters, read only from offset 4 onwards so that
					// the dashes of the opener are never taken as part of the terminator.
					int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen-2) : 0;
					int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen-1) : 0;

					// "--" inside a comment must be followed by '>'.
					if (prev2 == '-' && prev1 == '-')
					{
						if (curChar != '>')
							error("Invalid comment end");
						m_curToken		 = TOKEN_COMMENT;
						m_curTokenLen	+= 1;
						return;
					}
				}
			}
			else if (m_state == STATE_ENTITY)
			{
				// Offset 0 is the '&'; it is followed by letters / digits and a
				// terminating ';' which is included in the token.
				if (m_curTokenLen >= 1)
				{
					if (curChar == ';')
					{
						m_curToken		 = TOKEN_ENTITY;
						m_curTokenLen	+= 1;
						return;
					}
					else if (!de::inRange<int>(curChar, '0', '9')	&&
							 !de::inRange<int>(curChar, 'a', 'z')	&&
							 !de::inRange<int>(curChar, 'A', 'Z'))
						error("Invalid entity");
				}
			}
			else
			{
				// Special tokens are at most 2 characters.
				DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1);

				// First character of the special token; curChar is the second one.
				int prevChar = getChar(m_curTokenLen-1);

				if (prevChar == '<')
				{
					// Tag start.
					if (curChar == '/')
					{
						m_curToken		= TOKEN_END_TAG_START;
						m_curTokenLen	= 2;
						return;
					}
					else if (curChar == '?')
					{
						m_curToken		= TOKEN_PROCESSING_INSTRUCTION_START;
						m_curTokenLen	= 2;
						return;
					}
					else if (curChar == '!')
					{
						// "<!" is only accepted as the beginning of a comment.
						m_state = STATE_COMMENT;
					}
					else
					{
						// Plain '<': curChar is left in the buffer for the next token.
						m_curToken		= TOKEN_TAG_START;
						m_curTokenLen	= 1;
						return;
					}
				}
				else if (prevChar == '?')
				{
					if (curChar != '>')
						error("Invalid processing instruction end");
					m_curToken		= TOKEN_PROCESSING_INSTRUCTION_END;
					m_curTokenLen	= 2;
					return;
				}
				else if (prevChar == '/')
				{
					if (curChar != '>')
						error("Invalid empty element end");
					m_curToken		= TOKEN_EMPTY_ELEMENT_END;
					m_curTokenLen	= 2;
					return;
				}
				else
					error("Could not parse special token");
			}
		}

		// curChar belongs to the token being scanned; move on to the next character.
		m_curTokenLen	+= 1;
		curChar			 = getChar(m_curTokenLen);
	}
}
337 
getString(std::string & dst) const338 void Tokenizer::getString (std::string& dst) const
339 {
340 	DE_ASSERT(m_curToken == TOKEN_STRING);
341 	dst.resize(m_curTokenLen-2);
342 	for (int ndx = 0; ndx < m_curTokenLen-2; ndx++)
343 		dst[ndx] = m_buf.peekBack(ndx+1);
344 }
345 
Parser(void)346 Parser::Parser (void)
347 	: m_element		(ELEMENT_INCOMPLETE)
348 	, m_state		(STATE_DATA)
349 {
350 }
351 
~Parser(void)352 Parser::~Parser (void)
353 {
354 }
355 
clear(void)356 void Parser::clear (void)
357 {
358 	m_tokenizer.clear();
359 	m_elementName.clear();
360 	m_attributes.clear();
361 	m_attribName.clear();
362 	m_entityValue.clear();
363 
364 	m_element	= ELEMENT_INCOMPLETE;
365 	m_state		= STATE_DATA;
366 }
367 
error(const std::string & what)368 void Parser::error (const std::string& what)
369 {
370 	throw ParseError(what);
371 }
372 
feed(const deUint8 * bytes,int numBytes)373 void Parser::feed (const deUint8* bytes, int numBytes)
374 {
375 	m_tokenizer.feed(bytes, numBytes);
376 
377 	if (m_element == ELEMENT_INCOMPLETE)
378 		advance();
379 }
380 
/*--------------------------------------------------------------------*//*!
 * \brief Advance to the next element.
 *
 * Consumes the token that produced the current element (if any) and
 * processes tokens until a new element can be reported in m_element.
 * If the tokenizer has no complete token, m_element is left as
 * ELEMENT_INCOMPLETE and parsing continues on the next feed().
 *
 * Comment tokens are skipped. An empty element is reported as
 * ELEMENT_START followed by ELEMENT_END.
 *
 * Unexpected tokens are reported through error(), which throws
 * ParseError.
 *//*--------------------------------------------------------------------*/
void Parser::advance (void)
{
	// Attributes belong to the start element that is now being left behind.
	if (m_element == ELEMENT_START)
		m_attributes.clear();

	// \note No token is advanced when element end is reported.
	if (m_state == STATE_YIELD_EMPTY_ELEMENT_END)
	{
		DE_ASSERT(m_element == ELEMENT_START);
		m_element	= ELEMENT_END;
		m_state		= STATE_DATA;
		return;
	}

	// Consume the token that produced the previously reported element.
	if (m_element != ELEMENT_INCOMPLETE)
	{
		m_tokenizer.advance();
		m_element = ELEMENT_INCOMPLETE;
	}

	// Process tokens until one of them completes an element.
	for (;;)
	{
		Token curToken = m_tokenizer.getToken();

		// Skip comments.
		while (curToken == TOKEN_COMMENT)
		{
			m_tokenizer.advance();
			curToken = m_tokenizer.getToken();
		}

		// Tokenizer needs more input; parser state is kept for the next feed().
		if (curToken == TOKEN_INCOMPLETE)
		{
			DE_ASSERT(m_element == ELEMENT_INCOMPLETE);
			return;
		}

		switch (m_state)
		{
			case STATE_ENTITY:
				// Previous element was an entity reported as data; continue as normal data.
				m_state = STATE_DATA;
				// Fall-through

			case STATE_DATA:
				switch (curToken)
				{
					case TOKEN_DATA:
						m_element = ELEMENT_DATA;
						return;

					case TOKEN_END_OF_STRING:
						m_element = ELEMENT_END_OF_STRING;
						return;

					case TOKEN_TAG_START:
						m_state = STATE_START_TAG_OPEN;
						break;

					case TOKEN_END_TAG_START:
						m_state = STATE_END_TAG_OPEN;
						break;

					case TOKEN_PROCESSING_INSTRUCTION_START:
						m_state = STATE_IN_PROCESSING_INSTRUCTION;
						break;

					case TOKEN_ENTITY:
						// Entities are reported as data elements; parseEntityValue()
						// stores the decoded character in m_entityValue.
						m_state		= STATE_ENTITY;
						m_element	= ELEMENT_DATA;
						parseEntityValue();
						return;

					default:
						error("Unexpected token");
				}
				break;

			case STATE_IN_PROCESSING_INSTRUCTION:
				// Contents of a processing instruction are checked for token type
				// only; nothing is stored and no element is reported for them.
				if (curToken == TOKEN_PROCESSING_INSTRUCTION_END)
					m_state = STATE_DATA;
				else
					if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING)
						error("Unexpected token in processing instruction");
				break;

			case STATE_START_TAG_OPEN:
				// '<' must be followed by the element name.
				if (curToken != TOKEN_IDENTIFIER)
					error("Expected identifier");
				m_tokenizer.getTokenStr(m_elementName);
				m_state = STATE_ATTRIBUTE_LIST;
				break;

			case STATE_END_TAG_OPEN:
				// "</" must be followed by the element name.
				// \note The name is only stored here; this function does not compare
				//		 it against the name of a previously started element.
				if (curToken != TOKEN_IDENTIFIER)
					error("Expected identifier");
				m_tokenizer.getTokenStr(m_elementName);
				m_state = STATE_EXPECTING_END_TAG_CLOSE;
				break;

			case STATE_EXPECTING_END_TAG_CLOSE:
				if (curToken != TOKEN_TAG_END)
					error("Expected tag end");
				m_state		= STATE_DATA;
				m_element	= ELEMENT_END;
				return;

			case STATE_ATTRIBUTE_LIST:
				if (curToken == TOKEN_IDENTIFIER)
				{
					// Attribute name; '=' and the value are handled by the next two states.
					m_tokenizer.getTokenStr(m_attribName);
					m_state = STATE_EXPECTING_ATTRIBUTE_EQ;
				}
				else if (curToken == TOKEN_EMPTY_ELEMENT_END)
				{
					// "/>": report start now and the matching end on the next advance().
					m_state		= STATE_YIELD_EMPTY_ELEMENT_END;
					m_element	= ELEMENT_START;
					return;
				}
				else if (curToken == TOKEN_TAG_END)
				{
					m_state		= STATE_DATA;
					m_element	= ELEMENT_START;
					return;
				}
				else
					error("Unexpected token");
				break;

			case STATE_EXPECTING_ATTRIBUTE_EQ:
				if (curToken != TOKEN_EQUAL)
					error("Expected '='");
				m_state = STATE_EXPECTING_ATTRIBUTE_VALUE;
				break;

			case STATE_EXPECTING_ATTRIBUTE_VALUE:
				if (curToken != TOKEN_STRING)
					error("Expected value");
				if (hasAttribute(m_attribName.c_str()))
					error("Duplicate attribute");

				// operator[] creates the entry; getString() fills it with the unquoted value.
				m_tokenizer.getString(m_attributes[m_attribName]);
				m_state = STATE_ATTRIBUTE_LIST;
				break;

			default:
				DE_ASSERT(false);
		}

		// Token did not complete an element: consume it and look at the next one.
		m_tokenizer.advance();
	}
}
532 
getEntityValue(const std::string & entity)533 static char getEntityValue (const std::string& entity)
534 {
535 	static const struct
536 	{
537 		const char*		name;
538 		char			value;
539 	} s_entities[] =
540 	{
541 			{ "&lt;",			'<' },
542 			{ "&gt;",			'>' },
543 			{ "&amp;",			'&' },
544 			{ "&apos;",			'\''},
545 			{ "&quot;",			'"' },
546 	};
547 
548 	for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(s_entities); ndx++)
549 	{
550 		if (entity == s_entities[ndx].name)
551 			return s_entities[ndx].value;
552 	}
553 
554 	return 0;
555 }
556 
parseEntityValue(void)557 void Parser::parseEntityValue (void)
558 {
559 	DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY);
560 
561 	std::string entity;
562 	m_tokenizer.getTokenStr(entity);
563 
564 	const char value = getEntityValue(entity);
565 	if (value == 0)
566 		error("Invalid entity '" + entity + "'");
567 
568 	m_entityValue.resize(1);
569 	m_entityValue[0] = value;
570 }
571 
572 } // xml
573 } // xe
574