1 /*-------------------------------------------------------------------------
2 * drawElements Quality Program Test Executor
3 * ------------------------------------------
4 *
5 * Copyright 2014 The Android Open Source Project
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 *//*!
20 * \file
21 * \brief XML Parser.
22 *//*--------------------------------------------------------------------*/
23
24 #include "xeXMLParser.hpp"
25 #include "deInt32.h"
26
27 namespace xe
28 {
29 namespace xml
30 {
31
//! Size handed to the Tokenizer's input buffer (m_buf) when it is constructed.
enum { TOKENIZER_INITIAL_BUFFER_SIZE = 1024 };
36
//! Return true if ch can begin an identifier, i.e. it is an ASCII letter (a-z or A-Z).
static inline bool isIdentifierStartChar (int ch)
{
	const bool isLowerCase = (ch >= 'a') && (ch <= 'z');
	const bool isUpperCase = (ch >= 'A') && (ch <= 'Z');

	return isLowerCase || isUpperCase;
}
41
isIdentifierChar(int ch)42 static inline bool isIdentifierChar (int ch)
43 {
44 return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_');
45 }
46
//! Return true if ch is a space, horizontal tab, carriage return or line feed.
static inline bool isWhitespaceChar (int ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			return true;

		default:
			return false;
	}
}
51
getNextBufferSize(int curSize,int minNewSize)52 static int getNextBufferSize (int curSize, int minNewSize)
53 {
54 return de::max(curSize*2, 1<<deLog2Ceil32(minNewSize));
55 }
56
Tokenizer(void)57 Tokenizer::Tokenizer (void)
58 : m_curToken (TOKEN_INCOMPLETE)
59 , m_curTokenLen (0)
60 , m_state (STATE_DATA)
61 , m_buf (TOKENIZER_INITIAL_BUFFER_SIZE)
62 {
63 }
64
~Tokenizer(void)65 Tokenizer::~Tokenizer (void)
66 {
67 }
68
clear(void)69 void Tokenizer::clear (void)
70 {
71 m_curToken = TOKEN_INCOMPLETE;
72 m_curTokenLen = 0;
73 m_state = STATE_DATA;
74 m_buf.clear();
75 }
76
error(const std::string & what)77 void Tokenizer::error (const std::string& what)
78 {
79 throw ParseError(what);
80 }
81
feed(const deUint8 * bytes,int numBytes)82 void Tokenizer::feed (const deUint8* bytes, int numBytes)
83 {
84 // Grow buffer if necessary.
85 if (m_buf.getNumFree() < numBytes)
86 {
87 m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements()+numBytes));
88 }
89
90 // Append to front.
91 m_buf.pushFront(bytes, numBytes);
92
93 // If we haven't parsed complete token, re-try after data feed.
94 if (m_curToken == TOKEN_INCOMPLETE)
95 advance();
96 }
97
getChar(int offset) const98 int Tokenizer::getChar (int offset) const
99 {
100 DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements()));
101
102 if (offset < m_buf.getNumElements())
103 return m_buf.peekBack(offset);
104 else
105 return END_OF_BUFFER;
106 }
107
advance(void)108 void Tokenizer::advance (void)
109 {
110 if (m_curToken != TOKEN_INCOMPLETE)
111 {
112 // Parser should not try to advance beyond end of string.
113 DE_ASSERT(m_curToken != TOKEN_END_OF_STRING);
114
115 // If current token is tag end, change state to data.
116 if (m_curToken == TOKEN_TAG_END ||
117 m_curToken == TOKEN_EMPTY_ELEMENT_END ||
118 m_curToken == TOKEN_PROCESSING_INSTRUCTION_END ||
119 m_curToken == TOKEN_COMMENT ||
120 m_curToken == TOKEN_ENTITY)
121 m_state = STATE_DATA;
122
123 // Advance buffer by length of last token.
124 m_buf.popBack(m_curTokenLen);
125
126 // Reset state.
127 m_curToken = TOKEN_INCOMPLETE;
128 m_curTokenLen = 0;
129
130 // If we hit end of string here, report it as end of string.
131 if (getChar(0) == END_OF_STRING)
132 {
133 m_curToken = TOKEN_END_OF_STRING;
134 m_curTokenLen = 1;
135 return;
136 }
137 }
138
139 int curChar = getChar(m_curTokenLen);
140
141 for (;;)
142 {
143 if (m_state == STATE_DATA)
144 {
145 // Advance until we hit end of buffer or tag start and treat that as data token.
146 if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&')
147 {
148 if (curChar == '<')
149 m_state = STATE_TAG;
150 else if (curChar == '&')
151 m_state = STATE_ENTITY;
152
153 if (m_curTokenLen > 0)
154 {
155 // Report data token.
156 m_curToken = TOKEN_DATA;
157 return;
158 }
159 else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER)
160 {
161 // Just return incomplete token, no data parsed.
162 return;
163 }
164 else
165 {
166 DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY);
167 continue;
168 }
169 }
170 }
171 else
172 {
173 // Eat all whitespace if present.
174 if (m_curTokenLen == 0)
175 {
176 while (isWhitespaceChar(curChar))
177 {
178 m_buf.popBack();
179 curChar = getChar(0);
180 }
181 }
182
183 // Handle end of string / buffer.
184 if (curChar == END_OF_STRING)
185 error("Unexpected end of string");
186 else if (curChar == (int)END_OF_BUFFER)
187 {
188 DE_ASSERT(m_curToken == TOKEN_INCOMPLETE);
189 return;
190 }
191
192 if (m_curTokenLen == 0)
193 {
194 // Expect start of identifier, value or special tag token.
195 if (curChar == '\'' || curChar == '"')
196 m_state = STATE_VALUE;
197 else if (isIdentifierStartChar(curChar))
198 m_state = STATE_IDENTIFIER;
199 else if (curChar == '<' || curChar == '?' || curChar == '/')
200 m_state = STATE_TAG;
201 else if (curChar == '&')
202 DE_ASSERT(m_state == STATE_ENTITY);
203 else if (curChar == '=')
204 {
205 m_curToken = TOKEN_EQUAL;
206 m_curTokenLen = 1;
207 return;
208 }
209 else if (curChar == '>')
210 {
211 m_curToken = TOKEN_TAG_END;
212 m_curTokenLen = 1;
213 return;
214 }
215 else
216 error("Unexpected character");
217 }
218 else if (m_state == STATE_IDENTIFIER)
219 {
220 if (!isIdentifierChar(curChar))
221 {
222 m_curToken = TOKEN_IDENTIFIER;
223 return;
224 }
225 }
226 else if (m_state == STATE_VALUE)
227 {
228 // \todo [2012-06-07 pyry] Escapes.
229 if (curChar == '\'' || curChar == '"')
230 {
231 // \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)?
232 if (curChar != getChar(0))
233 error("Mismatched quote");
234 m_curToken = TOKEN_STRING;
235 m_curTokenLen += 1;
236 return;
237 }
238 }
239 else if (m_state == STATE_COMMENT)
240 {
241 DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state.
242
243 if (m_curTokenLen <= 3)
244 {
245 if (curChar != '-')
246 error("Invalid comment start");
247 }
248 else
249 {
250 int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen-2) : 0;
251 int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen-1) : 0;
252
253 if (prev2 == '-' && prev1 == '-')
254 {
255 if (curChar != '>')
256 error("Invalid comment end");
257 m_curToken = TOKEN_COMMENT;
258 m_curTokenLen += 1;
259 return;
260 }
261 }
262 }
263 else if (m_state == STATE_ENTITY)
264 {
265 if (m_curTokenLen >= 1)
266 {
267 if (curChar == ';')
268 {
269 m_curToken = TOKEN_ENTITY;
270 m_curTokenLen += 1;
271 return;
272 }
273 else if (!de::inRange<int>(curChar, '0', '9') &&
274 !de::inRange<int>(curChar, 'a', 'z') &&
275 !de::inRange<int>(curChar, 'A', 'Z'))
276 error("Invalid entity");
277 }
278 }
279 else
280 {
281 // Special tokens are at most 2 characters.
282 DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1);
283
284 int prevChar = getChar(m_curTokenLen-1);
285
286 if (prevChar == '<')
287 {
288 // Tag start.
289 if (curChar == '/')
290 {
291 m_curToken = TOKEN_END_TAG_START;
292 m_curTokenLen = 2;
293 return;
294 }
295 else if (curChar == '?')
296 {
297 m_curToken = TOKEN_PROCESSING_INSTRUCTION_START;
298 m_curTokenLen = 2;
299 return;
300 }
301 else if (curChar == '!')
302 {
303 m_state = STATE_COMMENT;
304 }
305 else
306 {
307 m_curToken = TOKEN_TAG_START;
308 m_curTokenLen = 1;
309 return;
310 }
311 }
312 else if (prevChar == '?')
313 {
314 if (curChar != '>')
315 error("Invalid processing instruction end");
316 m_curToken = TOKEN_PROCESSING_INSTRUCTION_END;
317 m_curTokenLen = 2;
318 return;
319 }
320 else if (prevChar == '/')
321 {
322 if (curChar != '>')
323 error("Invalid empty element end");
324 m_curToken = TOKEN_EMPTY_ELEMENT_END;
325 m_curTokenLen = 2;
326 return;
327 }
328 else
329 error("Could not parse special token");
330 }
331 }
332
333 m_curTokenLen += 1;
334 curChar = getChar(m_curTokenLen);
335 }
336 }
337
getString(std::string & dst) const338 void Tokenizer::getString (std::string& dst) const
339 {
340 DE_ASSERT(m_curToken == TOKEN_STRING);
341 dst.resize(m_curTokenLen-2);
342 for (int ndx = 0; ndx < m_curTokenLen-2; ndx++)
343 dst[ndx] = m_buf.peekBack(ndx+1);
344 }
345
Parser(void)346 Parser::Parser (void)
347 : m_element (ELEMENT_INCOMPLETE)
348 , m_state (STATE_DATA)
349 {
350 }
351
~Parser(void)352 Parser::~Parser (void)
353 {
354 }
355
clear(void)356 void Parser::clear (void)
357 {
358 m_tokenizer.clear();
359 m_elementName.clear();
360 m_attributes.clear();
361 m_attribName.clear();
362 m_entityValue.clear();
363
364 m_element = ELEMENT_INCOMPLETE;
365 m_state = STATE_DATA;
366 }
367
error(const std::string & what)368 void Parser::error (const std::string& what)
369 {
370 throw ParseError(what);
371 }
372
feed(const deUint8 * bytes,int numBytes)373 void Parser::feed (const deUint8* bytes, int numBytes)
374 {
375 m_tokenizer.feed(bytes, numBytes);
376
377 if (m_element == ELEMENT_INCOMPLETE)
378 advance();
379 }
380
advance(void)381 void Parser::advance (void)
382 {
383 if (m_element == ELEMENT_START)
384 m_attributes.clear();
385
386 // \note No token is advanced when element end is reported.
387 if (m_state == STATE_YIELD_EMPTY_ELEMENT_END)
388 {
389 DE_ASSERT(m_element == ELEMENT_START);
390 m_element = ELEMENT_END;
391 m_state = STATE_DATA;
392 return;
393 }
394
395 if (m_element != ELEMENT_INCOMPLETE)
396 {
397 m_tokenizer.advance();
398 m_element = ELEMENT_INCOMPLETE;
399 }
400
401 for (;;)
402 {
403 Token curToken = m_tokenizer.getToken();
404
405 // Skip comments.
406 while (curToken == TOKEN_COMMENT)
407 {
408 m_tokenizer.advance();
409 curToken = m_tokenizer.getToken();
410 }
411
412 if (curToken == TOKEN_INCOMPLETE)
413 {
414 DE_ASSERT(m_element == ELEMENT_INCOMPLETE);
415 return;
416 }
417
418 switch (m_state)
419 {
420 case STATE_ENTITY:
421 m_state = STATE_DATA;
422 // Fall-through to STATE_DATA processing.
423
424 case STATE_DATA:
425 switch (curToken)
426 {
427 case TOKEN_DATA:
428 m_element = ELEMENT_DATA;
429 return;
430
431 case TOKEN_END_OF_STRING:
432 m_element = ELEMENT_END_OF_STRING;
433 return;
434
435 case TOKEN_TAG_START:
436 m_state = STATE_START_TAG_OPEN;
437 break;
438
439 case TOKEN_END_TAG_START:
440 m_state = STATE_END_TAG_OPEN;
441 break;
442
443 case TOKEN_PROCESSING_INSTRUCTION_START:
444 m_state = STATE_IN_PROCESSING_INSTRUCTION;
445 break;
446
447 case TOKEN_ENTITY:
448 m_state = STATE_ENTITY;
449 m_element = ELEMENT_DATA;
450 parseEntityValue();
451 return;
452
453 default:
454 error("Unexpected token");
455 }
456 break;
457
458 case STATE_IN_PROCESSING_INSTRUCTION:
459 if (curToken == TOKEN_PROCESSING_INSTRUCTION_END)
460 m_state = STATE_DATA;
461 else
462 if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING)
463 error("Unexpected token in processing instruction");
464 break;
465
466 case STATE_START_TAG_OPEN:
467 if (curToken != TOKEN_IDENTIFIER)
468 error("Expected identifier");
469 m_tokenizer.getTokenStr(m_elementName);
470 m_state = STATE_ATTRIBUTE_LIST;
471 break;
472
473 case STATE_END_TAG_OPEN:
474 if (curToken != TOKEN_IDENTIFIER)
475 error("Expected identifier");
476 m_tokenizer.getTokenStr(m_elementName);
477 m_state = STATE_EXPECTING_END_TAG_CLOSE;
478 break;
479
480 case STATE_EXPECTING_END_TAG_CLOSE:
481 if (curToken != TOKEN_TAG_END)
482 error("Expected tag end");
483 m_state = STATE_DATA;
484 m_element = ELEMENT_END;
485 return;
486
487 case STATE_ATTRIBUTE_LIST:
488 if (curToken == TOKEN_IDENTIFIER)
489 {
490 m_tokenizer.getTokenStr(m_attribName);
491 m_state = STATE_EXPECTING_ATTRIBUTE_EQ;
492 }
493 else if (curToken == TOKEN_EMPTY_ELEMENT_END)
494 {
495 m_state = STATE_YIELD_EMPTY_ELEMENT_END;
496 m_element = ELEMENT_START;
497 return;
498 }
499 else if (curToken == TOKEN_TAG_END)
500 {
501 m_state = STATE_DATA;
502 m_element = ELEMENT_START;
503 return;
504 }
505 else
506 error("Unexpected token");
507 break;
508
509 case STATE_EXPECTING_ATTRIBUTE_EQ:
510 if (curToken != TOKEN_EQUAL)
511 error("Expected '='");
512 m_state = STATE_EXPECTING_ATTRIBUTE_VALUE;
513 break;
514
515 case STATE_EXPECTING_ATTRIBUTE_VALUE:
516 if (curToken != TOKEN_STRING)
517 error("Expected value");
518 if (hasAttribute(m_attribName.c_str()))
519 error("Duplicate attribute");
520
521 m_tokenizer.getString(m_attributes[m_attribName]);
522 m_state = STATE_ATTRIBUTE_LIST;
523 break;
524
525 default:
526 DE_ASSERT(false);
527 }
528
529 m_tokenizer.advance();
530 }
531 }
532
/*--------------------------------------------------------------------*//*!
 * \brief Map a predefined XML entity reference to its character.
 * \param entity	Complete entity reference text including the leading
 *					'&' and trailing ';' (the tokenizer's entity token
 *					spans both delimiters).
 * \return The character the entity stands for, or 0 if the entity is
 *		   not one of the five predefined XML entities.
 *//*--------------------------------------------------------------------*/
static char getEntityValue (const std::string& entity)
{
	// Names must be spelled as entity references; the bare characters
	// ("<", "&", ...) can never equal an entity token.
	static const struct
	{
		const char*		name;
		char			value;
	} s_entities[] =
	{
		{ "&lt;",	'<'		},
		{ "&gt;",	'>'		},
		{ "&amp;",	'&'		},
		{ "&apos;",	'\''	},
		{ "&quot;",	'"'		},
	};

	const int numEntities = static_cast<int>(sizeof(s_entities) / sizeof(s_entities[0]));

	for (int ndx = 0; ndx < numEntities; ndx++)
	{
		if (entity == s_entities[ndx].name)
			return s_entities[ndx].value;
	}

	// Unknown entity; callers treat 0 as "not found".
	return 0;
}
556
parseEntityValue(void)557 void Parser::parseEntityValue (void)
558 {
559 DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY);
560
561 std::string entity;
562 m_tokenizer.getTokenStr(entity);
563
564 const char value = getEntityValue(entity);
565 if (value == 0)
566 error("Invalid entity '" + entity + "'");
567
568 m_entityValue.resize(1);
569 m_entityValue[0] = value;
570 }
571
572 } // xml
573 } // xe
574