1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include <algorithm>
8 #include <memory>
9 #include <vector>
10
11 #include "core/fxcrt/fx_ext.h"
12 #include "core/fxcrt/fx_xml.h"
13 #include "core/fxcrt/xml_int.h"
14 #include "third_party/base/ptr_util.h"
15 #include "third_party/base/stl_util.h"
16
17 namespace {
18
// Bit flags used to classify a single input byte. Every entry of
// g_FXCRT_XML_ByteTypes below is an OR-combination of these values.
#define FXCRTM_XML_CHARTYPE_Normal 0x00
#define FXCRTM_XML_CHARTYPE_SpaceChar 0x01
#define FXCRTM_XML_CHARTYPE_Letter 0x02
#define FXCRTM_XML_CHARTYPE_Digital 0x04
#define FXCRTM_XML_CHARTYPE_NameIntro 0x08
#define FXCRTM_XML_CHARTYPE_NameChar 0x10
// Masking a table entry with HexChar yields HexDigital for '0'-'9',
// HexLowerLetter for 'a'-'f' and HexUpperLetter for 'A'-'F'.
#define FXCRTM_XML_CHARTYPE_HexDigital 0x20
#define FXCRTM_XML_CHARTYPE_HexLowerLetter 0x40
#define FXCRTM_XML_CHARTYPE_HexUpperLetter 0x60
#define FXCRTM_XML_CHARTYPE_HexChar 0x60

// Classification flags for each of the 256 possible byte values, indexed by
// the byte itself.
const uint8_t g_FXCRT_XML_ByteTypes[256] = {
    // 0x00 - 0x0B
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    // 0x0C - 0x17
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    // 0x18 - 0x23 (0x20 is the space character)
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
    // 0x24 - 0x2F ('-' and '.' are name characters)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,
    // 0x30 - 0x3B ('0'-'9', then ':' and ';')
    0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x08, 0x00,
    // 0x3C - 0x47 ('A'-'F' carry the upper-case hex bits)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A,
    // 0x48 - 0x53
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x54 - 0x5F (the last entry is '_')
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18,
    // 0x60 - 0x6B ('a'-'f' carry the lower-case hex bit)
    0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x6C - 0x77
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x78 - 0x83
    0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x84 - 0x8F
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x90 - 0x9B
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x9C - 0xA7
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xA8 - 0xB3
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xB4 - 0xBF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xC0 - 0xCB
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xCC - 0xD7
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xD8 - 0xE3
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xE4 - 0xEF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xF0 - 0xFB
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xFC - 0xFF
    0x1A, 0x1A, 0x01, 0x01,
};

// Returns true when the table flags |ch| as white space.
bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) {
  return (g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar) != 0;
}

// Returns true when the table flags |ch| as a decimal digit.
bool g_FXCRT_XML_IsDigital(uint8_t ch) {
  return (g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital) != 0;
}

// Returns true when the table flags |ch| as able to start a name.
bool g_FXCRT_XML_IsNameIntro(uint8_t ch) {
  return (g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro) != 0;
}

// Returns true when the table flags |ch| as able to appear inside a name.
bool g_FXCRT_XML_IsNameChar(uint8_t ch) {
  return (g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar) != 0;
}
70
71 class CXML_DataBufAcc : public IFX_BufferedReadStream {
72 public:
73 template <typename T, typename... Args>
74 friend CFX_RetainPtr<T> pdfium::MakeRetain(Args&&... args);
75
76 // IFX_BufferedReadStream
77 bool IsEOF() override;
78 FX_FILESIZE GetPosition() override;
79 size_t ReadBlock(void* buffer, size_t size) override;
80 bool ReadNextBlock(bool bRestart) override;
81 const uint8_t* GetBlockBuffer() override;
82 size_t GetBlockSize() override;
83 FX_FILESIZE GetBlockOffset() override;
84
85 private:
86 CXML_DataBufAcc(const uint8_t* pBuffer, size_t size);
87 ~CXML_DataBufAcc() override;
88
89 const uint8_t* m_pBuffer;
90 size_t m_dwSize;
91 size_t m_dwCurPos;
92 };
93
CXML_DataBufAcc(const uint8_t * pBuffer,size_t size)94 CXML_DataBufAcc::CXML_DataBufAcc(const uint8_t* pBuffer, size_t size)
95 : m_pBuffer(pBuffer), m_dwSize(size), m_dwCurPos(0) {}
96
~CXML_DataBufAcc()97 CXML_DataBufAcc::~CXML_DataBufAcc() {}
98
IsEOF()99 bool CXML_DataBufAcc::IsEOF() {
100 return m_dwCurPos >= m_dwSize;
101 }
102
GetPosition()103 FX_FILESIZE CXML_DataBufAcc::GetPosition() {
104 return static_cast<FX_FILESIZE>(m_dwCurPos);
105 }
106
ReadBlock(void * buffer,size_t size)107 size_t CXML_DataBufAcc::ReadBlock(void* buffer, size_t size) {
108 return 0;
109 }
110
ReadNextBlock(bool bRestart)111 bool CXML_DataBufAcc::ReadNextBlock(bool bRestart) {
112 if (bRestart)
113 m_dwCurPos = 0;
114
115 if (m_dwCurPos < m_dwSize) {
116 m_dwCurPos = m_dwSize;
117 return true;
118 }
119 return false;
120 }
121
GetBlockBuffer()122 const uint8_t* CXML_DataBufAcc::GetBlockBuffer() {
123 return m_pBuffer;
124 }
125
GetBlockSize()126 size_t CXML_DataBufAcc::GetBlockSize() {
127 return m_dwSize;
128 }
129
GetBlockOffset()130 FX_FILESIZE CXML_DataBufAcc::GetBlockOffset() {
131 return 0;
132 }
133
134 class CXML_DataStmAcc : public IFX_BufferedReadStream {
135 public:
136 template <typename T, typename... Args>
137 friend CFX_RetainPtr<T> pdfium::MakeRetain(Args&&... args);
138
139 // IFX_BufferedReadStream
140 bool IsEOF() override;
141 FX_FILESIZE GetPosition() override;
142 size_t ReadBlock(void* buffer, size_t size) override;
143 bool ReadNextBlock(bool bRestart) override;
144 const uint8_t* GetBlockBuffer() override;
145 size_t GetBlockSize() override;
146 FX_FILESIZE GetBlockOffset() override;
147
148 private:
149 explicit CXML_DataStmAcc(
150 const CFX_RetainPtr<IFX_SeekableReadStream>& pFileRead);
151 ~CXML_DataStmAcc() override;
152
153 CFX_RetainPtr<IFX_SeekableReadStream> m_pFileRead;
154 uint8_t* m_pBuffer;
155 FX_FILESIZE m_nStart;
156 size_t m_dwSize;
157 };
158
CXML_DataStmAcc(const CFX_RetainPtr<IFX_SeekableReadStream> & pFileRead)159 CXML_DataStmAcc::CXML_DataStmAcc(
160 const CFX_RetainPtr<IFX_SeekableReadStream>& pFileRead)
161 : m_pFileRead(pFileRead), m_pBuffer(nullptr), m_nStart(0), m_dwSize(0) {
162 ASSERT(m_pFileRead);
163 }
164
~CXML_DataStmAcc()165 CXML_DataStmAcc::~CXML_DataStmAcc() {
166 FX_Free(m_pBuffer);
167 }
168
IsEOF()169 bool CXML_DataStmAcc::IsEOF() {
170 return m_nStart + static_cast<FX_FILESIZE>(m_dwSize) >=
171 m_pFileRead->GetSize();
172 }
173
GetPosition()174 FX_FILESIZE CXML_DataStmAcc::GetPosition() {
175 return m_nStart + static_cast<FX_FILESIZE>(m_dwSize);
176 }
177
ReadBlock(void * buffer,size_t size)178 size_t CXML_DataStmAcc::ReadBlock(void* buffer, size_t size) {
179 return 0;
180 }
181
ReadNextBlock(bool bRestart)182 bool CXML_DataStmAcc::ReadNextBlock(bool bRestart) {
183 if (bRestart)
184 m_nStart = 0;
185
186 FX_FILESIZE nLength = m_pFileRead->GetSize();
187 m_nStart += static_cast<FX_FILESIZE>(m_dwSize);
188 if (m_nStart >= nLength)
189 return false;
190
191 static const FX_FILESIZE FX_XMLDATASTREAM_BufferSize = 32 * 1024;
192 m_dwSize = static_cast<size_t>(
193 std::min(FX_XMLDATASTREAM_BufferSize, nLength - m_nStart));
194 if (!m_pBuffer)
195 m_pBuffer = FX_Alloc(uint8_t, m_dwSize);
196
197 return m_pFileRead->ReadBlock(m_pBuffer, m_nStart, m_dwSize);
198 }
199
GetBlockBuffer()200 const uint8_t* CXML_DataStmAcc::GetBlockBuffer() {
201 return (const uint8_t*)m_pBuffer;
202 }
203
GetBlockSize()204 size_t CXML_DataStmAcc::GetBlockSize() {
205 return m_dwSize;
206 }
207
GetBlockOffset()208 FX_FILESIZE CXML_DataStmAcc::GetBlockOffset() {
209 return m_nStart;
210 }
211
212 } // namespace
213
CXML_Parser()214 CXML_Parser::CXML_Parser()
215 : m_nOffset(0),
216 m_pBuffer(nullptr),
217 m_dwBufferSize(0),
218 m_nBufferOffset(0),
219 m_dwIndex(0) {}
220
~CXML_Parser()221 CXML_Parser::~CXML_Parser() {}
222
Init(const uint8_t * pBuffer,size_t size)223 bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) {
224 m_pDataAcc = pdfium::MakeRetain<CXML_DataBufAcc>(pBuffer, size);
225 m_nOffset = 0;
226 return ReadNextBlock();
227 }
228
ReadNextBlock()229 bool CXML_Parser::ReadNextBlock() {
230 if (!m_pDataAcc->ReadNextBlock())
231 return false;
232
233 m_pBuffer = m_pDataAcc->GetBlockBuffer();
234 m_dwBufferSize = m_pDataAcc->GetBlockSize();
235 m_nBufferOffset = m_pDataAcc->GetBlockOffset();
236 m_dwIndex = 0;
237 return m_dwBufferSize > 0;
238 }
239
IsEOF()240 bool CXML_Parser::IsEOF() {
241 return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize;
242 }
243
SkipWhiteSpaces()244 void CXML_Parser::SkipWhiteSpaces() {
245 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
246 if (IsEOF())
247 return;
248
249 do {
250 while (m_dwIndex < m_dwBufferSize &&
251 g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) {
252 m_dwIndex++;
253 }
254 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
255 if (m_dwIndex < m_dwBufferSize || IsEOF())
256 break;
257 } while (ReadNextBlock());
258 }
259
GetName(CFX_ByteString * space,CFX_ByteString * name)260 void CXML_Parser::GetName(CFX_ByteString* space, CFX_ByteString* name) {
261 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
262 if (IsEOF())
263 return;
264
265 CFX_ByteTextBuf buf;
266 uint8_t ch;
267 do {
268 while (m_dwIndex < m_dwBufferSize) {
269 ch = m_pBuffer[m_dwIndex];
270 if (ch == ':') {
271 *space = buf.AsStringC();
272 buf.Clear();
273 } else if (g_FXCRT_XML_IsNameChar(ch)) {
274 buf.AppendChar(ch);
275 } else {
276 break;
277 }
278 m_dwIndex++;
279 }
280 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
281 if (m_dwIndex < m_dwBufferSize || IsEOF())
282 break;
283 } while (ReadNextBlock());
284 *name = buf.AsStringC();
285 }
286
SkipLiterals(const CFX_ByteStringC & str)287 void CXML_Parser::SkipLiterals(const CFX_ByteStringC& str) {
288 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
289 if (IsEOF()) {
290 return;
291 }
292 int32_t i = 0, iLen = str.GetLength();
293 do {
294 while (m_dwIndex < m_dwBufferSize) {
295 if (str.GetAt(i) != m_pBuffer[m_dwIndex++]) {
296 i = 0;
297 continue;
298 }
299 i++;
300 if (i == iLen)
301 break;
302 }
303 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
304 if (i == iLen)
305 return;
306
307 if (m_dwIndex < m_dwBufferSize || IsEOF())
308 break;
309 } while (ReadNextBlock());
310 while (!m_pDataAcc->IsEOF()) {
311 ReadNextBlock();
312 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize);
313 }
314 m_dwIndex = m_dwBufferSize;
315 }
316
// Decodes a character or entity reference starting at the current position
// and returns its code. Both callers in this file consume the leading '&'
// before calling. Returns 0 at end of input and for a named entity that is
// not one of gt, lt, amp, apos or quot.
//
// States of the scanner:
//   0  - first byte: '#' selects a numeric reference, anything else a name.
//   1  - collecting a named entity up to ';'.
//   2  - just after '#': 'x' selects hexadecimal, anything else decimal.
//   3  - collecting decimal digits up to ';'.
//   4  - collecting hexadecimal digits up to ';'.
//   10 - finished.
uint32_t CXML_Parser::GetCharRef() {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return 0;

  uint8_t ch;
  int32_t iState = 0;
  CFX_ByteTextBuf buf;
  uint32_t code = 0;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      ch = m_pBuffer[m_dwIndex];
      switch (iState) {
        case 0:
          if (ch == '#') {
            m_dwIndex++;
            iState = 2;
            break;
          }
          iState = 1;
          // Intentional fall through: this byte is the first of the name.
        case 1:
          m_dwIndex++;
          if (ch == ';') {
            CFX_ByteStringC ref = buf.AsStringC();
            if (ref == "gt")
              code = '>';
            else if (ref == "lt")
              code = '<';
            else if (ref == "amp")
              code = '&';
            else if (ref == "apos")
              code = '\'';
            else if (ref == "quot")
              code = '"';
            iState = 10;
            break;
          }
          buf.AppendByte(ch);
          break;
        case 2:
          if (ch == 'x') {
            m_dwIndex++;
            iState = 4;
            break;
          }
          iState = 3;
          // Intentional fall through: this byte is the first decimal digit.
        case 3:
          m_dwIndex++;
          if (ch == ';') {
            iState = 10;
            break;
          }
          // Bytes that are not digits are skipped without ending the scan.
          if (g_FXCRT_XML_IsDigital(ch))
            code = code * 10 + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch));
          break;
        case 4:
          m_dwIndex++;
          if (ch == ';') {
            iState = 10;
            break;
          }
          // nHex is zero for a non-hex byte, which is skipped.
          uint8_t nHex =
              g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar;
          if (nHex) {
            if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) {
              code =
                  (code << 4) + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch));
            } else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) {
              // 87 == 'a' - 10.
              code = (code << 4) + ch - 87;
            } else {
              // 55 == 'A' - 10.
              code = (code << 4) + ch - 55;
            }
          }
          break;
      }
      if (iState == 10)
        break;
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
      break;
    }
  } while (ReadNextBlock());
  return code;
}
402
// Reads a quoted attribute value at the current position into |value|,
// decoding the bytes through CFX_UTF8Decoder and resolving '&' references via
// GetCharRef(). If the first byte is neither ' nor ", or the input is already
// exhausted, the function returns without touching |value| and without
// consuming anything.
void CXML_Parser::GetAttrValue(CFX_WideString& value) {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return;

  CFX_UTF8Decoder decoder;
  // |mark| holds the opening quote once it has been seen; 0 until then.
  uint8_t mark = 0, ch = 0;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      ch = m_pBuffer[m_dwIndex];
      if (mark == 0) {
        if (ch != '\'' && ch != '"')
          return;

        mark = ch;
        m_dwIndex++;
        // Reset so the opening quote is not mistaken for the closing one by
        // the check after the inner loop.
        ch = 0;
        continue;
      }
      m_dwIndex++;
      if (ch == mark)
        break;

      if (ch == '&') {
        decoder.AppendChar(GetCharRef());
        if (IsEOF()) {
          value = decoder.GetResult();
          return;
        }
      } else {
        decoder.Input(ch);
      }
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  value = decoder.GetResult();
}
442
GetTagName(bool bStartTag,bool * bEndTag,CFX_ByteString * space,CFX_ByteString * name)443 void CXML_Parser::GetTagName(bool bStartTag,
444 bool* bEndTag,
445 CFX_ByteString* space,
446 CFX_ByteString* name) {
447 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
448 if (IsEOF())
449 return;
450
451 *bEndTag = false;
452 uint8_t ch;
453 int32_t iState = bStartTag ? 1 : 0;
454 do {
455 while (m_dwIndex < m_dwBufferSize) {
456 ch = m_pBuffer[m_dwIndex];
457 switch (iState) {
458 case 0:
459 m_dwIndex++;
460 if (ch != '<')
461 break;
462
463 iState = 1;
464 break;
465 case 1:
466 if (ch == '?') {
467 m_dwIndex++;
468 SkipLiterals("?>");
469 iState = 0;
470 break;
471 }
472 if (ch == '!') {
473 m_dwIndex++;
474 SkipLiterals("-->");
475 iState = 0;
476 break;
477 }
478 if (ch == '/') {
479 m_dwIndex++;
480 GetName(space, name);
481 *bEndTag = true;
482 } else {
483 GetName(space, name);
484 *bEndTag = false;
485 }
486 return;
487 }
488 }
489 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
490 if (m_dwIndex < m_dwBufferSize || IsEOF())
491 break;
492 } while (ReadNextBlock());
493 }
494
// Parses one element (tag, attributes, content and child elements) starting
// at the current position and returns it, or nullptr when no start tag is
// found or the tag is malformed. |pParent| becomes the new element's parent;
// |bStartTag| is forwarded to GetTagName() and means the '<' has already been
// consumed.
// NOTE(review): child elements are parsed by direct recursion and no depth
// limit is visible in this function.
std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent,
                                                        bool bStartTag) {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return nullptr;

  CFX_ByteString tag_name;
  CFX_ByteString tag_space;
  bool bEndTag;
  GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name);
  if (tag_name.IsEmpty() || bEndTag)
    return nullptr;

  auto pElement = pdfium::MakeUnique<CXML_Element>(
      pParent, tag_space.AsStringC(), tag_name.AsStringC());
  // Attribute list: repeated name '=' value triples.
  do {
    CFX_ByteString attr_space;
    CFX_ByteString attr_name;
    while (m_dwIndex < m_dwBufferSize) {
      SkipWhiteSpaces();
      if (IsEOF())
        break;

      if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex]))
        break;

      GetName(&attr_space, &attr_name);
      SkipWhiteSpaces();
      if (IsEOF())
        break;

      if (m_pBuffer[m_dwIndex] != '=')
        break;

      m_dwIndex++;
      SkipWhiteSpaces();
      if (IsEOF())
        break;

      CFX_WideString attr_value;
      GetAttrValue(attr_value);
      pElement->m_AttrMap.SetAt(attr_space, attr_name, attr_value);
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  SkipWhiteSpaces();
  if (IsEOF())
    return pElement;

  uint8_t ch = m_pBuffer[m_dwIndex++];
  if (ch == '/') {
    // Self-closing tag: the byte after '/' is skipped without being checked.
    m_dwIndex++;
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    return pElement;
  }
  if (ch != '>') {
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    return nullptr;
  }
  SkipWhiteSpaces();
  if (IsEOF())
    return pElement;

  // Element content. States: 0 - character data, 1 - just after '<',
  // 2 - just after "<!", 10 - closing tag consumed.
  CFX_UTF8Decoder decoder;
  CFX_WideTextBuf content;
  // Only ever assigned false in this function.
  bool bCDATA = false;
  int32_t iState = 0;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      ch = m_pBuffer[m_dwIndex++];
      switch (iState) {
        case 0:
          if (ch == '<') {
            iState = 1;
          } else if (ch == '&') {
            decoder.ClearStatus();
            decoder.AppendChar(GetCharRef());
          } else {
            decoder.Input(ch);
          }
          break;
        case 1:
          if (ch == '!') {
            iState = 2;
          } else if (ch == '?') {
            SkipLiterals("?>");
            SkipWhiteSpaces();
            iState = 0;
          } else if (ch == '/') {
            // Closing tag: its name is read but not compared with this
            // element's name, and the byte after it is skipped unchecked.
            CFX_ByteString space;
            CFX_ByteString name;
            GetName(&space, &name);
            SkipWhiteSpaces();
            m_dwIndex++;
            iState = 10;
          } else {
            // A child element starts here. Flush the text collected so far
            // as a content segment first.
            content << decoder.GetResult();
            CFX_WideString dataStr = content.MakeString();
            if (!bCDATA)
              dataStr.TrimRight(L" \t\r\n");

            InsertContentSegment(bCDATA, dataStr.AsStringC(), pElement.get());
            content.Clear();
            decoder.Clear();
            bCDATA = false;
            iState = 0;
            // Step back so the recursive call sees the first byte of the
            // child's tag name.
            m_dwIndex--;
            std::unique_ptr<CXML_Element> pSubElement(
                ParseElement(pElement.get(), true));
            if (!pSubElement)
              break;

            // Ownership of the child passes to the raw pointer stored in
            // m_Children.
            pElement->m_Children.push_back(
                {CXML_Element::Element, pSubElement.release()});
            SkipWhiteSpaces();
          }
          break;
        case 2:
          // "<![" is skipped up to "]]>" and "<!-" up to "-->"; anything else
          // is skipped up to the next '>'. None of it is added to the content.
          if (ch == '[') {
            SkipLiterals("]]>");
          } else if (ch == '-') {
            m_dwIndex++;
            SkipLiterals("-->");
          } else {
            SkipLiterals(">");
          }
          decoder.Clear();
          SkipWhiteSpaces();
          iState = 0;
          break;
      }
      if (iState == 10) {
        break;
      }
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  // Flush any trailing text as the final content segment.
  content << decoder.GetResult();
  CFX_WideString dataStr = content.MakeString();
  dataStr.TrimRight(L" \t\r\n");

  InsertContentSegment(bCDATA, dataStr.AsStringC(), pElement.get());
  content.Clear();
  decoder.Clear();
  bCDATA = false;
  return pElement;
}
646
InsertContentSegment(bool bCDATA,const CFX_WideStringC & content,CXML_Element * pElement)647 void CXML_Parser::InsertContentSegment(bool bCDATA,
648 const CFX_WideStringC& content,
649 CXML_Element* pElement) {
650 if (content.IsEmpty())
651 return;
652
653 CXML_Content* pContent = new CXML_Content;
654 pContent->Set(bCDATA, content);
655 pElement->m_Children.push_back({CXML_Element::Content, pContent});
656 }
657
Parse(const void * pBuffer,size_t size)658 std::unique_ptr<CXML_Element> CXML_Element::Parse(const void* pBuffer,
659 size_t size) {
660 CXML_Parser parser;
661 if (!parser.Init(static_cast<const uint8_t*>(pBuffer), size))
662 return nullptr;
663 return parser.ParseElement(nullptr, false);
664 }
665
// Creates a childless element named |qSpace|:|tagname| under |pParent|.
CXML_Element::CXML_Element(const CXML_Element* pParent,
                           const CFX_ByteStringC& qSpace,
                           const CFX_ByteStringC& tagname)
    : m_pParent(pParent),
      m_QSpaceName(qSpace),
      m_TagName(tagname) {
}
670
~CXML_Element()671 CXML_Element::~CXML_Element() {
672 Empty();
673 }
674
Empty()675 void CXML_Element::Empty() {
676 RemoveChildren();
677 }
RemoveChildren()678 void CXML_Element::RemoveChildren() {
679 for (const ChildRecord& record : m_Children) {
680 if (record.type == Content) {
681 delete static_cast<CXML_Content*>(record.child);
682 } else if (record.type == Element) {
683 CXML_Element* child = static_cast<CXML_Element*>(record.child);
684 child->RemoveChildren();
685 delete child;
686 }
687 }
688 m_Children.clear();
689 }
GetTagName(bool bQualified) const690 CFX_ByteString CXML_Element::GetTagName(bool bQualified) const {
691 if (!bQualified || m_QSpaceName.IsEmpty()) {
692 return m_TagName;
693 }
694 CFX_ByteString bsTag = m_QSpaceName;
695 bsTag += ":";
696 bsTag += m_TagName;
697 return bsTag;
698 }
699
GetNamespace(bool bQualified) const700 CFX_ByteString CXML_Element::GetNamespace(bool bQualified) const {
701 return bQualified ? m_QSpaceName : GetNamespaceURI(m_QSpaceName);
702 }
703
// Resolves prefix |qName| to a namespace URI by searching this element and
// then each ancestor for an "xmlns:<qName>" attribute ("xmlns" when |qName|
// is empty). Returns an empty string when no declaration is found.
CFX_ByteString CXML_Element::GetNamespaceURI(
    const CFX_ByteString& qName) const {
  const bool bDefaultNamespace = qName.IsEmpty();
  for (const CXML_Element* pNode = this; pNode; pNode = pNode->GetParent()) {
    const CFX_WideString* pURI = nullptr;
    if (bDefaultNamespace)
      pURI = pNode->m_AttrMap.Lookup("", "xmlns");
    else
      pURI = pNode->m_AttrMap.Lookup("xmlns", qName);

    if (pURI)
      return pURI->UTF8Encode();
  }
  return CFX_ByteString();
}
720
GetAttrByIndex(int index,CFX_ByteString & space,CFX_ByteString & name,CFX_WideString & value) const721 void CXML_Element::GetAttrByIndex(int index,
722 CFX_ByteString& space,
723 CFX_ByteString& name,
724 CFX_WideString& value) const {
725 if (index < 0 || index >= m_AttrMap.GetSize())
726 return;
727
728 CXML_AttrItem& item = m_AttrMap.GetAt(index);
729 space = item.m_QSpaceName;
730 name = item.m_AttrName;
731 value = item.m_Value;
732 }
733
HasAttr(const CFX_ByteStringC & name) const734 bool CXML_Element::HasAttr(const CFX_ByteStringC& name) const {
735 CFX_ByteStringC bsSpace;
736 CFX_ByteStringC bsName;
737 FX_XML_SplitQualifiedName(name, bsSpace, bsName);
738 return !!m_AttrMap.Lookup(CFX_ByteString(bsSpace), CFX_ByteString(bsName));
739 }
740
GetAttrValue(const CFX_ByteStringC & name,CFX_WideString & attribute) const741 bool CXML_Element::GetAttrValue(const CFX_ByteStringC& name,
742 CFX_WideString& attribute) const {
743 CFX_ByteStringC bsSpace;
744 CFX_ByteStringC bsName;
745 FX_XML_SplitQualifiedName(name, bsSpace, bsName);
746 return GetAttrValue(bsSpace, bsName, attribute);
747 }
748
GetAttrValue(const CFX_ByteStringC & space,const CFX_ByteStringC & name,CFX_WideString & attribute) const749 bool CXML_Element::GetAttrValue(const CFX_ByteStringC& space,
750 const CFX_ByteStringC& name,
751 CFX_WideString& attribute) const {
752 const CFX_WideString* pValue =
753 m_AttrMap.Lookup(CFX_ByteString(space), CFX_ByteString(name));
754 if (!pValue)
755 return false;
756
757 attribute = *pValue;
758 return true;
759 }
760
GetAttrInteger(const CFX_ByteStringC & name,int & attribute) const761 bool CXML_Element::GetAttrInteger(const CFX_ByteStringC& name,
762 int& attribute) const {
763 CFX_ByteStringC bsSpace;
764 CFX_ByteStringC bsName;
765 FX_XML_SplitQualifiedName(name, bsSpace, bsName);
766 const CFX_WideString* pwsValue =
767 m_AttrMap.Lookup(CFX_ByteString(bsSpace), CFX_ByteString(bsName));
768 if (!pwsValue)
769 return false;
770
771 attribute = pwsValue->GetInteger();
772 return true;
773 }
774
GetAttrInteger(const CFX_ByteStringC & space,const CFX_ByteStringC & name,int & attribute) const775 bool CXML_Element::GetAttrInteger(const CFX_ByteStringC& space,
776 const CFX_ByteStringC& name,
777 int& attribute) const {
778 const CFX_WideString* pwsValue =
779 m_AttrMap.Lookup(CFX_ByteString(space), CFX_ByteString(name));
780 if (!pwsValue)
781 return false;
782
783 attribute = pwsValue->GetInteger();
784 return true;
785 }
786
GetAttrFloat(const CFX_ByteStringC & name,FX_FLOAT & attribute) const787 bool CXML_Element::GetAttrFloat(const CFX_ByteStringC& name,
788 FX_FLOAT& attribute) const {
789 CFX_ByteStringC bsSpace;
790 CFX_ByteStringC bsName;
791 FX_XML_SplitQualifiedName(name, bsSpace, bsName);
792 return GetAttrFloat(bsSpace, bsName, attribute);
793 }
794
GetAttrFloat(const CFX_ByteStringC & space,const CFX_ByteStringC & name,FX_FLOAT & attribute) const795 bool CXML_Element::GetAttrFloat(const CFX_ByteStringC& space,
796 const CFX_ByteStringC& name,
797 FX_FLOAT& attribute) const {
798 const CFX_WideString* pValue =
799 m_AttrMap.Lookup(CFX_ByteString(space), CFX_ByteString(name));
800 if (!pValue)
801 return false;
802
803 attribute = pValue->GetFloat();
804 return true;
805 }
806
GetChildType(uint32_t index) const807 CXML_Element::ChildType CXML_Element::GetChildType(uint32_t index) const {
808 return index < m_Children.size() ? m_Children[index].type : Invalid;
809 }
810
GetContent(uint32_t index) const811 CFX_WideString CXML_Element::GetContent(uint32_t index) const {
812 if (index < m_Children.size() && m_Children[index].type == Content) {
813 CXML_Content* pContent =
814 static_cast<CXML_Content*>(m_Children[index].child);
815 if (pContent)
816 return pContent->m_Content;
817 }
818 return CFX_WideString();
819 }
820
GetElement(uint32_t index) const821 CXML_Element* CXML_Element::GetElement(uint32_t index) const {
822 if (index < m_Children.size() && m_Children[index].type == Element)
823 return static_cast<CXML_Element*>(m_Children[index].child);
824 return nullptr;
825 }
826
CountElements(const CFX_ByteStringC & space,const CFX_ByteStringC & tag) const827 uint32_t CXML_Element::CountElements(const CFX_ByteStringC& space,
828 const CFX_ByteStringC& tag) const {
829 int count = 0;
830 for (const ChildRecord& record : m_Children) {
831 if (record.type != Element)
832 continue;
833
834 CXML_Element* pKid = static_cast<CXML_Element*>(record.child);
835 if ((space.IsEmpty() || pKid->m_QSpaceName == space) &&
836 pKid->m_TagName == tag) {
837 count++;
838 }
839 }
840 return count;
841 }
842
GetElement(const CFX_ByteStringC & space,const CFX_ByteStringC & tag,int index) const843 CXML_Element* CXML_Element::GetElement(const CFX_ByteStringC& space,
844 const CFX_ByteStringC& tag,
845 int index) const {
846 if (index < 0)
847 return nullptr;
848
849 for (const ChildRecord& record : m_Children) {
850 if (record.type != Element)
851 continue;
852
853 CXML_Element* pKid = static_cast<CXML_Element*>(record.child);
854 if ((space.IsEmpty() || pKid->m_QSpaceName == space) &&
855 pKid->m_TagName == tag) {
856 if (index-- == 0)
857 return pKid;
858 }
859 }
860 return nullptr;
861 }
862
FindElement(CXML_Element * pChild) const863 uint32_t CXML_Element::FindElement(CXML_Element* pChild) const {
864 int index = 0;
865 for (const ChildRecord& record : m_Children) {
866 if (record.type == Element &&
867 static_cast<CXML_Element*>(record.child) == pChild) {
868 return index;
869 }
870 ++index;
871 }
872 return (uint32_t)-1;
873 }
874
Matches(const CFX_ByteString & space,const CFX_ByteString & name) const875 bool CXML_AttrItem::Matches(const CFX_ByteString& space,
876 const CFX_ByteString& name) const {
877 return (space.IsEmpty() || m_QSpaceName == space) && m_AttrName == name;
878 }
879
CXML_AttrMap()880 CXML_AttrMap::CXML_AttrMap() {}
881
~CXML_AttrMap()882 CXML_AttrMap::~CXML_AttrMap() {}
883
Lookup(const CFX_ByteString & space,const CFX_ByteString & name) const884 const CFX_WideString* CXML_AttrMap::Lookup(const CFX_ByteString& space,
885 const CFX_ByteString& name) const {
886 if (!m_pMap)
887 return nullptr;
888
889 for (const auto& item : *m_pMap) {
890 if (item.Matches(space, name))
891 return &item.m_Value;
892 }
893 return nullptr;
894 }
895
SetAt(const CFX_ByteString & space,const CFX_ByteString & name,const CFX_WideString & value)896 void CXML_AttrMap::SetAt(const CFX_ByteString& space,
897 const CFX_ByteString& name,
898 const CFX_WideString& value) {
899 if (!m_pMap)
900 m_pMap = pdfium::MakeUnique<std::vector<CXML_AttrItem>>();
901
902 for (CXML_AttrItem& item : *m_pMap) {
903 if (item.Matches(space, name)) {
904 item.m_Value = value;
905 return;
906 }
907 }
908
909 m_pMap->push_back({space, name, CFX_WideString(value)});
910 }
911
GetSize() const912 int CXML_AttrMap::GetSize() const {
913 return m_pMap ? pdfium::CollectionSize<int>(*m_pMap) : 0;
914 }
915
GetAt(int index) const916 CXML_AttrItem& CXML_AttrMap::GetAt(int index) const {
917 return (*m_pMap)[index];
918 }
919