1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include <algorithm>
8 #include <memory>
9 #include <vector>
10 
11 #include "core/fxcrt/fx_ext.h"
12 #include "core/fxcrt/fx_xml.h"
13 #include "core/fxcrt/xml_int.h"
14 #include "third_party/base/ptr_util.h"
15 #include "third_party/base/stl_util.h"
16 
17 namespace {
18 
// Character-class bit flags combined in the entries of g_FXCRT_XML_ByteTypes.
// Bits 0x20 and 0x40 together form the hexadecimal category selected by
// FXCRTM_XML_CHARTYPE_HexChar: 0x20 for '0'-'9', 0x40 for 'a'-'f' and 0x60
// (both bits) for 'A'-'F'.
#define FXCRTM_XML_CHARTYPE_Normal         0x00
#define FXCRTM_XML_CHARTYPE_SpaceChar      0x01
#define FXCRTM_XML_CHARTYPE_Letter         0x02
#define FXCRTM_XML_CHARTYPE_Digital        0x04
#define FXCRTM_XML_CHARTYPE_NameIntro      0x08
#define FXCRTM_XML_CHARTYPE_NameChar       0x10
#define FXCRTM_XML_CHARTYPE_HexDigital     0x20
#define FXCRTM_XML_CHARTYPE_HexLowerLetter 0x40
#define FXCRTM_XML_CHARTYPE_HexUpperLetter 0x60
// Mask that extracts the hexadecimal category bits.
#define FXCRTM_XML_CHARTYPE_HexChar        0x60
29 
// Per-byte classification table, indexed by the raw byte value. Each entry is
// a combination of the FXCRTM_XML_CHARTYPE_* flags. One row per high nibble.
const uint8_t g_FXCRT_XML_ByteTypes[256] = {
    // 0x00 - 0x0F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    // 0x10 - 0x1F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    // 0x20 - 0x2F: space, punctuation, '-' and '.'
    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,
    // 0x30 - 0x3F: '0'-'9', ':'
    0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34,
    0x34, 0x34, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0x40 - 0x4F: '@', 'A'-'O'
    0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x50 - 0x5F: 'P'-'Z', brackets, '_'
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18,
    // 0x60 - 0x6F: '`', 'a'-'o'
    0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x70 - 0x7F: 'p'-'z', braces, DEL
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0x80 - 0x8F
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x90 - 0x9F
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xA0 - 0xAF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xB0 - 0xBF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xC0 - 0xCF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xD0 - 0xDF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xE0 - 0xEF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xF0 - 0xFF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x01, 0x01,
};
54 
g_FXCRT_XML_IsWhiteSpace(uint8_t ch)55 bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) {
56   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar);
57 }
58 
g_FXCRT_XML_IsDigital(uint8_t ch)59 bool g_FXCRT_XML_IsDigital(uint8_t ch) {
60   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital);
61 }
62 
g_FXCRT_XML_IsNameIntro(uint8_t ch)63 bool g_FXCRT_XML_IsNameIntro(uint8_t ch) {
64   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro);
65 }
66 
g_FXCRT_XML_IsNameChar(uint8_t ch)67 bool g_FXCRT_XML_IsNameChar(uint8_t ch) {
68   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar);
69 }
70 
71 class CXML_DataBufAcc : public IFX_BufferedReadStream {
72  public:
73   template <typename T, typename... Args>
74   friend CFX_RetainPtr<T> pdfium::MakeRetain(Args&&... args);
75 
76   // IFX_BufferedReadStream
77   bool IsEOF() override;
78   FX_FILESIZE GetPosition() override;
79   size_t ReadBlock(void* buffer, size_t size) override;
80   bool ReadNextBlock(bool bRestart) override;
81   const uint8_t* GetBlockBuffer() override;
82   size_t GetBlockSize() override;
83   FX_FILESIZE GetBlockOffset() override;
84 
85  private:
86   CXML_DataBufAcc(const uint8_t* pBuffer, size_t size);
87   ~CXML_DataBufAcc() override;
88 
89   const uint8_t* m_pBuffer;
90   size_t m_dwSize;
91   size_t m_dwCurPos;
92 };
93 
CXML_DataBufAcc(const uint8_t * pBuffer,size_t size)94 CXML_DataBufAcc::CXML_DataBufAcc(const uint8_t* pBuffer, size_t size)
95     : m_pBuffer(pBuffer), m_dwSize(size), m_dwCurPos(0) {}
96 
~CXML_DataBufAcc()97 CXML_DataBufAcc::~CXML_DataBufAcc() {}
98 
IsEOF()99 bool CXML_DataBufAcc::IsEOF() {
100   return m_dwCurPos >= m_dwSize;
101 }
102 
GetPosition()103 FX_FILESIZE CXML_DataBufAcc::GetPosition() {
104   return static_cast<FX_FILESIZE>(m_dwCurPos);
105 }
106 
ReadBlock(void * buffer,size_t size)107 size_t CXML_DataBufAcc::ReadBlock(void* buffer, size_t size) {
108   return 0;
109 }
110 
ReadNextBlock(bool bRestart)111 bool CXML_DataBufAcc::ReadNextBlock(bool bRestart) {
112   if (bRestart)
113     m_dwCurPos = 0;
114 
115   if (m_dwCurPos < m_dwSize) {
116     m_dwCurPos = m_dwSize;
117     return true;
118   }
119   return false;
120 }
121 
GetBlockBuffer()122 const uint8_t* CXML_DataBufAcc::GetBlockBuffer() {
123   return m_pBuffer;
124 }
125 
GetBlockSize()126 size_t CXML_DataBufAcc::GetBlockSize() {
127   return m_dwSize;
128 }
129 
GetBlockOffset()130 FX_FILESIZE CXML_DataBufAcc::GetBlockOffset() {
131   return 0;
132 }
133 
134 class CXML_DataStmAcc : public IFX_BufferedReadStream {
135  public:
136   template <typename T, typename... Args>
137   friend CFX_RetainPtr<T> pdfium::MakeRetain(Args&&... args);
138 
139   // IFX_BufferedReadStream
140   bool IsEOF() override;
141   FX_FILESIZE GetPosition() override;
142   size_t ReadBlock(void* buffer, size_t size) override;
143   bool ReadNextBlock(bool bRestart) override;
144   const uint8_t* GetBlockBuffer() override;
145   size_t GetBlockSize() override;
146   FX_FILESIZE GetBlockOffset() override;
147 
148  private:
149   explicit CXML_DataStmAcc(
150       const CFX_RetainPtr<IFX_SeekableReadStream>& pFileRead);
151   ~CXML_DataStmAcc() override;
152 
153   CFX_RetainPtr<IFX_SeekableReadStream> m_pFileRead;
154   uint8_t* m_pBuffer;
155   FX_FILESIZE m_nStart;
156   size_t m_dwSize;
157 };
158 
CXML_DataStmAcc(const CFX_RetainPtr<IFX_SeekableReadStream> & pFileRead)159 CXML_DataStmAcc::CXML_DataStmAcc(
160     const CFX_RetainPtr<IFX_SeekableReadStream>& pFileRead)
161     : m_pFileRead(pFileRead), m_pBuffer(nullptr), m_nStart(0), m_dwSize(0) {
162   ASSERT(m_pFileRead);
163 }
164 
~CXML_DataStmAcc()165 CXML_DataStmAcc::~CXML_DataStmAcc() {
166   FX_Free(m_pBuffer);
167 }
168 
IsEOF()169 bool CXML_DataStmAcc::IsEOF() {
170   return m_nStart + static_cast<FX_FILESIZE>(m_dwSize) >=
171          m_pFileRead->GetSize();
172 }
173 
GetPosition()174 FX_FILESIZE CXML_DataStmAcc::GetPosition() {
175   return m_nStart + static_cast<FX_FILESIZE>(m_dwSize);
176 }
177 
ReadBlock(void * buffer,size_t size)178 size_t CXML_DataStmAcc::ReadBlock(void* buffer, size_t size) {
179   return 0;
180 }
181 
ReadNextBlock(bool bRestart)182 bool CXML_DataStmAcc::ReadNextBlock(bool bRestart) {
183   if (bRestart)
184     m_nStart = 0;
185 
186   FX_FILESIZE nLength = m_pFileRead->GetSize();
187   m_nStart += static_cast<FX_FILESIZE>(m_dwSize);
188   if (m_nStart >= nLength)
189     return false;
190 
191   static const FX_FILESIZE FX_XMLDATASTREAM_BufferSize = 32 * 1024;
192   m_dwSize = static_cast<size_t>(
193       std::min(FX_XMLDATASTREAM_BufferSize, nLength - m_nStart));
194   if (!m_pBuffer)
195     m_pBuffer = FX_Alloc(uint8_t, m_dwSize);
196 
197   return m_pFileRead->ReadBlock(m_pBuffer, m_nStart, m_dwSize);
198 }
199 
GetBlockBuffer()200 const uint8_t* CXML_DataStmAcc::GetBlockBuffer() {
201   return (const uint8_t*)m_pBuffer;
202 }
203 
GetBlockSize()204 size_t CXML_DataStmAcc::GetBlockSize() {
205   return m_dwSize;
206 }
207 
GetBlockOffset()208 FX_FILESIZE CXML_DataStmAcc::GetBlockOffset() {
209   return m_nStart;
210 }
211 
212 }  // namespace
213 
CXML_Parser()214 CXML_Parser::CXML_Parser()
215     : m_nOffset(0),
216       m_pBuffer(nullptr),
217       m_dwBufferSize(0),
218       m_nBufferOffset(0),
219       m_dwIndex(0) {}
220 
~CXML_Parser()221 CXML_Parser::~CXML_Parser() {}
222 
Init(const uint8_t * pBuffer,size_t size)223 bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) {
224   m_pDataAcc = pdfium::MakeRetain<CXML_DataBufAcc>(pBuffer, size);
225   m_nOffset = 0;
226   return ReadNextBlock();
227 }
228 
ReadNextBlock()229 bool CXML_Parser::ReadNextBlock() {
230   if (!m_pDataAcc->ReadNextBlock())
231     return false;
232 
233   m_pBuffer = m_pDataAcc->GetBlockBuffer();
234   m_dwBufferSize = m_pDataAcc->GetBlockSize();
235   m_nBufferOffset = m_pDataAcc->GetBlockOffset();
236   m_dwIndex = 0;
237   return m_dwBufferSize > 0;
238 }
239 
IsEOF()240 bool CXML_Parser::IsEOF() {
241   return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize;
242 }
243 
SkipWhiteSpaces()244 void CXML_Parser::SkipWhiteSpaces() {
245   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
246   if (IsEOF())
247     return;
248 
249   do {
250     while (m_dwIndex < m_dwBufferSize &&
251            g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) {
252       m_dwIndex++;
253     }
254     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
255     if (m_dwIndex < m_dwBufferSize || IsEOF())
256       break;
257   } while (ReadNextBlock());
258 }
259 
GetName(CFX_ByteString * space,CFX_ByteString * name)260 void CXML_Parser::GetName(CFX_ByteString* space, CFX_ByteString* name) {
261   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
262   if (IsEOF())
263     return;
264 
265   CFX_ByteTextBuf buf;
266   uint8_t ch;
267   do {
268     while (m_dwIndex < m_dwBufferSize) {
269       ch = m_pBuffer[m_dwIndex];
270       if (ch == ':') {
271         *space = buf.AsStringC();
272         buf.Clear();
273       } else if (g_FXCRT_XML_IsNameChar(ch)) {
274         buf.AppendChar(ch);
275       } else {
276         break;
277       }
278       m_dwIndex++;
279     }
280     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
281     if (m_dwIndex < m_dwBufferSize || IsEOF())
282       break;
283   } while (ReadNextBlock());
284   *name = buf.AsStringC();
285 }
286 
SkipLiterals(const CFX_ByteStringC & str)287 void CXML_Parser::SkipLiterals(const CFX_ByteStringC& str) {
288   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
289   if (IsEOF()) {
290     return;
291   }
292   int32_t i = 0, iLen = str.GetLength();
293   do {
294     while (m_dwIndex < m_dwBufferSize) {
295       if (str.GetAt(i) != m_pBuffer[m_dwIndex++]) {
296         i = 0;
297         continue;
298       }
299       i++;
300       if (i == iLen)
301         break;
302     }
303     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
304     if (i == iLen)
305       return;
306 
307     if (m_dwIndex < m_dwBufferSize || IsEOF())
308       break;
309   } while (ReadNextBlock());
310   while (!m_pDataAcc->IsEOF()) {
311     ReadNextBlock();
312     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize);
313   }
314   m_dwIndex = m_dwBufferSize;
315 }
316 
GetCharRef()317 uint32_t CXML_Parser::GetCharRef() {
318   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
319   if (IsEOF())
320     return 0;
321 
322   uint8_t ch;
323   int32_t iState = 0;
324   CFX_ByteTextBuf buf;
325   uint32_t code = 0;
326   do {
327     while (m_dwIndex < m_dwBufferSize) {
328       ch = m_pBuffer[m_dwIndex];
329       switch (iState) {
330         case 0:
331           if (ch == '#') {
332             m_dwIndex++;
333             iState = 2;
334             break;
335           }
336           iState = 1;
337         case 1:
338           m_dwIndex++;
339           if (ch == ';') {
340             CFX_ByteStringC ref = buf.AsStringC();
341             if (ref == "gt")
342               code = '>';
343             else if (ref == "lt")
344               code = '<';
345             else if (ref == "amp")
346               code = '&';
347             else if (ref == "apos")
348               code = '\'';
349             else if (ref == "quot")
350               code = '"';
351             iState = 10;
352             break;
353           }
354           buf.AppendByte(ch);
355           break;
356         case 2:
357           if (ch == 'x') {
358             m_dwIndex++;
359             iState = 4;
360             break;
361           }
362           iState = 3;
363         case 3:
364           m_dwIndex++;
365           if (ch == ';') {
366             iState = 10;
367             break;
368           }
369           if (g_FXCRT_XML_IsDigital(ch))
370             code = code * 10 + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch));
371           break;
372         case 4:
373           m_dwIndex++;
374           if (ch == ';') {
375             iState = 10;
376             break;
377           }
378           uint8_t nHex =
379               g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar;
380           if (nHex) {
381             if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) {
382               code =
383                   (code << 4) + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch));
384             } else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) {
385               code = (code << 4) + ch - 87;
386             } else {
387               code = (code << 4) + ch - 55;
388             }
389           }
390           break;
391       }
392       if (iState == 10)
393         break;
394     }
395     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
396     if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
397       break;
398     }
399   } while (ReadNextBlock());
400   return code;
401 }
402 
GetAttrValue(CFX_WideString & value)403 void CXML_Parser::GetAttrValue(CFX_WideString& value) {
404   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
405   if (IsEOF())
406     return;
407 
408   CFX_UTF8Decoder decoder;
409   uint8_t mark = 0, ch = 0;
410   do {
411     while (m_dwIndex < m_dwBufferSize) {
412       ch = m_pBuffer[m_dwIndex];
413       if (mark == 0) {
414         if (ch != '\'' && ch != '"')
415           return;
416 
417         mark = ch;
418         m_dwIndex++;
419         ch = 0;
420         continue;
421       }
422       m_dwIndex++;
423       if (ch == mark)
424         break;
425 
426       if (ch == '&') {
427         decoder.AppendChar(GetCharRef());
428         if (IsEOF()) {
429           value = decoder.GetResult();
430           return;
431         }
432       } else {
433         decoder.Input(ch);
434       }
435     }
436     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
437     if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF())
438       break;
439   } while (ReadNextBlock());
440   value = decoder.GetResult();
441 }
442 
GetTagName(bool bStartTag,bool * bEndTag,CFX_ByteString * space,CFX_ByteString * name)443 void CXML_Parser::GetTagName(bool bStartTag,
444                              bool* bEndTag,
445                              CFX_ByteString* space,
446                              CFX_ByteString* name) {
447   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
448   if (IsEOF())
449     return;
450 
451   *bEndTag = false;
452   uint8_t ch;
453   int32_t iState = bStartTag ? 1 : 0;
454   do {
455     while (m_dwIndex < m_dwBufferSize) {
456       ch = m_pBuffer[m_dwIndex];
457       switch (iState) {
458         case 0:
459           m_dwIndex++;
460           if (ch != '<')
461             break;
462 
463           iState = 1;
464           break;
465         case 1:
466           if (ch == '?') {
467             m_dwIndex++;
468             SkipLiterals("?>");
469             iState = 0;
470             break;
471           }
472           if (ch == '!') {
473             m_dwIndex++;
474             SkipLiterals("-->");
475             iState = 0;
476             break;
477           }
478           if (ch == '/') {
479             m_dwIndex++;
480             GetName(space, name);
481             *bEndTag = true;
482           } else {
483             GetName(space, name);
484             *bEndTag = false;
485           }
486           return;
487       }
488     }
489     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
490     if (m_dwIndex < m_dwBufferSize || IsEOF())
491       break;
492   } while (ReadNextBlock());
493 }
494 
ParseElement(CXML_Element * pParent,bool bStartTag)495 std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent,
496                                                         bool bStartTag) {
497   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
498   if (IsEOF())
499     return nullptr;
500 
501   CFX_ByteString tag_name;
502   CFX_ByteString tag_space;
503   bool bEndTag;
504   GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name);
505   if (tag_name.IsEmpty() || bEndTag)
506     return nullptr;
507 
508   auto pElement = pdfium::MakeUnique<CXML_Element>(
509       pParent, tag_space.AsStringC(), tag_name.AsStringC());
510   do {
511     CFX_ByteString attr_space;
512     CFX_ByteString attr_name;
513     while (m_dwIndex < m_dwBufferSize) {
514       SkipWhiteSpaces();
515       if (IsEOF())
516         break;
517 
518       if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex]))
519         break;
520 
521       GetName(&attr_space, &attr_name);
522       SkipWhiteSpaces();
523       if (IsEOF())
524         break;
525 
526       if (m_pBuffer[m_dwIndex] != '=')
527         break;
528 
529       m_dwIndex++;
530       SkipWhiteSpaces();
531       if (IsEOF())
532         break;
533 
534       CFX_WideString attr_value;
535       GetAttrValue(attr_value);
536       pElement->m_AttrMap.SetAt(attr_space, attr_name, attr_value);
537     }
538     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
539     if (m_dwIndex < m_dwBufferSize || IsEOF())
540       break;
541   } while (ReadNextBlock());
542   SkipWhiteSpaces();
543   if (IsEOF())
544     return pElement;
545 
546   uint8_t ch = m_pBuffer[m_dwIndex++];
547   if (ch == '/') {
548     m_dwIndex++;
549     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
550     return pElement;
551   }
552   if (ch != '>') {
553     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
554     return nullptr;
555   }
556   SkipWhiteSpaces();
557   if (IsEOF())
558     return pElement;
559 
560   CFX_UTF8Decoder decoder;
561   CFX_WideTextBuf content;
562   bool bCDATA = false;
563   int32_t iState = 0;
564   do {
565     while (m_dwIndex < m_dwBufferSize) {
566       ch = m_pBuffer[m_dwIndex++];
567       switch (iState) {
568         case 0:
569           if (ch == '<') {
570             iState = 1;
571           } else if (ch == '&') {
572             decoder.ClearStatus();
573             decoder.AppendChar(GetCharRef());
574           } else {
575             decoder.Input(ch);
576           }
577           break;
578         case 1:
579           if (ch == '!') {
580             iState = 2;
581           } else if (ch == '?') {
582             SkipLiterals("?>");
583             SkipWhiteSpaces();
584             iState = 0;
585           } else if (ch == '/') {
586             CFX_ByteString space;
587             CFX_ByteString name;
588             GetName(&space, &name);
589             SkipWhiteSpaces();
590             m_dwIndex++;
591             iState = 10;
592           } else {
593             content << decoder.GetResult();
594             CFX_WideString dataStr = content.MakeString();
595             if (!bCDATA)
596               dataStr.TrimRight(L" \t\r\n");
597 
598             InsertContentSegment(bCDATA, dataStr.AsStringC(), pElement.get());
599             content.Clear();
600             decoder.Clear();
601             bCDATA = false;
602             iState = 0;
603             m_dwIndex--;
604             std::unique_ptr<CXML_Element> pSubElement(
605                 ParseElement(pElement.get(), true));
606             if (!pSubElement)
607               break;
608 
609             pElement->m_Children.push_back(
610                 {CXML_Element::Element, pSubElement.release()});
611             SkipWhiteSpaces();
612           }
613           break;
614         case 2:
615           if (ch == '[') {
616             SkipLiterals("]]>");
617           } else if (ch == '-') {
618             m_dwIndex++;
619             SkipLiterals("-->");
620           } else {
621             SkipLiterals(">");
622           }
623           decoder.Clear();
624           SkipWhiteSpaces();
625           iState = 0;
626           break;
627       }
628       if (iState == 10) {
629         break;
630       }
631     }
632     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
633     if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF())
634       break;
635   } while (ReadNextBlock());
636   content << decoder.GetResult();
637   CFX_WideString dataStr = content.MakeString();
638   dataStr.TrimRight(L" \t\r\n");
639 
640   InsertContentSegment(bCDATA, dataStr.AsStringC(), pElement.get());
641   content.Clear();
642   decoder.Clear();
643   bCDATA = false;
644   return pElement;
645 }
646 
InsertContentSegment(bool bCDATA,const CFX_WideStringC & content,CXML_Element * pElement)647 void CXML_Parser::InsertContentSegment(bool bCDATA,
648                                        const CFX_WideStringC& content,
649                                        CXML_Element* pElement) {
650   if (content.IsEmpty())
651     return;
652 
653   CXML_Content* pContent = new CXML_Content;
654   pContent->Set(bCDATA, content);
655   pElement->m_Children.push_back({CXML_Element::Content, pContent});
656 }
657 
Parse(const void * pBuffer,size_t size)658 std::unique_ptr<CXML_Element> CXML_Element::Parse(const void* pBuffer,
659                                                   size_t size) {
660   CXML_Parser parser;
661   if (!parser.Init(static_cast<const uint8_t*>(pBuffer), size))
662     return nullptr;
663   return parser.ParseElement(nullptr, false);
664 }
665 
// Creates an element named |qSpace|:|tagname| whose parent is |pParent|
// (which may be null, as for the root created by Parse()).
CXML_Element::CXML_Element(const CXML_Element* pParent,
                           const CFX_ByteStringC& qSpace,
                           const CFX_ByteStringC& tagname)
    : m_pParent(pParent),
      m_QSpaceName(qSpace),
      m_TagName(tagname) {
}
670 
~CXML_Element()671 CXML_Element::~CXML_Element() {
672   Empty();
673 }
674 
Empty()675 void CXML_Element::Empty() {
676   RemoveChildren();
677 }
RemoveChildren()678 void CXML_Element::RemoveChildren() {
679   for (const ChildRecord& record : m_Children) {
680     if (record.type == Content) {
681       delete static_cast<CXML_Content*>(record.child);
682     } else if (record.type == Element) {
683       CXML_Element* child = static_cast<CXML_Element*>(record.child);
684       child->RemoveChildren();
685       delete child;
686     }
687   }
688   m_Children.clear();
689 }
GetTagName(bool bQualified) const690 CFX_ByteString CXML_Element::GetTagName(bool bQualified) const {
691   if (!bQualified || m_QSpaceName.IsEmpty()) {
692     return m_TagName;
693   }
694   CFX_ByteString bsTag = m_QSpaceName;
695   bsTag += ":";
696   bsTag += m_TagName;
697   return bsTag;
698 }
699 
GetNamespace(bool bQualified) const700 CFX_ByteString CXML_Element::GetNamespace(bool bQualified) const {
701   return bQualified ? m_QSpaceName : GetNamespaceURI(m_QSpaceName);
702 }
703 
// Resolves the prefix |qName| to a namespace URI by walking from this element
// up through its ancestors. An empty |qName| looks for a plain "xmlns"
// attribute, anything else for "xmlns:<qName>". Returns the UTF-8 encoded
// attribute value of the nearest declaration, or an empty string if none.
CFX_ByteString CXML_Element::GetNamespaceURI(
    const CFX_ByteString& qName) const {
  for (const CXML_Element* pNode = this; pNode; pNode = pNode->GetParent()) {
    const CFX_WideString* pwsURI;
    if (qName.IsEmpty())
      pwsURI = pNode->m_AttrMap.Lookup("", "xmlns");
    else
      pwsURI = pNode->m_AttrMap.Lookup("xmlns", qName);

    if (pwsURI)
      return pwsURI->UTF8Encode();
  }
  return CFX_ByteString();
}
720 
GetAttrByIndex(int index,CFX_ByteString & space,CFX_ByteString & name,CFX_WideString & value) const721 void CXML_Element::GetAttrByIndex(int index,
722                                   CFX_ByteString& space,
723                                   CFX_ByteString& name,
724                                   CFX_WideString& value) const {
725   if (index < 0 || index >= m_AttrMap.GetSize())
726     return;
727 
728   CXML_AttrItem& item = m_AttrMap.GetAt(index);
729   space = item.m_QSpaceName;
730   name = item.m_AttrName;
731   value = item.m_Value;
732 }
733 
HasAttr(const CFX_ByteStringC & name) const734 bool CXML_Element::HasAttr(const CFX_ByteStringC& name) const {
735   CFX_ByteStringC bsSpace;
736   CFX_ByteStringC bsName;
737   FX_XML_SplitQualifiedName(name, bsSpace, bsName);
738   return !!m_AttrMap.Lookup(CFX_ByteString(bsSpace), CFX_ByteString(bsName));
739 }
740 
GetAttrValue(const CFX_ByteStringC & name,CFX_WideString & attribute) const741 bool CXML_Element::GetAttrValue(const CFX_ByteStringC& name,
742                                 CFX_WideString& attribute) const {
743   CFX_ByteStringC bsSpace;
744   CFX_ByteStringC bsName;
745   FX_XML_SplitQualifiedName(name, bsSpace, bsName);
746   return GetAttrValue(bsSpace, bsName, attribute);
747 }
748 
GetAttrValue(const CFX_ByteStringC & space,const CFX_ByteStringC & name,CFX_WideString & attribute) const749 bool CXML_Element::GetAttrValue(const CFX_ByteStringC& space,
750                                 const CFX_ByteStringC& name,
751                                 CFX_WideString& attribute) const {
752   const CFX_WideString* pValue =
753       m_AttrMap.Lookup(CFX_ByteString(space), CFX_ByteString(name));
754   if (!pValue)
755     return false;
756 
757   attribute = *pValue;
758   return true;
759 }
760 
GetAttrInteger(const CFX_ByteStringC & name,int & attribute) const761 bool CXML_Element::GetAttrInteger(const CFX_ByteStringC& name,
762                                   int& attribute) const {
763   CFX_ByteStringC bsSpace;
764   CFX_ByteStringC bsName;
765   FX_XML_SplitQualifiedName(name, bsSpace, bsName);
766   const CFX_WideString* pwsValue =
767       m_AttrMap.Lookup(CFX_ByteString(bsSpace), CFX_ByteString(bsName));
768   if (!pwsValue)
769     return false;
770 
771   attribute = pwsValue->GetInteger();
772   return true;
773 }
774 
GetAttrInteger(const CFX_ByteStringC & space,const CFX_ByteStringC & name,int & attribute) const775 bool CXML_Element::GetAttrInteger(const CFX_ByteStringC& space,
776                                   const CFX_ByteStringC& name,
777                                   int& attribute) const {
778   const CFX_WideString* pwsValue =
779       m_AttrMap.Lookup(CFX_ByteString(space), CFX_ByteString(name));
780   if (!pwsValue)
781     return false;
782 
783   attribute = pwsValue->GetInteger();
784   return true;
785 }
786 
GetAttrFloat(const CFX_ByteStringC & name,FX_FLOAT & attribute) const787 bool CXML_Element::GetAttrFloat(const CFX_ByteStringC& name,
788                                 FX_FLOAT& attribute) const {
789   CFX_ByteStringC bsSpace;
790   CFX_ByteStringC bsName;
791   FX_XML_SplitQualifiedName(name, bsSpace, bsName);
792   return GetAttrFloat(bsSpace, bsName, attribute);
793 }
794 
GetAttrFloat(const CFX_ByteStringC & space,const CFX_ByteStringC & name,FX_FLOAT & attribute) const795 bool CXML_Element::GetAttrFloat(const CFX_ByteStringC& space,
796                                 const CFX_ByteStringC& name,
797                                 FX_FLOAT& attribute) const {
798   const CFX_WideString* pValue =
799       m_AttrMap.Lookup(CFX_ByteString(space), CFX_ByteString(name));
800   if (!pValue)
801     return false;
802 
803   attribute = pValue->GetFloat();
804   return true;
805 }
806 
GetChildType(uint32_t index) const807 CXML_Element::ChildType CXML_Element::GetChildType(uint32_t index) const {
808   return index < m_Children.size() ? m_Children[index].type : Invalid;
809 }
810 
GetContent(uint32_t index) const811 CFX_WideString CXML_Element::GetContent(uint32_t index) const {
812   if (index < m_Children.size() && m_Children[index].type == Content) {
813     CXML_Content* pContent =
814         static_cast<CXML_Content*>(m_Children[index].child);
815     if (pContent)
816       return pContent->m_Content;
817   }
818   return CFX_WideString();
819 }
820 
GetElement(uint32_t index) const821 CXML_Element* CXML_Element::GetElement(uint32_t index) const {
822   if (index < m_Children.size() && m_Children[index].type == Element)
823     return static_cast<CXML_Element*>(m_Children[index].child);
824   return nullptr;
825 }
826 
CountElements(const CFX_ByteStringC & space,const CFX_ByteStringC & tag) const827 uint32_t CXML_Element::CountElements(const CFX_ByteStringC& space,
828                                      const CFX_ByteStringC& tag) const {
829   int count = 0;
830   for (const ChildRecord& record : m_Children) {
831     if (record.type != Element)
832       continue;
833 
834     CXML_Element* pKid = static_cast<CXML_Element*>(record.child);
835     if ((space.IsEmpty() || pKid->m_QSpaceName == space) &&
836         pKid->m_TagName == tag) {
837       count++;
838     }
839   }
840   return count;
841 }
842 
GetElement(const CFX_ByteStringC & space,const CFX_ByteStringC & tag,int index) const843 CXML_Element* CXML_Element::GetElement(const CFX_ByteStringC& space,
844                                        const CFX_ByteStringC& tag,
845                                        int index) const {
846   if (index < 0)
847     return nullptr;
848 
849   for (const ChildRecord& record : m_Children) {
850     if (record.type != Element)
851       continue;
852 
853     CXML_Element* pKid = static_cast<CXML_Element*>(record.child);
854     if ((space.IsEmpty() || pKid->m_QSpaceName == space) &&
855         pKid->m_TagName == tag) {
856       if (index-- == 0)
857         return pKid;
858     }
859   }
860   return nullptr;
861 }
862 
FindElement(CXML_Element * pChild) const863 uint32_t CXML_Element::FindElement(CXML_Element* pChild) const {
864   int index = 0;
865   for (const ChildRecord& record : m_Children) {
866     if (record.type == Element &&
867         static_cast<CXML_Element*>(record.child) == pChild) {
868       return index;
869     }
870     ++index;
871   }
872   return (uint32_t)-1;
873 }
874 
Matches(const CFX_ByteString & space,const CFX_ByteString & name) const875 bool CXML_AttrItem::Matches(const CFX_ByteString& space,
876                             const CFX_ByteString& name) const {
877   return (space.IsEmpty() || m_QSpaceName == space) && m_AttrName == name;
878 }
879 
CXML_AttrMap()880 CXML_AttrMap::CXML_AttrMap() {}
881 
~CXML_AttrMap()882 CXML_AttrMap::~CXML_AttrMap() {}
883 
Lookup(const CFX_ByteString & space,const CFX_ByteString & name) const884 const CFX_WideString* CXML_AttrMap::Lookup(const CFX_ByteString& space,
885                                            const CFX_ByteString& name) const {
886   if (!m_pMap)
887     return nullptr;
888 
889   for (const auto& item : *m_pMap) {
890     if (item.Matches(space, name))
891       return &item.m_Value;
892   }
893   return nullptr;
894 }
895 
SetAt(const CFX_ByteString & space,const CFX_ByteString & name,const CFX_WideString & value)896 void CXML_AttrMap::SetAt(const CFX_ByteString& space,
897                          const CFX_ByteString& name,
898                          const CFX_WideString& value) {
899   if (!m_pMap)
900     m_pMap = pdfium::MakeUnique<std::vector<CXML_AttrItem>>();
901 
902   for (CXML_AttrItem& item : *m_pMap) {
903     if (item.Matches(space, name)) {
904       item.m_Value = value;
905       return;
906     }
907   }
908 
909   m_pMap->push_back({space, name, CFX_WideString(value)});
910 }
911 
GetSize() const912 int CXML_AttrMap::GetSize() const {
913   return m_pMap ? pdfium::CollectionSize<int>(*m_pMap) : 0;
914 }
915 
GetAt(int index) const916 CXML_AttrItem& CXML_AttrMap::GetAt(int index) const {
917   return (*m_pMap)[index];
918 }
919