1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include <algorithm>
8 #include <memory>
9 #include <sstream>
10 #include <string>
11 #include <utility>
12 #include <vector>
13 
14 #include "core/fxcrt/cfx_utf8decoder.h"
15 #include "core/fxcrt/cfx_widetextbuf.h"
16 #include "core/fxcrt/fx_extension.h"
17 #include "core/fxcrt/xml/cxml_content.h"
18 #include "core/fxcrt/xml/cxml_element.h"
19 #include "core/fxcrt/xml/cxml_parser.h"
20 #include "third_party/base/ptr_util.h"
21 #include "third_party/base/stl_util.h"
22 
23 namespace {
24 
// Bit flags stored per byte in g_FXCRT_XML_ByteTypes. They are declared as
// typed enumerators rather than preprocessor macros so that they obey normal
// scoping rules; names and values are unchanged.
//
// GetCharRef() masks a table entry with FXCRTM_XML_CHARTYPE_HexChar and
// compares the result against HexDigital / HexLowerLetter / HexUpperLetter,
// which is why HexUpperLetter has the same value as the mask itself.
enum : uint8_t {
  FXCRTM_XML_CHARTYPE_Normal = 0x00,
  FXCRTM_XML_CHARTYPE_SpaceChar = 0x01,
  FXCRTM_XML_CHARTYPE_Letter = 0x02,
  FXCRTM_XML_CHARTYPE_Digital = 0x04,
  FXCRTM_XML_CHARTYPE_NameIntro = 0x08,
  FXCRTM_XML_CHARTYPE_NameChar = 0x10,
  FXCRTM_XML_CHARTYPE_HexDigital = 0x20,
  FXCRTM_XML_CHARTYPE_HexLowerLetter = 0x40,
  FXCRTM_XML_CHARTYPE_HexUpperLetter = 0x60,
  FXCRTM_XML_CHARTYPE_HexChar = 0x60,
};
35 
// Per-byte classification table, indexed by the raw byte value. Each entry is
// an OR of the FXCRTM_XML_CHARTYPE_* flags. One row below covers 16 byte
// values; the comment at the start of a row is the index of its first entry.
const uint8_t g_FXCRT_XML_ByteTypes[256] = {
    /* 0x00 */ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
               0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    /* 0x10 */ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
               0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    /* 0x20 */ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
               0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,
    /* 0x30 */ 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34,
               0x34, 0x34, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 0x40 */ 0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0x50 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18,
    /* 0x60 */ 0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0x70 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 0x80 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0x90 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0xA0 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0xB0 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0xC0 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0xD0 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0xE0 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    /* 0xF0 */ 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
               0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x01, 0x01,
};

// Deepest element nesting ParseElementInternal() will accept; a call made
// with a larger depth returns nullptr instead of recursing further.
constexpr int kMaxDepth = 1024;
62 
g_FXCRT_XML_IsWhiteSpace(uint8_t ch)63 bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) {
64   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar);
65 }
66 
g_FXCRT_XML_IsDigital(uint8_t ch)67 bool g_FXCRT_XML_IsDigital(uint8_t ch) {
68   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital);
69 }
70 
g_FXCRT_XML_IsNameIntro(uint8_t ch)71 bool g_FXCRT_XML_IsNameIntro(uint8_t ch) {
72   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro);
73 }
74 
g_FXCRT_XML_IsNameChar(uint8_t ch)75 bool g_FXCRT_XML_IsNameChar(uint8_t ch) {
76   return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar);
77 }
78 
79 }  // namespace
80 
CXML_Parser()81 CXML_Parser::CXML_Parser()
82     : m_nOffset(0),
83       m_pBuffer(nullptr),
84       m_dwBufferSize(0),
85       m_nBufferOffset(0),
86       m_dwIndex(0) {}
87 
~CXML_Parser()88 CXML_Parser::~CXML_Parser() {}
89 
Init(const uint8_t * pBuffer,size_t size)90 bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) {
91   m_pDataAcc = pdfium::MakeUnique<CXML_DataBufAcc>(pBuffer, size);
92   m_nOffset = 0;
93   return ReadNextBlock();
94 }
95 
ReadNextBlock()96 bool CXML_Parser::ReadNextBlock() {
97   if (!m_pDataAcc->ReadNextBlock())
98     return false;
99 
100   m_pBuffer = m_pDataAcc->GetBlockBuffer();
101   m_dwBufferSize = m_pDataAcc->GetBlockSize();
102   m_nBufferOffset = 0;
103   m_dwIndex = 0;
104   return m_dwBufferSize > 0;
105 }
106 
IsEOF()107 bool CXML_Parser::IsEOF() {
108   return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize;
109 }
110 
SkipWhiteSpaces()111 void CXML_Parser::SkipWhiteSpaces() {
112   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
113   if (IsEOF())
114     return;
115 
116   do {
117     while (m_dwIndex < m_dwBufferSize &&
118            g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) {
119       m_dwIndex++;
120     }
121     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
122     if (m_dwIndex < m_dwBufferSize || IsEOF())
123       break;
124   } while (ReadNextBlock());
125 }
126 
GetName(ByteString * space,ByteString * name)127 void CXML_Parser::GetName(ByteString* space, ByteString* name) {
128   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
129   if (IsEOF())
130     return;
131 
132   std::ostringstream buf;
133   do {
134     while (m_dwIndex < m_dwBufferSize) {
135       uint8_t ch = m_pBuffer[m_dwIndex];
136       if (ch == ':') {
137         *space = ByteString(buf);
138         buf.str("");
139       } else if (g_FXCRT_XML_IsNameChar(ch)) {
140         buf << static_cast<char>(ch);
141       } else {
142         break;
143       }
144       m_dwIndex++;
145     }
146     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
147     if (m_dwIndex < m_dwBufferSize || IsEOF())
148       break;
149   } while (ReadNextBlock());
150   *name = ByteString(buf);
151 }
152 
SkipLiterals(const ByteStringView & str)153 void CXML_Parser::SkipLiterals(const ByteStringView& str) {
154   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
155   if (IsEOF()) {
156     return;
157   }
158   int32_t i = 0, iLen = str.GetLength();
159   do {
160     while (m_dwIndex < m_dwBufferSize) {
161       if (str[i] != m_pBuffer[m_dwIndex++]) {
162         i = 0;
163         continue;
164       }
165       i++;
166       if (i == iLen)
167         break;
168     }
169     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
170     if (i == iLen)
171       return;
172 
173     if (m_dwIndex < m_dwBufferSize || IsEOF())
174       break;
175   } while (ReadNextBlock());
176   while (!m_pDataAcc->IsEOF()) {
177     ReadNextBlock();
178     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize);
179   }
180   m_dwIndex = m_dwBufferSize;
181 }
182 
GetCharRef()183 uint32_t CXML_Parser::GetCharRef() {
184   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
185   if (IsEOF())
186     return 0;
187 
188   uint8_t ch;
189   int32_t iState = 0;
190   std::ostringstream buf;
191   uint32_t code = 0;
192   do {
193     while (m_dwIndex < m_dwBufferSize) {
194       ch = m_pBuffer[m_dwIndex];
195       switch (iState) {
196         case 0:
197           if (ch == '#') {
198             m_dwIndex++;
199             iState = 2;
200             break;
201           }
202           iState = 1;
203         case 1:
204           m_dwIndex++;
205           if (ch == ';') {
206             std::string ref = buf.str();
207             if (ref == "gt")
208               code = '>';
209             else if (ref == "lt")
210               code = '<';
211             else if (ref == "amp")
212               code = '&';
213             else if (ref == "apos")
214               code = '\'';
215             else if (ref == "quot")
216               code = '"';
217             iState = 10;
218             break;
219           }
220           buf << static_cast<char>(ch);
221           break;
222         case 2:
223           if (ch == 'x') {
224             m_dwIndex++;
225             iState = 4;
226             break;
227           }
228           iState = 3;
229         case 3:
230           m_dwIndex++;
231           if (ch == ';') {
232             iState = 10;
233             break;
234           }
235           if (g_FXCRT_XML_IsDigital(ch))
236             code = code * 10 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
237           break;
238         case 4:
239           m_dwIndex++;
240           if (ch == ';') {
241             iState = 10;
242             break;
243           }
244           uint8_t nHex =
245               g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar;
246           if (nHex) {
247             if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) {
248               code = (code << 4) +
249                      FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
250             } else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) {
251               code = (code << 4) + ch - 87;
252             } else {
253               code = (code << 4) + ch - 55;
254             }
255           }
256           break;
257       }
258       if (iState == 10)
259         break;
260     }
261     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
262     if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
263       break;
264     }
265   } while (ReadNextBlock());
266   return code;
267 }
268 
GetAttrValue()269 WideString CXML_Parser::GetAttrValue() {
270   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
271   if (IsEOF())
272     return WideString();
273 
274   CFX_UTF8Decoder decoder;
275   uint8_t mark = 0;
276   uint8_t ch = 0;
277   do {
278     while (m_dwIndex < m_dwBufferSize) {
279       ch = m_pBuffer[m_dwIndex];
280       if (mark == 0) {
281         if (ch != '\'' && ch != '"')
282           return WideString();
283 
284         mark = ch;
285         m_dwIndex++;
286         ch = 0;
287         continue;
288       }
289       m_dwIndex++;
290       if (ch == mark)
291         break;
292 
293       if (ch == '&') {
294         decoder.AppendCodePoint(GetCharRef());
295         if (IsEOF())
296           return WideString(decoder.GetResult());
297       } else {
298         decoder.Input(ch);
299       }
300     }
301     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
302     if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF())
303       break;
304   } while (ReadNextBlock());
305   return WideString(decoder.GetResult());
306 }
307 
GetTagName(bool bStartTag,bool * bEndTag,ByteString * space,ByteString * name)308 void CXML_Parser::GetTagName(bool bStartTag,
309                              bool* bEndTag,
310                              ByteString* space,
311                              ByteString* name) {
312   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
313   if (IsEOF())
314     return;
315 
316   *bEndTag = false;
317   uint8_t ch;
318   int32_t iState = bStartTag ? 1 : 0;
319   do {
320     while (m_dwIndex < m_dwBufferSize) {
321       ch = m_pBuffer[m_dwIndex];
322       switch (iState) {
323         case 0:
324           m_dwIndex++;
325           if (ch != '<')
326             break;
327 
328           iState = 1;
329           break;
330         case 1:
331           if (ch == '?') {
332             m_dwIndex++;
333             SkipLiterals("?>");
334             iState = 0;
335             break;
336           }
337           if (ch == '!') {
338             m_dwIndex++;
339             SkipLiterals("-->");
340             iState = 0;
341             break;
342           }
343           if (ch == '/') {
344             m_dwIndex++;
345             GetName(space, name);
346             *bEndTag = true;
347           } else {
348             GetName(space, name);
349             *bEndTag = false;
350           }
351           return;
352       }
353     }
354     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
355     if (m_dwIndex < m_dwBufferSize || IsEOF())
356       break;
357   } while (ReadNextBlock());
358 }
359 
ParseElement(CXML_Element * pParent,bool bStartTag)360 std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent,
361                                                         bool bStartTag) {
362   return ParseElementInternal(pParent, bStartTag, 0);
363 }
364 
ParseElementInternal(CXML_Element * pParent,bool bStartTag,int nDepth)365 std::unique_ptr<CXML_Element> CXML_Parser::ParseElementInternal(
366     CXML_Element* pParent,
367     bool bStartTag,
368     int nDepth) {
369   if (nDepth > kMaxDepth)
370     return nullptr;
371 
372   m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
373   if (IsEOF())
374     return nullptr;
375 
376   ByteString tag_name;
377   ByteString tag_space;
378   bool bEndTag;
379   GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name);
380   if (tag_name.IsEmpty() || bEndTag)
381     return nullptr;
382 
383   auto pElement = pdfium::MakeUnique<CXML_Element>(
384       pParent, tag_space.AsStringView(), tag_name.AsStringView());
385   do {
386     ByteString attr_space;
387     ByteString attr_name;
388     while (m_dwIndex < m_dwBufferSize) {
389       SkipWhiteSpaces();
390       if (IsEOF())
391         break;
392 
393       if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex]))
394         break;
395 
396       GetName(&attr_space, &attr_name);
397       SkipWhiteSpaces();
398       if (IsEOF())
399         break;
400 
401       if (m_pBuffer[m_dwIndex] != '=')
402         break;
403 
404       m_dwIndex++;
405       SkipWhiteSpaces();
406       if (IsEOF())
407         break;
408 
409       WideString attr_value = GetAttrValue();
410       pElement->SetAttribute(attr_space, attr_name, attr_value);
411     }
412     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
413     if (m_dwIndex < m_dwBufferSize || IsEOF())
414       break;
415   } while (ReadNextBlock());
416   SkipWhiteSpaces();
417   if (IsEOF())
418     return pElement;
419 
420   uint8_t ch = m_pBuffer[m_dwIndex++];
421   if (ch == '/') {
422     m_dwIndex++;
423     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
424     return pElement;
425   }
426   if (ch != '>') {
427     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
428     return nullptr;
429   }
430   SkipWhiteSpaces();
431   if (IsEOF())
432     return pElement;
433 
434   CFX_UTF8Decoder decoder;
435   CFX_WideTextBuf content;
436   bool bCDATA = false;
437   int32_t iState = 0;
438   do {
439     while (m_dwIndex < m_dwBufferSize) {
440       ch = m_pBuffer[m_dwIndex++];
441       switch (iState) {
442         case 0:
443           if (ch == '<') {
444             iState = 1;
445           } else if (ch == '&') {
446             decoder.ClearStatus();
447             decoder.AppendCodePoint(GetCharRef());
448           } else {
449             decoder.Input(ch);
450           }
451           break;
452         case 1:
453           if (ch == '!') {
454             iState = 2;
455           } else if (ch == '?') {
456             SkipLiterals("?>");
457             SkipWhiteSpaces();
458             iState = 0;
459           } else if (ch == '/') {
460             ByteString space;
461             ByteString name;
462             GetName(&space, &name);
463             SkipWhiteSpaces();
464             m_dwIndex++;
465             iState = 10;
466           } else {
467             content << decoder.GetResult();
468             WideString dataStr = content.MakeString();
469             if (!bCDATA)
470               dataStr.TrimRight(L" \t\r\n");
471 
472             InsertContentSegment(bCDATA, dataStr.AsStringView(),
473                                  pElement.get());
474             content.Clear();
475             decoder.Clear();
476             bCDATA = false;
477             iState = 0;
478             m_dwIndex--;
479             std::unique_ptr<CXML_Element> pSubElement =
480                 ParseElementInternal(pElement.get(), true, nDepth + 1);
481             if (!pSubElement)
482               break;
483 
484             pElement->AppendChild(std::move(pSubElement));
485             SkipWhiteSpaces();
486           }
487           break;
488         case 2:
489           if (ch == '[') {
490             SkipLiterals("]]>");
491           } else if (ch == '-') {
492             m_dwIndex++;
493             SkipLiterals("-->");
494           } else {
495             SkipLiterals(">");
496           }
497           decoder.Clear();
498           SkipWhiteSpaces();
499           iState = 0;
500           break;
501       }
502       if (iState == 10) {
503         break;
504       }
505     }
506     m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
507     if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF())
508       break;
509   } while (ReadNextBlock());
510   content << decoder.GetResult();
511   WideString dataStr = content.MakeString();
512   dataStr.TrimRight(L" \t\r\n");
513 
514   InsertContentSegment(bCDATA, dataStr.AsStringView(), pElement.get());
515   content.Clear();
516   decoder.Clear();
517   bCDATA = false;
518   return pElement;
519 }
520 
InsertContentSegment(bool bCDATA,const WideStringView & content,CXML_Element * pElement)521 void CXML_Parser::InsertContentSegment(bool bCDATA,
522                                        const WideStringView& content,
523                                        CXML_Element* pElement) {
524   if (content.IsEmpty())
525     return;
526 
527   pElement->AppendChild(pdfium::MakeUnique<CXML_Content>(bCDATA, content));
528 }
529