1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/parser/cpdf_document.h"
8 
9 #include <set>
10 #include <utility>
11 #include <vector>
12 
13 #include "build/build_config.h"
14 #include "core/fpdfapi/parser/cpdf_array.h"
15 #include "core/fpdfapi/parser/cpdf_dictionary.h"
16 #include "core/fpdfapi/parser/cpdf_linearized_header.h"
17 #include "core/fpdfapi/parser/cpdf_name.h"
18 #include "core/fpdfapi/parser/cpdf_number.h"
19 #include "core/fpdfapi/parser/cpdf_parser.h"
20 #include "core/fpdfapi/parser/cpdf_read_validator.h"
21 #include "core/fpdfapi/parser/cpdf_reference.h"
22 #include "core/fpdfapi/parser/cpdf_stream.h"
23 #include "core/fpdfapi/parser/cpdf_stream_acc.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fxcodec/jbig2/JBig2_DocumentContext.h"
26 #include "core/fxcrt/fx_codepage.h"
27 #include "third_party/base/ptr_util.h"
28 #include "third_party/base/stl_util.h"
29 
30 namespace {
31 
// Page-tree nesting depth at which the traversal routines in this file stop
// descending.
constexpr int kMaxPageLevel = 1024;
33 
CountPages(CPDF_Dictionary * pPages,std::set<CPDF_Dictionary * > * visited_pages)34 int CountPages(CPDF_Dictionary* pPages,
35                std::set<CPDF_Dictionary*>* visited_pages) {
36   int count = pPages->GetIntegerFor("Count");
37   if (count > 0 && count < CPDF_Document::kPageMaxNum)
38     return count;
39   CPDF_Array* pKidList = pPages->GetArrayFor("Kids");
40   if (!pKidList)
41     return 0;
42   count = 0;
43   for (size_t i = 0; i < pKidList->size(); i++) {
44     CPDF_Dictionary* pKid = pKidList->GetDictAt(i);
45     if (!pKid || pdfium::ContainsKey(*visited_pages, pKid))
46       continue;
47     if (pKid->KeyExist("Kids")) {
48       // Use |visited_pages| to help detect circular references of pages.
49       pdfium::ScopedSetInsertion<CPDF_Dictionary*> local_add(visited_pages,
50                                                              pKid);
51       count += CountPages(pKid, visited_pages);
52     } else {
53       // This page is a leaf node.
54       count++;
55     }
56   }
57   pPages->SetNewFor<CPDF_Number>("Count", count);
58   return count;
59 }
60 
61 }  // namespace
62 
CPDF_Document(std::unique_ptr<RenderDataIface> pRenderData,std::unique_ptr<PageDataIface> pPageData)63 CPDF_Document::CPDF_Document(std::unique_ptr<RenderDataIface> pRenderData,
64                              std::unique_ptr<PageDataIface> pPageData)
65     : m_pDocRender(std::move(pRenderData)),
66       m_pDocPage(std::move(pPageData)),
67       m_StockFontClearer(m_pDocPage.get()) {
68   m_pDocRender->SetDocument(this);
69   m_pDocPage->SetDocument(this);
70 }
71 
// Defaulted: all cleanup is performed by the members' own destructors.
CPDF_Document::~CPDF_Document() = default;
73 
ParseIndirectObject(uint32_t objnum)74 RetainPtr<CPDF_Object> CPDF_Document::ParseIndirectObject(uint32_t objnum) {
75   return m_pParser ? m_pParser->ParseIndirectObject(objnum) : nullptr;
76 }
77 
TryInit()78 bool CPDF_Document::TryInit() {
79   SetLastObjNum(m_pParser->GetLastObjNum());
80 
81   CPDF_Object* pRootObj = GetOrParseIndirectObject(m_pParser->GetRootObjNum());
82   if (pRootObj)
83     m_pRootDict.Reset(pRootObj->GetDict());
84 
85   LoadPages();
86   return GetRoot() && GetPageCount() > 0;
87 }
88 
LoadDoc(const RetainPtr<IFX_SeekableReadStream> & pFileAccess,const char * password)89 CPDF_Parser::Error CPDF_Document::LoadDoc(
90     const RetainPtr<IFX_SeekableReadStream>& pFileAccess,
91     const char* password) {
92   if (!m_pParser)
93     SetParser(pdfium::MakeUnique<CPDF_Parser>(this));
94 
95   return HandleLoadResult(m_pParser->StartParse(pFileAccess, password));
96 }
97 
LoadLinearizedDoc(const RetainPtr<CPDF_ReadValidator> & validator,const char * password)98 CPDF_Parser::Error CPDF_Document::LoadLinearizedDoc(
99     const RetainPtr<CPDF_ReadValidator>& validator,
100     const char* password) {
101   if (!m_pParser)
102     SetParser(pdfium::MakeUnique<CPDF_Parser>(this));
103 
104   return HandleLoadResult(m_pParser->StartLinearizedParse(validator, password));
105 }
106 
LoadPages()107 void CPDF_Document::LoadPages() {
108   const CPDF_LinearizedHeader* linearized_header =
109       m_pParser->GetLinearizedHeader();
110   if (!linearized_header) {
111     m_PageList.resize(RetrievePageCount());
112     return;
113   }
114 
115   m_PageList.resize(linearized_header->GetPageCount());
116   ASSERT(linearized_header->GetFirstPageNo() < m_PageList.size());
117   m_PageList[linearized_header->GetFirstPageNo()] =
118       linearized_header->GetFirstPageObjNum();
119 }
120 
// Resumable depth-first walk over the page tree that fills |m_PageList| with
// the object numbers of the pages it passes and returns the dictionary of the
// page on which the countdown in |*nPagesToGo| reaches zero.
//
// |iPage| is the index of the page being looked up. |*nPagesToGo| is the
// number of pages still to be consumed and is decremented in place. |level|
// is the index into |m_pTreeTraversal| of the node to process.
//
// |m_pTreeTraversal| holds one (node, index of next kid to visit) pair per
// tree level. A node pops its own entry once it is exhausted, which is how
// the caller one level up detects that the child finished.
//
// Returns nullptr if |*nPagesToGo| is negative, if the depth limit has been
// hit, or if the target page was not reached.
CPDF_Dictionary* CPDF_Document::TraversePDFPages(int iPage,
                                                 int* nPagesToGo,
                                                 size_t level) {
  if (*nPagesToGo < 0 || m_bReachedMaxPageLevel)
    return nullptr;

  CPDF_Dictionary* pPages = m_pTreeTraversal[level].first;
  CPDF_Array* pKidList = pPages->GetArrayFor("Kids");
  if (!pKidList) {
    // No "Kids" array: this node is itself treated as a page. It only counts
    // as the result when exactly one page remained to be consumed.
    m_pTreeTraversal.pop_back();
    if (*nPagesToGo != 1)
      return nullptr;
    m_PageList[iPage] = pPages->GetObjNum();
    return pPages;
  }
  if (level >= kMaxPageLevel) {
    // Too deep: abandon this node and flag the condition so that every caller
    // up the recursion stops as well.
    m_pTreeTraversal.pop_back();
    m_bReachedMaxPageLevel = true;
    return nullptr;
  }
  CPDF_Dictionary* page = nullptr;
  // Resume from the kid index saved for this level.
  for (size_t i = m_pTreeTraversal[level].second; i < pKidList->size(); i++) {
    if (*nPagesToGo == 0)
      break;
    // NOTE(review): presumably turns an inline entry into an indirect object
    // of this document so that GetObjNum() below is meaningful -- confirm
    // against CPDF_Array.
    pKidList->ConvertToIndirectObjectAt(i, this);
    CPDF_Dictionary* pKid = pKidList->GetDictAt(i);
    if (!pKid) {
      // An entry that is not a dictionary still consumes one page slot.
      (*nPagesToGo)--;
      m_pTreeTraversal[level].second++;
      continue;
    }
    if (pKid == pPages) {
      // A node listing itself as a kid is skipped without consuming a slot.
      m_pTreeTraversal[level].second++;
      continue;
    }
    if (!pKid->KeyExist("Kids")) {
      // Leaf page: record its object number at the index it corresponds to.
      m_PageList[iPage - (*nPagesToGo) + 1] = pKid->GetObjNum();
      (*nPagesToGo)--;
      m_pTreeTraversal[level].second++;
      if (*nPagesToGo == 0) {
        page = pKid;
        break;
      }
    } else {
      // If the vector has size level+1, the child is not in yet
      if (m_pTreeTraversal.size() == level + 1)
        m_pTreeTraversal.push_back(std::make_pair(pKid, 0));
      // Now m_pTreeTraversal[level+1] should exist and be equal to pKid.
      CPDF_Dictionary* pageKid = TraversePDFPages(iPage, nPagesToGo, level + 1);
      // Check if child was completely processed, i.e. it popped itself out
      if (m_pTreeTraversal.size() == level + 1)
        m_pTreeTraversal[level].second++;
      // If child did not finish, no pages to go, or max level reached, end
      if (m_pTreeTraversal.size() != level + 1 || *nPagesToGo == 0 ||
          m_bReachedMaxPageLevel) {
        page = pageKid;
        break;
      }
    }
  }
  // All kids consumed: remove this level so the parent sees it as finished.
  if (m_pTreeTraversal[level].second == pKidList->size())
    m_pTreeTraversal.pop_back();
  return page;
}
185 
ResetTraversal()186 void CPDF_Document::ResetTraversal() {
187   m_iNextPageToTraverse = 0;
188   m_bReachedMaxPageLevel = false;
189   m_pTreeTraversal.clear();
190 }
191 
// Takes ownership of |pParser|. Asserts that no parser has been set yet.
void CPDF_Document::SetParser(std::unique_ptr<CPDF_Parser> pParser) {
  ASSERT(!m_pParser);
  m_pParser = std::move(pParser);
}
196 
HandleLoadResult(CPDF_Parser::Error error)197 CPDF_Parser::Error CPDF_Document::HandleLoadResult(CPDF_Parser::Error error) {
198   if (error == CPDF_Parser::SUCCESS)
199     m_bHasValidCrossReferenceTable = !m_pParser->xref_table_rebuilt();
200   return error;
201 }
202 
GetPagesDict() const203 const CPDF_Dictionary* CPDF_Document::GetPagesDict() const {
204   const CPDF_Dictionary* pRoot = GetRoot();
205   return pRoot ? pRoot->GetDictFor("Pages") : nullptr;
206 }
207 
GetPagesDict()208 CPDF_Dictionary* CPDF_Document::GetPagesDict() {
209   return const_cast<CPDF_Dictionary*>(
210       static_cast<const CPDF_Document*>(this)->GetPagesDict());
211 }
212 
IsPageLoaded(int iPage) const213 bool CPDF_Document::IsPageLoaded(int iPage) const {
214   return !!m_PageList[iPage];
215 }
216 
GetPageDictionary(int iPage)217 CPDF_Dictionary* CPDF_Document::GetPageDictionary(int iPage) {
218   if (!pdfium::IndexInBounds(m_PageList, iPage))
219     return nullptr;
220 
221   const uint32_t objnum = m_PageList[iPage];
222   if (objnum) {
223     CPDF_Dictionary* result = ToDictionary(GetOrParseIndirectObject(objnum));
224     if (result)
225       return result;
226   }
227 
228   CPDF_Dictionary* pPages = GetPagesDict();
229   if (!pPages)
230     return nullptr;
231 
232   if (m_pTreeTraversal.empty()) {
233     ResetTraversal();
234     m_pTreeTraversal.push_back(std::make_pair(pPages, 0));
235   }
236   int nPagesToGo = iPage - m_iNextPageToTraverse + 1;
237   CPDF_Dictionary* pPage = TraversePDFPages(iPage, &nPagesToGo, 0);
238   m_iNextPageToTraverse = iPage + 1;
239   return pPage;
240 }
241 
SetPageObjNum(int iPage,uint32_t objNum)242 void CPDF_Document::SetPageObjNum(int iPage, uint32_t objNum) {
243   m_PageList[iPage] = objNum;
244 }
245 
FindPageIndex(const CPDF_Dictionary * pNode,uint32_t * skip_count,uint32_t objnum,int * index,int level) const246 int CPDF_Document::FindPageIndex(const CPDF_Dictionary* pNode,
247                                  uint32_t* skip_count,
248                                  uint32_t objnum,
249                                  int* index,
250                                  int level) const {
251   if (!pNode->KeyExist("Kids")) {
252     if (objnum == pNode->GetObjNum())
253       return *index;
254 
255     if (*skip_count)
256       (*skip_count)--;
257 
258     (*index)++;
259     return -1;
260   }
261 
262   const CPDF_Array* pKidList = pNode->GetArrayFor("Kids");
263   if (!pKidList)
264     return -1;
265 
266   if (level >= kMaxPageLevel)
267     return -1;
268 
269   size_t count = pNode->GetIntegerFor("Count");
270   if (count <= *skip_count) {
271     (*skip_count) -= count;
272     (*index) += count;
273     return -1;
274   }
275 
276   if (count && count == pKidList->size()) {
277     for (size_t i = 0; i < count; i++) {
278       const CPDF_Reference* pKid = ToReference(pKidList->GetObjectAt(i));
279       if (pKid && pKid->GetRefObjNum() == objnum)
280         return static_cast<int>(*index + i);
281     }
282   }
283 
284   for (size_t i = 0; i < pKidList->size(); i++) {
285     const CPDF_Dictionary* pKid = pKidList->GetDictAt(i);
286     if (!pKid || pKid == pNode)
287       continue;
288 
289     int found_index = FindPageIndex(pKid, skip_count, objnum, index, level + 1);
290     if (found_index >= 0)
291       return found_index;
292   }
293   return -1;
294 }
295 
GetPageIndex(uint32_t objnum)296 int CPDF_Document::GetPageIndex(uint32_t objnum) {
297   uint32_t nPages = m_PageList.size();
298   uint32_t skip_count = 0;
299   bool bSkipped = false;
300   for (uint32_t i = 0; i < nPages; i++) {
301     if (m_PageList[i] == objnum)
302       return i;
303 
304     if (!bSkipped && m_PageList[i] == 0) {
305       skip_count = i;
306       bSkipped = true;
307     }
308   }
309   const CPDF_Dictionary* pPages = GetPagesDict();
310   if (!pPages)
311     return -1;
312 
313   int start_index = 0;
314   int found_index = FindPageIndex(pPages, &skip_count, objnum, &start_index, 0);
315 
316   // Corrupt page tree may yield out-of-range results.
317   if (!pdfium::IndexInBounds(m_PageList, found_index))
318     return -1;
319 
320   m_PageList[found_index] = objnum;
321   return found_index;
322 }
323 
GetPageCount() const324 int CPDF_Document::GetPageCount() const {
325   return pdfium::CollectionSize<int>(m_PageList);
326 }
327 
RetrievePageCount()328 int CPDF_Document::RetrievePageCount() {
329   CPDF_Dictionary* pPages = GetPagesDict();
330   if (!pPages)
331     return 0;
332 
333   if (!pPages->KeyExist("Kids"))
334     return 1;
335 
336   std::set<CPDF_Dictionary*> visited_pages;
337   visited_pages.insert(pPages);
338   return CountPages(pPages, &visited_pages);
339 }
340 
GetUserPermissions() const341 uint32_t CPDF_Document::GetUserPermissions() const {
342   if (m_pParser)
343     return m_pParser->GetPermissions();
344 
345   return m_pExtension ? m_pExtension->GetUserPermissions() : 0;
346 }
347 
CreateNewDoc()348 void CPDF_Document::CreateNewDoc() {
349   ASSERT(!m_pRootDict);
350   ASSERT(!m_pInfoDict);
351   m_pRootDict.Reset(NewIndirect<CPDF_Dictionary>());
352   m_pRootDict->SetNewFor<CPDF_Name>("Type", "Catalog");
353 
354   CPDF_Dictionary* pPages = NewIndirect<CPDF_Dictionary>();
355   pPages->SetNewFor<CPDF_Name>("Type", "Pages");
356   pPages->SetNewFor<CPDF_Number>("Count", 0);
357   pPages->SetNewFor<CPDF_Array>("Kids");
358   m_pRootDict->SetNewFor<CPDF_Reference>("Pages", this, pPages->GetObjNum());
359   m_pInfoDict.Reset(NewIndirect<CPDF_Dictionary>());
360 }
361 
CreateNewPage(int iPage)362 CPDF_Dictionary* CPDF_Document::CreateNewPage(int iPage) {
363   CPDF_Dictionary* pDict = NewIndirect<CPDF_Dictionary>();
364   pDict->SetNewFor<CPDF_Name>("Type", "Page");
365   uint32_t dwObjNum = pDict->GetObjNum();
366   if (!InsertNewPage(iPage, pDict)) {
367     DeleteIndirectObject(dwObjNum);
368     return nullptr;
369   }
370   return pDict;
371 }
372 
InsertDeletePDFPage(CPDF_Dictionary * pPages,int nPagesToGo,CPDF_Dictionary * pPageDict,bool bInsert,std::set<CPDF_Dictionary * > * pVisited)373 bool CPDF_Document::InsertDeletePDFPage(CPDF_Dictionary* pPages,
374                                         int nPagesToGo,
375                                         CPDF_Dictionary* pPageDict,
376                                         bool bInsert,
377                                         std::set<CPDF_Dictionary*>* pVisited) {
378   CPDF_Array* pKidList = pPages->GetArrayFor("Kids");
379   if (!pKidList)
380     return false;
381 
382   for (size_t i = 0; i < pKidList->size(); i++) {
383     CPDF_Dictionary* pKid = pKidList->GetDictAt(i);
384     if (pKid->GetStringFor("Type") == "Page") {
385       if (nPagesToGo != 0) {
386         nPagesToGo--;
387         continue;
388       }
389       if (bInsert) {
390         pKidList->InsertNewAt<CPDF_Reference>(i, this, pPageDict->GetObjNum());
391         pPageDict->SetNewFor<CPDF_Reference>("Parent", this,
392                                              pPages->GetObjNum());
393       } else {
394         pKidList->RemoveAt(i);
395       }
396       pPages->SetNewFor<CPDF_Number>(
397           "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1));
398       ResetTraversal();
399       break;
400     }
401     int nPages = pKid->GetIntegerFor("Count");
402     if (nPagesToGo >= nPages) {
403       nPagesToGo -= nPages;
404       continue;
405     }
406     if (pdfium::ContainsKey(*pVisited, pKid))
407       return false;
408 
409     pdfium::ScopedSetInsertion<CPDF_Dictionary*> insertion(pVisited, pKid);
410     if (!InsertDeletePDFPage(pKid, nPagesToGo, pPageDict, bInsert, pVisited))
411       return false;
412 
413     pPages->SetNewFor<CPDF_Number>(
414         "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1));
415     break;
416   }
417   return true;
418 }
419 
InsertNewPage(int iPage,CPDF_Dictionary * pPageDict)420 bool CPDF_Document::InsertNewPage(int iPage, CPDF_Dictionary* pPageDict) {
421   CPDF_Dictionary* pRoot = GetRoot();
422   CPDF_Dictionary* pPages = pRoot ? pRoot->GetDictFor("Pages") : nullptr;
423   if (!pPages)
424     return false;
425 
426   int nPages = GetPageCount();
427   if (iPage < 0 || iPage > nPages)
428     return false;
429 
430   if (iPage == nPages) {
431     CPDF_Array* pPagesList = pPages->GetArrayFor("Kids");
432     if (!pPagesList)
433       pPagesList = pPages->SetNewFor<CPDF_Array>("Kids");
434     pPagesList->AddNew<CPDF_Reference>(this, pPageDict->GetObjNum());
435     pPages->SetNewFor<CPDF_Number>("Count", nPages + 1);
436     pPageDict->SetNewFor<CPDF_Reference>("Parent", this, pPages->GetObjNum());
437     ResetTraversal();
438   } else {
439     std::set<CPDF_Dictionary*> stack = {pPages};
440     if (!InsertDeletePDFPage(pPages, iPage, pPageDict, true, &stack))
441       return false;
442   }
443   m_PageList.insert(m_PageList.begin() + iPage, pPageDict->GetObjNum());
444   return true;
445 }
446 
GetInfo()447 CPDF_Dictionary* CPDF_Document::GetInfo() {
448   if (m_pInfoDict)
449     return m_pInfoDict.Get();
450 
451   if (!m_pParser || !m_pParser->GetInfoObjNum())
452     return nullptr;
453 
454   auto ref =
455       pdfium::MakeRetain<CPDF_Reference>(this, m_pParser->GetInfoObjNum());
456   m_pInfoDict.Reset(ToDictionary(ref->GetDirect()));
457   return m_pInfoDict.Get();
458 }
459 
DeletePage(int iPage)460 void CPDF_Document::DeletePage(int iPage) {
461   CPDF_Dictionary* pPages = GetPagesDict();
462   if (!pPages)
463     return;
464 
465   int nPages = pPages->GetIntegerFor("Count");
466   if (iPage < 0 || iPage >= nPages)
467     return;
468 
469   std::set<CPDF_Dictionary*> stack = {pPages};
470   if (!InsertDeletePDFPage(pPages, iPage, nullptr, false, &stack))
471     return;
472 
473   m_PageList.erase(m_PageList.begin() + iPage);
474 }
475 
// Stores |pPageData| so that the destructor can clear its stock font.
CPDF_Document::StockFontClearer::StockFontClearer(
    CPDF_Document::PageDataIface* pPageData)
    : m_pPageData(pPageData) {}
479 
// Calls ClearStockFont() on the page data supplied at construction.
CPDF_Document::StockFontClearer::~StockFontClearer() {
  m_pPageData->ClearStockFont();
}
483 
// PageDataIface has no construction or destruction logic of its own; both
// special members are defaulted and defined out of line here.
CPDF_Document::PageDataIface::PageDataIface() = default;

CPDF_Document::PageDataIface::~PageDataIface() = default;
487 
// RenderDataIface has no construction or destruction logic of its own; both
// special members are defaulted and defined out of line here.
CPDF_Document::RenderDataIface::RenderDataIface() = default;

CPDF_Document::RenderDataIface::~RenderDataIface() = default;
491