1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
9 
10 #include <limits>
11 #include <map>
12 #include <memory>
13 #include <set>
14 #include <vector>
15 
16 #include "core/fpdfapi/parser/cpdf_cross_ref_table.h"
17 #include "core/fpdfapi/parser/cpdf_indirect_object_holder.h"
18 #include "core/fxcrt/fx_string.h"
19 #include "core/fxcrt/fx_system.h"
20 #include "core/fxcrt/retain_ptr.h"
21 #include "core/fxcrt/unowned_ptr.h"
22 
23 class CPDF_Array;
24 class CPDF_CryptoHandler;
25 class CPDF_Dictionary;
26 class CPDF_LinearizedHeader;
27 class CPDF_Object;
28 class CPDF_ObjectStream;
29 class CPDF_ReadValidator;
30 class CPDF_SecurityHandler;
31 class CPDF_SyntaxParser;
32 class IFX_SeekableReadStream;
33 
34 class CPDF_Parser {
35  public:
36   class ParsedObjectsHolder : public CPDF_IndirectObjectHolder {
37    public:
38     virtual bool TryInit() = 0;
39   };
40 
41   enum Error {
42     SUCCESS = 0,
43     FILE_ERROR,
44     FORMAT_ERROR,
45     PASSWORD_ERROR,
46     HANDLER_ERROR
47   };
48 
49   // A limit on the maximum object number in the xref table. Theoretical limits
50   // are higher, but this may be large enough in practice.
51   // Note: This was 1M, but https://crbug.com/910009 encountered a PDF with
52   // object numbers in the 1.7M range. The PDF only has 10K objects, but they
53   // are non-consecutive.
54   static constexpr uint32_t kMaxObjectNumber = 4 * 1024 * 1024;
55 
56   static const size_t kInvalidPos = std::numeric_limits<size_t>::max();
57 
58   explicit CPDF_Parser(ParsedObjectsHolder* holder);
59   CPDF_Parser();
60   ~CPDF_Parser();
61 
62   Error StartParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
63                    const char* password);
64   Error StartLinearizedParse(const RetainPtr<CPDF_ReadValidator>& validator,
65                              const char* password);
66 
SetPassword(const char * password)67   void SetPassword(const char* password) { m_Password = password; }
GetPassword()68   ByteString GetPassword() const { return m_Password; }
69 
70   // Take the GetPassword() value and encode it, if necessary, based on the
71   // password encoding conversion.
72   ByteString GetEncodedPassword() const;
73 
74   const CPDF_Dictionary* GetTrailer() const;
75   CPDF_Dictionary* GetMutableTrailerForTesting();
76 
77   // Returns a new trailer which combines the last read trailer with the /Root
78   // and /Info from previous ones.
79   RetainPtr<CPDF_Dictionary> GetCombinedTrailer() const;
80 
GetLastXRefOffset()81   FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
82 
83   uint32_t GetPermissions() const;
84   uint32_t GetRootObjNum() const;
85   uint32_t GetInfoObjNum() const;
86   const CPDF_Array* GetIDArray() const;
87   CPDF_Dictionary* GetRoot() const;
88 
89   const CPDF_Dictionary* GetEncryptDict() const;
90 
91   RetainPtr<CPDF_Object> ParseIndirectObject(uint32_t objnum);
92 
93   uint32_t GetLastObjNum() const;
94   bool IsValidObjectNumber(uint32_t objnum) const;
95   FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
96   bool IsObjectFreeOrNull(uint32_t objnum) const;
GetSecurityHandler()97   const RetainPtr<CPDF_SecurityHandler>& GetSecurityHandler() const {
98     return m_pSecurityHandler;
99   }
100   bool IsObjectFree(uint32_t objnum) const;
101 
GetFileVersion()102   int GetFileVersion() const { return m_FileVersion; }
IsXRefStream()103   bool IsXRefStream() const { return m_bXRefStream; }
104 
105   RetainPtr<CPDF_Object> ParseIndirectObjectAt(FX_FILESIZE pos,
106                                                uint32_t objnum);
107 
108   uint32_t GetFirstPageNo() const;
GetLinearizedHeader()109   const CPDF_LinearizedHeader* GetLinearizedHeader() const {
110     return m_pLinearized.get();
111   }
112 
GetCrossRefTable()113   const CPDF_CrossRefTable* GetCrossRefTable() const {
114     return m_CrossRefTable.get();
115   }
116 
xref_table_rebuilt()117   bool xref_table_rebuilt() const { return m_bXRefTableRebuilt; }
118 
GetSyntax()119   CPDF_SyntaxParser* GetSyntax() const { return m_pSyntax.get(); }
120 
121   void SetLinearizedHeader(std::unique_ptr<CPDF_LinearizedHeader> pLinearized);
122 
123  protected:
124   using ObjectType = CPDF_CrossRefTable::ObjectType;
125   using ObjectInfo = CPDF_CrossRefTable::ObjectInfo;
126 
127   bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip);
128   bool RebuildCrossRef();
129 
130   std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
131 
132  private:
133   friend class cpdf_parser_BadStartXrefShouldNotBuildCrossRefTable_Test;
134   friend class cpdf_parser_ParseStartXRefWithHeaderOffset_Test;
135   friend class cpdf_parser_ParseStartXRef_Test;
136   friend class cpdf_parser_ParseLinearizedWithHeaderOffset_Test;
137   friend class CPDF_DataAvail;
138 
139   struct CrossRefObjData {
140     uint32_t obj_num = 0;
141     ObjectInfo info;
142   };
143 
144   Error StartParseInternal();
145   FX_FILESIZE ParseStartXRef();
146   bool LoadAllCrossRefV4(FX_FILESIZE xref_offset);
147   bool LoadAllCrossRefV5(FX_FILESIZE xref_offset);
148   bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
149   RetainPtr<CPDF_Dictionary> LoadTrailerV4();
150   Error SetEncryptHandler();
151   void ReleaseEncryptHandler();
152   bool LoadLinearizedAllCrossRefV4(FX_FILESIZE main_xref_offset);
153   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE main_xref_offset);
154   Error LoadLinearizedMainXRefTable();
155   const CPDF_ObjectStream* GetObjectStream(uint32_t object_number);
156   std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
157   void ShrinkObjectMap(uint32_t size);
158   // A simple check whether the cross reference table matches with
159   // the objects.
160   bool VerifyCrossRefV4();
161 
162   // If out_objects is null, the parser position will be moved to end subsection
163   // without additional validation.
164   bool ParseAndAppendCrossRefSubsectionData(
165       uint32_t start_objnum,
166       uint32_t count,
167       std::vector<CrossRefObjData>* out_objects);
168   bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects);
169   void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
170 
171   bool InitSyntaxParser(const RetainPtr<CPDF_ReadValidator>& validator);
172   bool ParseFileVersion();
173 
174   ObjectType GetObjectType(uint32_t objnum) const;
175   ObjectType GetObjectTypeFromCrossRefStreamType(
176       uint32_t cross_ref_stream_type) const;
177 
178   std::unique_ptr<ParsedObjectsHolder> m_pOwnedObjectsHolder;
179   UnownedPtr<ParsedObjectsHolder> m_pObjectsHolder;
180 
181   bool m_bHasParsed = false;
182   bool m_bXRefStream = false;
183   bool m_bXRefTableRebuilt = false;
184   int m_FileVersion = 0;
185   // m_CrossRefTable must be destroyed after m_pSecurityHandler due to the
186   // ownership of the ID array data.
187   std::unique_ptr<CPDF_CrossRefTable> m_CrossRefTable;
188   FX_FILESIZE m_LastXRefOffset;
189   RetainPtr<CPDF_SecurityHandler> m_pSecurityHandler;
190   ByteString m_Password;
191   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
192 
193   // A map of object numbers to indirect streams.
194   std::map<uint32_t, std::unique_ptr<CPDF_ObjectStream>> m_ObjectStreamMap;
195 
196   // All indirect object numbers that are being parsed.
197   std::set<uint32_t> m_ParsingObjNums;
198 
199   uint32_t m_MetadataObjnum = 0;
200 };
201 
202 #endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
203