1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
9 
10 #include <limits>
11 #include <map>
12 #include <memory>
13 #include <set>
14 #include <vector>
15 
16 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
17 #include "core/fxcrt/fx_string.h"
18 #include "core/fxcrt/fx_system.h"
19 #include "core/fxcrt/retain_ptr.h"
20 #include "core/fxcrt/unowned_ptr.h"
21 
22 class CPDF_Array;
23 class CPDF_CryptoHandler;
24 class CPDF_Dictionary;
25 class CPDF_Document;
26 class CPDF_IndirectObjectHolder;
27 class CPDF_LinearizedHeader;
28 class CPDF_Object;
29 class CPDF_SecurityHandler;
30 class CPDF_StreamAcc;
31 class CPDF_SyntaxParser;
32 class IFX_SeekableReadStream;
33 
34 class CPDF_Parser {
35  public:
36   enum Error {
37     SUCCESS = 0,
38     FILE_ERROR,
39     FORMAT_ERROR,
40     PASSWORD_ERROR,
41     HANDLER_ERROR
42   };
43 
44   // A limit on the maximum object number in the xref table. Theoretical limits
45   // are higher, but this may be large enough in practice.
46   static const uint32_t kMaxObjectNumber = 1048576;
47 
48   static const size_t kInvalidPos = std::numeric_limits<size_t>::max();
49 
50   CPDF_Parser();
51   ~CPDF_Parser();
52 
53   Error StartParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
54                    CPDF_Document* pDocument);
55   Error StartLinearizedParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
56                              CPDF_Document* pDocument);
57 
SetPassword(const char * password)58   void SetPassword(const char* password) { m_Password = password; }
GetPassword()59   ByteString GetPassword() { return m_Password; }
60 
61   CPDF_Dictionary* GetTrailer() const;
62 
63   // Returns a new trailer which combines the last read trailer with the /Root
64   // and /Info from previous ones.
65   std::unique_ptr<CPDF_Dictionary> GetCombinedTrailer() const;
66 
GetLastXRefOffset()67   FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
68 
69   uint32_t GetPermissions() const;
70   uint32_t GetRootObjNum();
71   uint32_t GetInfoObjNum();
72   const CPDF_Array* GetIDArray() const;
73 
GetEncryptDict()74   CPDF_Dictionary* GetEncryptDict() const { return m_pEncryptDict.Get(); }
75 
76   std::unique_ptr<CPDF_Object> ParseIndirectObject(
77       CPDF_IndirectObjectHolder* pObjList,
78       uint32_t objnum);
79 
80   uint32_t GetLastObjNum() const;
81   bool IsValidObjectNumber(uint32_t objnum) const;
82   FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
83   uint16_t GetObjectGenNum(uint32_t objnum) const;
84   bool IsObjectFreeOrNull(uint32_t objnum) const;
GetSecurityHandler()85   CPDF_SecurityHandler* GetSecurityHandler() const {
86     return m_pSecurityHandler.get();
87   }
88   RetainPtr<IFX_SeekableReadStream> GetFileAccess() const;
89   bool IsObjectFree(uint32_t objnum) const;
90 
91   FX_FILESIZE GetObjectOffset(uint32_t objnum) const;
92 
GetFileVersion()93   int GetFileVersion() const { return m_FileVersion; }
IsXRefStream()94   bool IsXRefStream() const { return m_bXRefStream; }
95 
96   std::unique_ptr<CPDF_Object> ParseIndirectObjectAt(
97       CPDF_IndirectObjectHolder* pObjList,
98       FX_FILESIZE pos,
99       uint32_t objnum);
100 
101   std::unique_ptr<CPDF_Object> ParseIndirectObjectAtByStrict(
102       CPDF_IndirectObjectHolder* pObjList,
103       FX_FILESIZE pos,
104       uint32_t objnum,
105       FX_FILESIZE* pResultPos);
106 
107   uint32_t GetFirstPageNo() const;
108 
109  protected:
110   enum class ObjectType : uint8_t {
111     kFree = 0x00,
112     kNotCompressed = 0x01,
113     kCompressed = 0x02,
114     kNull = 0xFF,
115   };
116 
117   struct ObjectInfo {
ObjectInfoObjectInfo118     ObjectInfo() : pos(0), type(ObjectType::kFree), gennum(0) {}
119     // if type is ObjectType::kCompressed the archive_obj_num should be used.
120     // if type is ObjectType::kNotCompressed the pos should be used.
121     // In other cases its are unused.
122     union {
123       FX_FILESIZE pos;
124       FX_FILESIZE archive_obj_num;
125     };
126     ObjectType type;
127     uint16_t gennum;
128   };
129 
130   std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
131   std::map<uint32_t, ObjectInfo> m_ObjectInfo;
132 
133   bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip);
134   bool RebuildCrossRef();
135 
136  private:
137   friend class CPDF_DataAvail;
138 
139   class TrailerData;
140 
141   enum class ParserState {
142     kDefault,
143     kComment,
144     kWhitespace,
145     kString,
146     kHexString,
147     kEscapedString,
148     kXref,
149     kObjNum,
150     kPostObjNum,
151     kGenNum,
152     kPostGenNum,
153     kTrailer,
154     kBeginObj,
155     kEndObj
156   };
157 
158   struct CrossRefObjData {
159     uint32_t obj_num = 0;
160     ObjectInfo info;
161   };
162 
163   Error StartParseInternal(CPDF_Document* pDocument);
164   FX_FILESIZE ParseStartXRef();
165   bool LoadAllCrossRefV4(FX_FILESIZE pos);
166   bool LoadAllCrossRefV5(FX_FILESIZE pos);
167   bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
168   std::unique_ptr<CPDF_Dictionary> LoadTrailerV4();
169   Error SetEncryptHandler();
170   void ReleaseEncryptHandler();
171   bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos);
172   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos);
173   Error LoadLinearizedMainXRefTable();
174   RetainPtr<CPDF_StreamAcc> GetObjectStream(uint32_t number);
175   std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
176   void SetEncryptDictionary(CPDF_Dictionary* pDict);
177   void ShrinkObjectMap(uint32_t size);
178   // A simple check whether the cross reference table matches with
179   // the objects.
180   bool VerifyCrossRefV4();
181 
182   // If out_objects is null, the parser position will be moved to end subsection
183   // without additional validation.
184   bool ParseAndAppendCrossRefSubsectionData(
185       uint32_t start_objnum,
186       uint32_t count,
187       std::vector<CrossRefObjData>* out_objects);
188   bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects);
189   void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
190 
191   std::unique_ptr<CPDF_Object> ParseIndirectObjectAtInternal(
192       CPDF_IndirectObjectHolder* pObjList,
193       FX_FILESIZE pos,
194       uint32_t objnum,
195       CPDF_SyntaxParser::ParseType parse_type,
196       FX_FILESIZE* pResultPos);
197 
198   bool InitSyntaxParser(const RetainPtr<IFX_SeekableReadStream>& file_access);
199   bool ParseFileVersion();
200 
201   UnownedPtr<CPDF_Document> m_pDocument;
202   ObjectType GetObjectType(uint32_t objnum) const;
203   ObjectType GetObjectTypeFromCrossRefStreamType(
204       int cross_ref_stream_type) const;
205 
206   bool m_bHasParsed;
207   bool m_bXRefStream;
208   int m_FileVersion;
209   // m_TrailerData must be destroyed after m_pSecurityHandler due to the
210   // ownership of the ID array data.
211   std::unique_ptr<TrailerData> m_TrailerData;
212   UnownedPtr<CPDF_Dictionary> m_pEncryptDict;
213   FX_FILESIZE m_LastXRefOffset;
214   std::unique_ptr<CPDF_SecurityHandler> m_pSecurityHandler;
215   ByteString m_Password;
216   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
217 
218   // A map of object numbers to indirect streams.
219   std::map<uint32_t, RetainPtr<CPDF_StreamAcc>> m_ObjectStreamMap;
220 
221   // Mapping of object numbers to offsets. The offsets are relative to the first
222   // object in the stream.
223   using StreamObjectCache = std::map<uint32_t, uint32_t>;
224 
225   // Mapping of streams to their object caches. This is valid as long as the
226   // streams in |m_ObjectStreamMap| are valid.
227   std::map<RetainPtr<CPDF_StreamAcc>, StreamObjectCache> m_ObjCache;
228 
229   // All indirect object numbers that are being parsed.
230   std::set<uint32_t> m_ParsingObjNums;
231 
232   uint32_t m_MetadataObjnum = 0;
233 };
234 
235 #endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
236