// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
9 
10 #include <map>
11 #include <memory>
12 #include <set>
13 #include <vector>
14 
15 #include "core/fxcrt/fx_basic.h"
16 
17 class CPDF_Array;
18 class CPDF_CryptoHandler;
19 class CPDF_Dictionary;
20 class CPDF_Document;
21 class CPDF_IndirectObjectHolder;
22 class CPDF_LinearizedHeader;
23 class CPDF_Object;
24 class CPDF_SecurityHandler;
25 class CPDF_StreamAcc;
26 class CPDF_SyntaxParser;
27 class IFX_SeekableReadStream;
28 
29 class CPDF_Parser {
30  public:
31   enum Error {
32     SUCCESS = 0,
33     FILE_ERROR,
34     FORMAT_ERROR,
35     PASSWORD_ERROR,
36     HANDLER_ERROR
37   };
38 
39   // A limit on the maximum object number in the xref table. Theoretical limits
40   // are higher, but this may be large enough in practice.
41   static const uint32_t kMaxObjectNumber = 1048576;
42 
43   CPDF_Parser();
44   ~CPDF_Parser();
45 
46   Error StartParse(const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
47                    CPDF_Document* pDocument);
48   Error StartLinearizedParse(const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
49                              CPDF_Document* pDocument);
50 
SetPassword(const FX_CHAR * password)51   void SetPassword(const FX_CHAR* password) { m_Password = password; }
GetPassword()52   CFX_ByteString GetPassword() { return m_Password; }
GetTrailer()53   CPDF_Dictionary* GetTrailer() const { return m_pTrailer.get(); }
GetLastXRefOffset()54   FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
55 
56   uint32_t GetPermissions() const;
57   uint32_t GetRootObjNum();
58   uint32_t GetInfoObjNum();
59   CPDF_Array* GetIDArray();
60 
GetEncryptDict()61   CPDF_Dictionary* GetEncryptDict() const { return m_pEncryptDict; }
62 
63   std::unique_ptr<CPDF_Object> ParseIndirectObject(
64       CPDF_IndirectObjectHolder* pObjList,
65       uint32_t objnum);
66 
67   uint32_t GetLastObjNum() const;
68   bool IsValidObjectNumber(uint32_t objnum) const;
69   FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
70   uint8_t GetObjectType(uint32_t objnum) const;
71   uint16_t GetObjectGenNum(uint32_t objnum) const;
IsVersionUpdated()72   bool IsVersionUpdated() const { return m_bVersionUpdated; }
73   bool IsObjectFreeOrNull(uint32_t objnum) const;
74   CPDF_CryptoHandler* GetCryptoHandler();
75   CFX_RetainPtr<IFX_SeekableReadStream> GetFileAccess() const;
76 
77   FX_FILESIZE GetObjectOffset(uint32_t objnum) const;
78   FX_FILESIZE GetObjectSize(uint32_t objnum) const;
79 
80   void GetIndirectBinary(uint32_t objnum, uint8_t*& pBuffer, uint32_t& size);
GetFileVersion()81   int GetFileVersion() const { return m_FileVersion; }
IsXRefStream()82   bool IsXRefStream() const { return m_bXRefStream; }
83 
84   std::unique_ptr<CPDF_Object> ParseIndirectObjectAt(
85       CPDF_IndirectObjectHolder* pObjList,
86       FX_FILESIZE pos,
87       uint32_t objnum);
88 
89   std::unique_ptr<CPDF_Object> ParseIndirectObjectAtByStrict(
90       CPDF_IndirectObjectHolder* pObjList,
91       FX_FILESIZE pos,
92       uint32_t objnum,
93       FX_FILESIZE* pResultPos);
94 
95   uint32_t GetFirstPageNo() const;
96 
97  protected:
98   struct ObjectInfo {
ObjectInfoObjectInfo99     ObjectInfo() : pos(0), type(0), gennum(0) {}
100 
101     FX_FILESIZE pos;
102     uint8_t type;
103     uint16_t gennum;
104   };
105 
106   std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
107   std::map<uint32_t, ObjectInfo> m_ObjectInfo;
108 
109   bool LoadCrossRefV4(FX_FILESIZE pos, FX_FILESIZE streampos, bool bSkip);
110   bool RebuildCrossRef();
111 
112  private:
113   friend class CPDF_DataAvail;
114 
115   enum class ParserState {
116     kDefault,
117     kComment,
118     kWhitespace,
119     kString,
120     kHexString,
121     kEscapedString,
122     kXref,
123     kObjNum,
124     kPostObjNum,
125     kGenNum,
126     kPostGenNum,
127     kTrailer,
128     kBeginObj,
129     kEndObj
130   };
131 
132   CPDF_Object* ParseDirect(CPDF_Object* pObj);
133   bool LoadAllCrossRefV4(FX_FILESIZE pos);
134   bool LoadAllCrossRefV5(FX_FILESIZE pos);
135   bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
136   std::unique_ptr<CPDF_Dictionary> LoadTrailerV4();
137   Error SetEncryptHandler();
138   void ReleaseEncryptHandler();
139   bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount);
140   bool LoadLinearizedCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount);
141   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos);
142   Error LoadLinearizedMainXRefTable();
143   CPDF_StreamAcc* GetObjectStream(uint32_t number);
144   bool IsLinearizedFile(
145       const CFX_RetainPtr<IFX_SeekableReadStream>& pFileAccess,
146       uint32_t offset);
147   void SetEncryptDictionary(CPDF_Dictionary* pDict);
148   void ShrinkObjectMap(uint32_t size);
149   // A simple check whether the cross reference table matches with
150   // the objects.
151   bool VerifyCrossRefV4();
152 
153   CPDF_Document* m_pDocument;  // not owned
154   bool m_bHasParsed;
155   bool m_bXRefStream;
156   bool m_bVersionUpdated;
157   int m_FileVersion;
158   CPDF_Dictionary* m_pEncryptDict;
159   FX_FILESIZE m_LastXRefOffset;
160   std::unique_ptr<CPDF_SecurityHandler> m_pSecurityHandler;
161   CFX_ByteString m_Password;
162   std::set<FX_FILESIZE> m_SortedOffset;
163   std::unique_ptr<CPDF_Dictionary> m_pTrailer;
164   std::vector<std::unique_ptr<CPDF_Dictionary>> m_Trailers;
165   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
166   uint32_t m_dwXrefStartObjNum;
167 
168   // A map of object numbers to indirect streams. Map owns the streams.
169   std::map<uint32_t, std::unique_ptr<CPDF_StreamAcc>> m_ObjectStreamMap;
170 
171   // Mapping of object numbers to offsets. The offsets are relative to the first
172   // object in the stream.
173   using StreamObjectCache = std::map<uint32_t, uint32_t>;
174 
175   // Mapping of streams to their object caches. This is valid as long as the
176   // streams in |m_ObjectStreamMap| are valid.
177   std::map<CPDF_StreamAcc*, StreamObjectCache> m_ObjCache;
178 
179   // All indirect object numbers that are being parsed.
180   std::set<uint32_t> m_ParsingObjNums;
181 };
182 
183 #endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
184