1// Copyright 2016 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
8#define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
9
10#include <limits>
11#include <map>
12#include <memory>
13#include <set>
14#include <vector>
15
16#include "core/fpdfapi/parser/cpdf_syntax_parser.h"
17#include "core/fxcrt/fx_string.h"
18#include "core/fxcrt/fx_system.h"
19#include "core/fxcrt/retain_ptr.h"
20#include "core/fxcrt/unowned_ptr.h"
21
22class CPDF_Array;
23class CPDF_CryptoHandler;
24class CPDF_Dictionary;
25class CPDF_Document;
26class CPDF_IndirectObjectHolder;
27class CPDF_LinearizedHeader;
28class CPDF_Object;
29class CPDF_SecurityHandler;
30class CPDF_StreamAcc;
31class CPDF_SyntaxParser;
32class IFX_SeekableReadStream;
33
34class CPDF_Parser {
35 public:
36  enum Error {
37    SUCCESS = 0,
38    FILE_ERROR,
39    FORMAT_ERROR,
40    PASSWORD_ERROR,
41    HANDLER_ERROR
42  };
43
44  // A limit on the maximum object number in the xref table. Theoretical limits
45  // are higher, but this may be large enough in practice.
46  static const uint32_t kMaxObjectNumber = 1048576;
47
48  static const size_t kInvalidPos = std::numeric_limits<size_t>::max();
49
50  CPDF_Parser();
51  ~CPDF_Parser();
52
53  Error StartParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
54                   CPDF_Document* pDocument);
55  Error StartLinearizedParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
56                             CPDF_Document* pDocument);
57
58  void SetPassword(const char* password) { m_Password = password; }
59  ByteString GetPassword() { return m_Password; }
60
61  CPDF_Dictionary* GetTrailer() const;
62
63  // Returns a new trailer which combines the last read trailer with the /Root
64  // and /Info from previous ones.
65  std::unique_ptr<CPDF_Dictionary> GetCombinedTrailer() const;
66
67  FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
68
69  uint32_t GetPermissions() const;
70  uint32_t GetRootObjNum();
71  uint32_t GetInfoObjNum();
72  const CPDF_Array* GetIDArray() const;
73
74  CPDF_Dictionary* GetEncryptDict() const { return m_pEncryptDict.Get(); }
75
76  std::unique_ptr<CPDF_Object> ParseIndirectObject(
77      CPDF_IndirectObjectHolder* pObjList,
78      uint32_t objnum);
79
80  uint32_t GetLastObjNum() const;
81  bool IsValidObjectNumber(uint32_t objnum) const;
82  FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
83  uint16_t GetObjectGenNum(uint32_t objnum) const;
84  bool IsObjectFreeOrNull(uint32_t objnum) const;
85  CPDF_SecurityHandler* GetSecurityHandler() const {
86    return m_pSecurityHandler.get();
87  }
88  RetainPtr<IFX_SeekableReadStream> GetFileAccess() const;
89  bool IsObjectFree(uint32_t objnum) const;
90
91  FX_FILESIZE GetObjectOffset(uint32_t objnum) const;
92
93  int GetFileVersion() const { return m_FileVersion; }
94  bool IsXRefStream() const { return m_bXRefStream; }
95
96  std::unique_ptr<CPDF_Object> ParseIndirectObjectAt(
97      CPDF_IndirectObjectHolder* pObjList,
98      FX_FILESIZE pos,
99      uint32_t objnum);
100
101  std::unique_ptr<CPDF_Object> ParseIndirectObjectAtByStrict(
102      CPDF_IndirectObjectHolder* pObjList,
103      FX_FILESIZE pos,
104      uint32_t objnum,
105      FX_FILESIZE* pResultPos);
106
107  uint32_t GetFirstPageNo() const;
108
109 protected:
110  enum class ObjectType : uint8_t {
111    kFree = 0x00,
112    kNotCompressed = 0x01,
113    kCompressed = 0x02,
114    kNull = 0xFF,
115  };
116
117  struct ObjectInfo {
118    ObjectInfo() : pos(0), type(ObjectType::kFree), gennum(0) {}
119    // if type is ObjectType::kCompressed the archive_obj_num should be used.
120    // if type is ObjectType::kNotCompressed the pos should be used.
121    // In other cases its are unused.
122    union {
123      FX_FILESIZE pos;
124      FX_FILESIZE archive_obj_num;
125    };
126    ObjectType type;
127    uint16_t gennum;
128  };
129
130  std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
131  std::map<uint32_t, ObjectInfo> m_ObjectInfo;
132
133  bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip);
134  bool RebuildCrossRef();
135
136 private:
137  friend class CPDF_DataAvail;
138
139  class TrailerData;
140
141  enum class ParserState {
142    kDefault,
143    kComment,
144    kWhitespace,
145    kString,
146    kHexString,
147    kEscapedString,
148    kXref,
149    kObjNum,
150    kPostObjNum,
151    kGenNum,
152    kPostGenNum,
153    kTrailer,
154    kBeginObj,
155    kEndObj
156  };
157
158  struct CrossRefObjData {
159    uint32_t obj_num = 0;
160    ObjectInfo info;
161  };
162
163  Error StartParseInternal(CPDF_Document* pDocument);
164  FX_FILESIZE ParseStartXRef();
165  bool LoadAllCrossRefV4(FX_FILESIZE pos);
166  bool LoadAllCrossRefV5(FX_FILESIZE pos);
167  bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
168  std::unique_ptr<CPDF_Dictionary> LoadTrailerV4();
169  Error SetEncryptHandler();
170  void ReleaseEncryptHandler();
171  bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos);
172  bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos);
173  Error LoadLinearizedMainXRefTable();
174  RetainPtr<CPDF_StreamAcc> GetObjectStream(uint32_t number);
175  std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
176  void SetEncryptDictionary(CPDF_Dictionary* pDict);
177  void ShrinkObjectMap(uint32_t size);
178  // A simple check whether the cross reference table matches with
179  // the objects.
180  bool VerifyCrossRefV4();
181
182  // If out_objects is null, the parser position will be moved to end subsection
183  // without additional validation.
184  bool ParseAndAppendCrossRefSubsectionData(
185      uint32_t start_objnum,
186      uint32_t count,
187      std::vector<CrossRefObjData>* out_objects);
188  bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects);
189  void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
190
191  std::unique_ptr<CPDF_Object> ParseIndirectObjectAtInternal(
192      CPDF_IndirectObjectHolder* pObjList,
193      FX_FILESIZE pos,
194      uint32_t objnum,
195      CPDF_SyntaxParser::ParseType parse_type,
196      FX_FILESIZE* pResultPos);
197
198  bool InitSyntaxParser(const RetainPtr<IFX_SeekableReadStream>& file_access);
199  bool ParseFileVersion();
200
201  UnownedPtr<CPDF_Document> m_pDocument;
202  ObjectType GetObjectType(uint32_t objnum) const;
203  ObjectType GetObjectTypeFromCrossRefStreamType(
204      int cross_ref_stream_type) const;
205
206  bool m_bHasParsed;
207  bool m_bXRefStream;
208  int m_FileVersion;
209  // m_TrailerData must be destroyed after m_pSecurityHandler due to the
210  // ownership of the ID array data.
211  std::unique_ptr<TrailerData> m_TrailerData;
212  UnownedPtr<CPDF_Dictionary> m_pEncryptDict;
213  FX_FILESIZE m_LastXRefOffset;
214  std::unique_ptr<CPDF_SecurityHandler> m_pSecurityHandler;
215  ByteString m_Password;
216  std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
217
218  // A map of object numbers to indirect streams.
219  std::map<uint32_t, RetainPtr<CPDF_StreamAcc>> m_ObjectStreamMap;
220
221  // Mapping of object numbers to offsets. The offsets are relative to the first
222  // object in the stream.
223  using StreamObjectCache = std::map<uint32_t, uint32_t>;
224
225  // Mapping of streams to their object caches. This is valid as long as the
226  // streams in |m_ObjectStreamMap| are valid.
227  std::map<RetainPtr<CPDF_StreamAcc>, StreamObjectCache> m_ObjCache;
228
229  // All indirect object numbers that are being parsed.
230  std::set<uint32_t> m_ParsingObjNums;
231
232  uint32_t m_MetadataObjnum = 0;
233};
234
235#endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
236