doc_tagged.cpp revision ee451cb395940862dad63c85adfe8f2fd55e864c
1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "../../include/fpdfapi/fpdf_parser.h"
8#include "../../include/fpdfapi/fpdf_page.h"
9#include "../../include/fpdfdoc/fpdf_tagged.h"
10#include "tagged_int.h"
// Depth limit shared by the recursive walks in this file (parent chains in
// AddPageNode/GetAttr and nested attribute arrays in FindAttrDict).
static const int nMaxRecursion = 32;
12static FX_BOOL IsTagged(const CPDF_Document* pDoc)
13{
14    CPDF_Dictionary* pCatalog = pDoc->GetRoot();
15    CPDF_Dictionary* pMarkInfo = pCatalog->GetDict(FX_BSTRC("MarkInfo"));
16    return pMarkInfo != NULL && pMarkInfo->GetInteger(FX_BSTRC("Marked"));
17}
18CPDF_StructTree* CPDF_StructTree::LoadPage(const CPDF_Document* pDoc, const CPDF_Dictionary* pPageDict)
19{
20    if (!IsTagged(pDoc)) {
21        return NULL;
22    }
23    CPDF_StructTreeImpl* pTree = FX_NEW CPDF_StructTreeImpl(pDoc);
24    if (pTree == NULL) {
25        return NULL;
26    }
27    pTree->LoadPageTree(pPageDict);
28    return pTree;
29}
30CPDF_StructTree* CPDF_StructTree::LoadDoc(const CPDF_Document* pDoc)
31{
32    if (!IsTagged(pDoc)) {
33        return NULL;
34    }
35    CPDF_StructTreeImpl* pTree = FX_NEW CPDF_StructTreeImpl(pDoc);
36    if (pTree == NULL) {
37        return NULL;
38    }
39    pTree->LoadDocTree();
40    return pTree;
41}
42CPDF_StructTreeImpl::CPDF_StructTreeImpl(const CPDF_Document* pDoc)
43{
44    CPDF_Dictionary* pCatalog = pDoc->GetRoot();
45    m_pTreeRoot = pCatalog->GetDict(FX_BSTRC("StructTreeRoot"));
46    if (m_pTreeRoot == NULL) {
47        return;
48    }
49    m_pRoleMap = m_pTreeRoot->GetDict(FX_BSTRC("RoleMap"));
50}
51CPDF_StructTreeImpl::~CPDF_StructTreeImpl()
52{
53    for (int i = 0; i < m_Kids.GetSize(); i ++)
54        if (m_Kids[i]) {
55            m_Kids[i]->Release();
56        }
57}
58void CPDF_StructTreeImpl::LoadDocTree()
59{
60    m_pPage = NULL;
61    if (m_pTreeRoot == NULL) {
62        return;
63    }
64    CPDF_Object* pKids = m_pTreeRoot->GetElementValue(FX_BSTRC("K"));
65    if (pKids == NULL) {
66        return;
67    }
68    if (pKids->GetType() == PDFOBJ_DICTIONARY) {
69        CPDF_StructElementImpl* pStructElementImpl = FX_NEW CPDF_StructElementImpl(this, NULL, (CPDF_Dictionary*)pKids);
70        if (pStructElementImpl == NULL) {
71            return;
72        }
73        m_Kids.Add(pStructElementImpl);
74        return;
75    }
76    if (pKids->GetType() != PDFOBJ_ARRAY) {
77        return;
78    }
79    CPDF_Array* pArray = (CPDF_Array*)pKids;
80    for (FX_DWORD i = 0; i < pArray->GetCount(); i ++) {
81        CPDF_Dictionary* pKid = pArray->GetDict(i);
82        CPDF_StructElementImpl* pStructElementImpl = FX_NEW CPDF_StructElementImpl(this, NULL, pKid);
83        if (pStructElementImpl == NULL) {
84            return;
85        }
86        m_Kids.Add(pStructElementImpl);
87    }
88}
89void CPDF_StructTreeImpl::LoadPageTree(const CPDF_Dictionary* pPageDict)
90{
91    m_pPage = pPageDict;
92    if (m_pTreeRoot == NULL) {
93        return;
94    }
95    CPDF_Object* pKids = m_pTreeRoot->GetElementValue(FX_BSTRC("K"));
96    if (pKids == NULL) {
97        return;
98    }
99    FX_DWORD dwKids = 0;
100    if (pKids->GetType() == PDFOBJ_DICTIONARY) {
101        dwKids = 1;
102    } else if (pKids->GetType() == PDFOBJ_ARRAY) {
103        dwKids = ((CPDF_Array*)pKids)->GetCount();
104    } else {
105        return;
106    }
107    FX_DWORD i;
108    m_Kids.SetSize(dwKids);
109    for (i = 0; i < dwKids; i ++) {
110        m_Kids[i] = NULL;
111    }
112    CFX_MapPtrToPtr element_map;
113    CPDF_Dictionary* pParentTree = m_pTreeRoot->GetDict(FX_BSTRC("ParentTree"));
114    if (pParentTree == NULL) {
115        return;
116    }
117    CPDF_NumberTree parent_tree(pParentTree);
118    int parents_id = pPageDict->GetInteger(FX_BSTRC("StructParents"), -1);
119    if (parents_id >= 0) {
120        CPDF_Object* pParents = parent_tree.LookupValue(parents_id);
121        if (pParents == NULL || pParents->GetType() != PDFOBJ_ARRAY) {
122            return;
123        }
124        CPDF_Array* pParentArray = (CPDF_Array*)pParents;
125        for (i = 0; i < pParentArray->GetCount(); i ++) {
126            CPDF_Dictionary* pParent = pParentArray->GetDict(i);
127            if (pParent == NULL) {
128                continue;
129            }
130            AddPageNode(pParent, element_map);
131        }
132    }
133}
134CPDF_StructElementImpl* CPDF_StructTreeImpl::AddPageNode(CPDF_Dictionary* pDict, CFX_MapPtrToPtr& map, int nLevel)
135{
136    if (nLevel > nMaxRecursion) {
137        return NULL;
138    }
139    CPDF_StructElementImpl* pElement = NULL;
140    if (map.Lookup(pDict, (FX_LPVOID&)pElement)) {
141        return pElement;
142    }
143    pElement = FX_NEW CPDF_StructElementImpl(this, NULL, pDict);
144    if (pElement == NULL) {
145        return NULL;
146    }
147    map.SetAt(pDict, pElement);
148    CPDF_Dictionary* pParent = pDict->GetDict(FX_BSTRC("P"));
149    if (pParent == NULL || pParent->GetString(FX_BSTRC("Type")) == FX_BSTRC("StructTreeRoot")) {
150        if (!AddTopLevelNode(pDict, pElement)) {
151            pElement->Release();
152            map.RemoveKey(pDict);
153        }
154    } else {
155        CPDF_StructElementImpl* pParentElement = AddPageNode(pParent, map, nLevel + 1);
156        FX_BOOL bSave = FALSE;
157        for (int i = 0; i < pParentElement->m_Kids.GetSize(); i ++) {
158            if (pParentElement->m_Kids[i].m_Type != CPDF_StructKid::Element) {
159                continue;
160            }
161            if (pParentElement->m_Kids[i].m_Element.m_pDict != pDict) {
162                continue;
163            }
164            pParentElement->m_Kids[i].m_Element.m_pElement = pElement->Retain();
165            bSave = TRUE;
166        }
167        if (!bSave) {
168            pElement->Release();
169            map.RemoveKey(pDict);
170        }
171    }
172    return pElement;
173}
174FX_BOOL CPDF_StructTreeImpl::AddTopLevelNode(CPDF_Dictionary* pDict, CPDF_StructElementImpl* pElement)
175{
176    CPDF_Object *pObj = m_pTreeRoot->GetElementValue(FX_BSTRC("K"));
177    if (!pObj) {
178        return FALSE;
179    }
180    if (pObj->GetType() == PDFOBJ_DICTIONARY) {
181        if (pObj->GetObjNum() == pDict->GetObjNum()) {
182            if (m_Kids[0]) {
183                m_Kids[0]->Release();
184            }
185            m_Kids[0] = pElement->Retain();
186        } else {
187            return FALSE;
188        }
189    }
190    if (pObj->GetType() == PDFOBJ_ARRAY) {
191        CPDF_Array* pTopKids = (CPDF_Array*)pObj;
192        FX_DWORD i;
193        FX_BOOL bSave = FALSE;
194        for (i = 0; i < pTopKids->GetCount(); i ++) {
195            CPDF_Reference* pKidRef = (CPDF_Reference*)pTopKids->GetElement(i);
196            if (pKidRef->GetType() != PDFOBJ_REFERENCE || pKidRef->GetRefObjNum() != pDict->GetObjNum()) {
197                continue;
198            }
199            if (m_Kids[i]) {
200                m_Kids[i]->Release();
201            }
202            m_Kids[i] = pElement->Retain();
203            bSave = TRUE;
204        }
205        if (!bSave) {
206            return FALSE;
207        }
208    }
209    return TRUE;
210}
211CPDF_StructElementImpl::CPDF_StructElementImpl(CPDF_StructTreeImpl* pTree, CPDF_StructElementImpl* pParent, CPDF_Dictionary* pDict)
212    : m_RefCount(0)
213{
214    m_pTree = pTree;
215    m_pDict = pDict;
216    m_Type = pDict->GetString(FX_BSTRC("S"));
217    CFX_ByteString mapped = pTree->m_pRoleMap->GetString(m_Type);
218    if (!mapped.IsEmpty()) {
219        m_Type = mapped;
220    }
221    m_pParent = pParent;
222    LoadKids(pDict);
223}
224CPDF_StructElementImpl::~CPDF_StructElementImpl()
225{
226    for (int i = 0; i < m_Kids.GetSize(); i ++) {
227        if (m_Kids[i].m_Type == CPDF_StructKid::Element && m_Kids[i].m_Element.m_pElement) {
228            ((CPDF_StructElementImpl*)m_Kids[i].m_Element.m_pElement)->Release();
229        }
230    }
231}
232CPDF_StructElementImpl* CPDF_StructElementImpl::Retain()
233{
234    m_RefCount++;
235    return this;
236}
237void CPDF_StructElementImpl::Release()
238{
239    if(--m_RefCount < 1) {
240        delete this;
241    }
242}
243void CPDF_StructElementImpl::LoadKids(CPDF_Dictionary* pDict)
244{
245    CPDF_Object* pObj = pDict->GetElement(FX_BSTRC("Pg"));
246    FX_DWORD PageObjNum = 0;
247    if (pObj && pObj->GetType() == PDFOBJ_REFERENCE) {
248        PageObjNum = ((CPDF_Reference*)pObj)->GetRefObjNum();
249    }
250    CPDF_Object* pKids = pDict->GetElementValue(FX_BSTRC("K"));
251    if (pKids == NULL) {
252        return;
253    }
254    if (pKids->GetType() == PDFOBJ_ARRAY) {
255        CPDF_Array* pArray = (CPDF_Array*)pKids;
256        m_Kids.SetSize(pArray->GetCount());
257        for (FX_DWORD i = 0; i < pArray->GetCount(); i ++) {
258            CPDF_Object* pKid = pArray->GetElementValue(i);
259            LoadKid(PageObjNum, pKid, &m_Kids[i]);
260        }
261    } else {
262        m_Kids.SetSize(1);
263        LoadKid(PageObjNum, pKids, &m_Kids[0]);
264    }
265}
266void CPDF_StructElementImpl::LoadKid(FX_DWORD PageObjNum, CPDF_Object* pKidObj, CPDF_StructKid* pKid)
267{
268    pKid->m_Type = CPDF_StructKid::Invalid;
269    if (pKidObj == NULL) {
270        return;
271    }
272    if (pKidObj->GetType() == PDFOBJ_NUMBER) {
273        if (m_pTree->m_pPage && m_pTree->m_pPage->GetObjNum() != PageObjNum) {
274            return;
275        }
276        pKid->m_Type = CPDF_StructKid::PageContent;
277        pKid->m_PageContent.m_ContentId = pKidObj->GetInteger();
278        pKid->m_PageContent.m_PageObjNum = PageObjNum;
279        return;
280    }
281    if (pKidObj->GetType() != PDFOBJ_DICTIONARY) {
282        return;
283    }
284    CPDF_Dictionary* pKidDict = (CPDF_Dictionary*)pKidObj;
285    CPDF_Object* pPageObj = pKidDict->GetElement(FX_BSTRC("Pg"));
286    if (pPageObj && pPageObj->GetType() == PDFOBJ_REFERENCE) {
287        PageObjNum = ((CPDF_Reference*)pPageObj)->GetRefObjNum();
288    }
289    CFX_ByteString type = pKidDict->GetString(FX_BSTRC("Type"));
290    if (type == FX_BSTRC("MCR")) {
291        if (m_pTree->m_pPage && m_pTree->m_pPage->GetObjNum() != PageObjNum) {
292            return;
293        }
294        pKid->m_Type = CPDF_StructKid::StreamContent;
295        CPDF_Object* pStreamObj = pKidDict->GetElement(FX_BSTRC("Stm"));
296        if (pStreamObj && pStreamObj->GetType() == PDFOBJ_REFERENCE) {
297            pKid->m_StreamContent.m_RefObjNum = ((CPDF_Reference*)pStreamObj)->GetRefObjNum();
298        } else {
299            pKid->m_StreamContent.m_RefObjNum = 0;
300        }
301        pKid->m_StreamContent.m_PageObjNum = PageObjNum;
302        pKid->m_StreamContent.m_ContentId = pKidDict->GetInteger(FX_BSTRC("MCID"));
303    } else if (type == FX_BSTRC("OBJR")) {
304        if (m_pTree->m_pPage && m_pTree->m_pPage->GetObjNum() != PageObjNum) {
305            return;
306        }
307        pKid->m_Type = CPDF_StructKid::Object;
308        CPDF_Object* pObj = pKidDict->GetElement(FX_BSTRC("Obj"));
309        if (pObj && pObj->GetType() == PDFOBJ_REFERENCE) {
310            pKid->m_Object.m_RefObjNum = ((CPDF_Reference*)pObj)->GetRefObjNum();
311        } else {
312            pKid->m_Object.m_RefObjNum = 0;
313        }
314        pKid->m_Object.m_PageObjNum = PageObjNum;
315    } else {
316        pKid->m_Type = CPDF_StructKid::Element;
317        pKid->m_Element.m_pDict = pKidDict;
318        if (m_pTree->m_pPage == NULL) {
319            pKid->m_Element.m_pElement = FX_NEW CPDF_StructElementImpl(m_pTree, this, pKidDict);
320        } else {
321            pKid->m_Element.m_pElement = NULL;
322        }
323    }
324}
325static CPDF_Dictionary* FindAttrDict(CPDF_Object* pAttrs, FX_BSTR owner, FX_FLOAT nLevel = 0.0F)
326{
327    if (nLevel > nMaxRecursion) {
328        return NULL;
329    }
330    if (pAttrs == NULL) {
331        return NULL;
332    }
333    CPDF_Dictionary* pDict = NULL;
334    if (pAttrs->GetType() == PDFOBJ_DICTIONARY) {
335        pDict = (CPDF_Dictionary*)pAttrs;
336    } else if (pAttrs->GetType() == PDFOBJ_STREAM) {
337        pDict = ((CPDF_Stream*)pAttrs)->GetDict();
338    } else if (pAttrs->GetType() == PDFOBJ_ARRAY) {
339        CPDF_Array* pArray = (CPDF_Array*)pAttrs;
340        for (FX_DWORD i = 0; i < pArray->GetCount(); i ++) {
341            CPDF_Object* pElement = pArray->GetElementValue(i);
342            pDict = FindAttrDict(pElement, owner, nLevel + 1);
343            if (pDict) {
344                return pDict;
345            }
346        }
347    }
348    if (pDict && pDict->GetString(FX_BSTRC("O")) == owner) {
349        return pDict;
350    }
351    return NULL;
352}
353CPDF_Object* CPDF_StructElementImpl::GetAttr(FX_BSTR owner, FX_BSTR name, FX_BOOL bInheritable, FX_FLOAT fLevel)
354{
355    if (fLevel > nMaxRecursion) {
356        return NULL;
357    }
358    if (bInheritable) {
359        CPDF_Object* pAttr = GetAttr(owner, name, FALSE);
360        if (pAttr) {
361            return pAttr;
362        }
363        if (m_pParent == NULL) {
364            return NULL;
365        }
366        return m_pParent->GetAttr(owner, name, TRUE, fLevel + 1);
367    }
368    CPDF_Object* pA = m_pDict->GetElementValue(FX_BSTRC("A"));
369    if (pA) {
370        CPDF_Dictionary* pAttrDict = FindAttrDict(pA, owner);
371        if (pAttrDict) {
372            CPDF_Object* pAttr = pAttrDict->GetElementValue(name);
373            if (pAttr) {
374                return pAttr;
375            }
376        }
377    }
378    CPDF_Object* pC = m_pDict->GetElementValue(FX_BSTRC("C"));
379    if (pC == NULL) {
380        return NULL;
381    }
382    CPDF_Dictionary* pClassMap = m_pTree->m_pTreeRoot->GetDict(FX_BSTRC("ClassMap"));
383    if (pClassMap == NULL) {
384        return NULL;
385    }
386    if (pC->GetType() == PDFOBJ_ARRAY) {
387        CPDF_Array* pArray = (CPDF_Array*)pC;
388        for (FX_DWORD i = 0; i < pArray->GetCount(); i ++) {
389            CFX_ByteString class_name = pArray->GetString(i);
390            CPDF_Dictionary* pClassDict = pClassMap->GetDict(class_name);
391            if (pClassDict && pClassDict->GetString(FX_BSTRC("O")) == owner) {
392                return pClassDict->GetElementValue(name);
393            }
394        }
395        return NULL;
396    }
397    CFX_ByteString class_name = pC->GetString();
398    CPDF_Dictionary* pClassDict = pClassMap->GetDict(class_name);
399    if (pClassDict && pClassDict->GetString(FX_BSTRC("O")) == owner) {
400        return pClassDict->GetElementValue(name);
401    }
402    return NULL;
403}
404CPDF_Object* CPDF_StructElementImpl::GetAttr(FX_BSTR owner, FX_BSTR name, FX_BOOL bInheritable, int subindex)
405{
406    CPDF_Object* pAttr = GetAttr(owner, name, bInheritable);
407    if (pAttr == NULL || subindex == -1 || pAttr->GetType() != PDFOBJ_ARRAY) {
408        return pAttr;
409    }
410    CPDF_Array* pArray = (CPDF_Array*)pAttr;
411    if (subindex >= (int)pArray->GetCount()) {
412        return pAttr;
413    }
414    return pArray->GetElementValue(subindex);
415}
416CFX_ByteString CPDF_StructElementImpl::GetName(FX_BSTR owner, FX_BSTR name, FX_BSTR default_value, FX_BOOL bInheritable, int subindex)
417{
418    CPDF_Object* pAttr = GetAttr(owner, name, bInheritable, subindex);
419    if (pAttr == NULL || pAttr->GetType() != PDFOBJ_NAME) {
420        return default_value;
421    }
422    return pAttr->GetString();
423}
424FX_ARGB	CPDF_StructElementImpl::GetColor(FX_BSTR owner, FX_BSTR name, FX_ARGB default_value, FX_BOOL bInheritable, int subindex)
425{
426    CPDF_Object* pAttr = GetAttr(owner, name, bInheritable, subindex);
427    if (pAttr == NULL || pAttr->GetType() != PDFOBJ_ARRAY) {
428        return default_value;
429    }
430    CPDF_Array* pArray = (CPDF_Array*)pAttr;
431    return 0xff000000 | ((int)(pArray->GetNumber(0) * 255) << 16) | ((int)(pArray->GetNumber(1) * 255) << 8) | (int)(pArray->GetNumber(2) * 255);
432}
433FX_FLOAT CPDF_StructElementImpl::GetNumber(FX_BSTR owner, FX_BSTR name, FX_FLOAT default_value, FX_BOOL bInheritable, int subindex)
434{
435    CPDF_Object* pAttr = GetAttr(owner, name, bInheritable, subindex);
436    if (pAttr == NULL || pAttr->GetType() != PDFOBJ_NUMBER) {
437        return default_value;
438    }
439    return pAttr->GetNumber();
440}
441int	CPDF_StructElementImpl::GetInteger(FX_BSTR owner, FX_BSTR name, int default_value, FX_BOOL bInheritable, int subindex)
442{
443    CPDF_Object* pAttr = GetAttr(owner, name, bInheritable, subindex);
444    if (pAttr == NULL || pAttr->GetType() != PDFOBJ_NUMBER) {
445        return default_value;
446    }
447    return pAttr->GetInteger();
448}
449