1 /*
2  * Copyright 2018 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkPDFDocumentPriv.h"
9 #include "SkPDFTag.h"
10 
11 // Table 333 in PDF 32000-1:2008
12 static const char* tag_name_from_type(SkPDF::DocumentStructureType type) {
13     switch (type) {
14         #define M(X) case SkPDF::DocumentStructureType::k ## X: return #X
15         M(Document);
16         M(Part);
17         M(Art);
18         M(Sect);
19         M(Div);
20         M(BlockQuote);
21         M(Caption);
22         M(TOC);
23         M(TOCI);
24         M(Index);
25         M(NonStruct);
26         M(Private);
27         M(H);
28         M(H1);
29         M(H2);
30         M(H3);
31         M(H4);
32         M(H5);
33         M(H6);
34         M(P);
35         M(L);
36         M(LI);
37         M(Lbl);
38         M(LBody);
39         M(Table);
40         M(TR);
41         M(TH);
42         M(TD);
43         M(THead);
44         M(TBody);
45         M(TFoot);
46         M(Span);
47         M(Quote);
48         M(Note);
49         M(Reference);
50         M(BibEntry);
51         M(Code);
52         M(Link);
53         M(Annot);
54         M(Ruby);
55         M(RB);
56         M(RT);
57         M(RP);
58         M(Warichu);
59         M(WT);
60         M(WP);
61         M(Figure);
62         M(Formula);
63         M(Form);
64         #undef M
65     }
66     SK_ABORT("bad tag");
67     return "";
68 }
69 
70 struct SkPDFTagNode {
71     SkPDFTagNode* fChildren = nullptr;
72     size_t fChildCount = 0;
73     struct MarkedContentInfo {
74         unsigned fPageIndex;
75         int fMarkId;
76     };
77     SkTArray<MarkedContentInfo> fMarkedContent;
78     int fNodeId;
79     SkPDF::DocumentStructureType fType;
80     SkPDFIndirectReference fRef;
81     enum State {
82         kUnknown,
83         kYes,
84         kNo,
85     } fCanDiscard = kUnknown;
86 };
87 
88 SkPDFTagTree::SkPDFTagTree() : fArena(4 * sizeof(SkPDFTagNode)) {}
89 
90 SkPDFTagTree::~SkPDFTagTree() = default;
91 
92 static void copy(const SkPDF::StructureElementNode& node,
93                  SkPDFTagNode* dst,
94                  SkArenaAlloc* arena,
95                  SkTHashMap<int, SkPDFTagNode*>* nodeMap) {
96     nodeMap->set(node.fNodeId, dst);
97     size_t childCount = node.fChildCount;
98     SkPDFTagNode* children = arena->makeArray<SkPDFTagNode>(childCount);
99     dst->fChildCount = childCount;
100     dst->fNodeId = node.fNodeId;
101     dst->fType = node.fType;
102     dst->fChildren = children;
103     for (size_t i = 0; i < childCount; ++i) {
104         copy(node.fChildren[i], &children[i], arena, nodeMap);
105     }
106 }
107 
108 void SkPDFTagTree::init(const SkPDF::StructureElementNode* node) {
109     if (node) {
110         fRoot = fArena.make<SkPDFTagNode>();
111         copy(*node, fRoot, &fArena, &fNodeMap);
112     }
113 }
114 
115 void SkPDFTagTree::reset() {
116     fArena.reset();
117     fNodeMap.reset();
118     fMarksPerPage.reset();
119     fRoot = nullptr;
120 }
121 
122 int SkPDFTagTree::getMarkIdForNodeId(int nodeId, unsigned pageIndex) {
123     if (!fRoot) {
124         return -1;
125     }
126     SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
127     if (!tagPtr) {
128         return -1;
129     }
130     SkPDFTagNode* tag = *tagPtr;
131     SkASSERT(tag);
132     while (fMarksPerPage.size() < pageIndex + 1) {
133         fMarksPerPage.push_back();
134     }
135     SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[pageIndex];
136     int markId = pageMarks.count();
137     tag->fMarkedContent.push_back({pageIndex, markId});
138     pageMarks.push_back(tag);
139     return markId;
140 }
141 
142 static bool can_discard(SkPDFTagNode* node) {
143     if (node->fCanDiscard == SkPDFTagNode::kYes) {
144         return true;
145     }
146     if (node->fCanDiscard == SkPDFTagNode::kNo) {
147         return false;
148     }
149     if (!node->fMarkedContent.empty()) {
150         node->fCanDiscard = SkPDFTagNode::kNo;
151         return false;
152     }
153     for (size_t i = 0; i < node->fChildCount; ++i) {
154         if (!can_discard(&node->fChildren[i])) {
155             node->fCanDiscard = SkPDFTagNode::kNo;
156             return false;
157         }
158     }
159     node->fCanDiscard = SkPDFTagNode::kYes;
160     return true;
161 }
162 
163 
164 SkPDFIndirectReference prepare_tag_tree_to_emit(SkPDFIndirectReference parent,
165                                                 SkPDFTagNode* node,
166                                                 SkPDFDocument* doc) {
167     SkPDFIndirectReference ref = doc->reserveRef();
168     std::unique_ptr<SkPDFArray> kids = SkPDFMakeArray();
169     SkPDFTagNode* children = node->fChildren;
170     size_t childCount = node->fChildCount;
171     for (size_t i = 0; i < childCount; ++i) {
172         SkPDFTagNode* child = &children[i];
173         if (!(can_discard(child))) {
174             kids->appendRef(prepare_tag_tree_to_emit(ref, child, doc));
175         }
176     }
177     for (const SkPDFTagNode::MarkedContentInfo& info : node->fMarkedContent) {
178         std::unique_ptr<SkPDFDict> mcr = SkPDFMakeDict("MCR");
179         mcr->insertRef("Pg", doc->getPage(info.fPageIndex));
180         mcr->insertInt("MCID", info.fMarkId);
181         kids->appendObject(std::move(mcr));
182     }
183     node->fRef = ref;
184     SkPDFDict dict("StructElem");
185     dict.insertName("S", tag_name_from_type(node->fType));
186     dict.insertRef("P", parent);
187     dict.insertObject("K", std::move(kids));
188     return doc->emit(dict, ref);
189 }
190 
191 SkPDFIndirectReference SkPDFTagTree::makeStructTreeRoot(SkPDFDocument* doc) {
192     if (!fRoot) {
193         return SkPDFIndirectReference();
194     }
195     if (can_discard(fRoot)) {
196         SkDEBUGFAIL("PDF has tag tree but no marked content.");
197     }
198     SkPDFIndirectReference ref = doc->reserveRef();
199 
200     unsigned pageCount = SkToUInt(doc->pageCount());
201 
202     // Build the StructTreeRoot.
203     SkPDFDict structTreeRoot("StructTreeRoot");
204     structTreeRoot.insertRef("K", prepare_tag_tree_to_emit(ref, fRoot, doc));
205     structTreeRoot.insertInt("ParentTreeNextKey", SkToInt(pageCount));
206 
207     // Build the parent tree, which is a mapping from the marked
208     // content IDs on each page to their corressponding tags.
209     SkPDFDict parentTree("ParentTree");
210     auto parentTreeNums = SkPDFMakeArray();
211 
212     SkASSERT(fMarksPerPage.size() <= pageCount);
213     for (size_t j = 0; j < fMarksPerPage.size(); ++j) {
214         const SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[j];
215         SkPDFArray markToTagArray;
216         for (SkPDFTagNode* mark : pageMarks) {
217             SkASSERT(mark->fRef);
218             markToTagArray.appendRef(mark->fRef);
219         }
220         parentTreeNums->appendInt(j);
221         parentTreeNums->appendRef(doc->emit(markToTagArray));
222     }
223     parentTree.insertObject("Nums", std::move(parentTreeNums));
224     structTreeRoot.insertRef("ParentTree", doc->emit(parentTree));
225     return doc->emit(structTreeRoot, ref);
226 }
227 
228