1 /*
2  * Copyright 2015 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkPDFMetadata.h"
9 
10 #include "SkMD5.h"
11 #include "SkMilestone.h"
12 #include "SkPDFTypes.h"
13 #include "SkTo.h"
14 #include "SkUtils.h"
15 
16 #include <utility>
17 
// Two-level stringification: SKPDF_STRING expands its argument before
// SKPDF_STRING_IMPL turns it into a string literal, so passing a macro such as
// SK_MILESTONE yields that macro's value rather than its name.
#define SKPDF_STRING_IMPL(X) #X
#define SKPDF_STRING(X) SKPDF_STRING_IMPL(X)

// Info-dictionary key used to record the Skia producer string when the client
// supplies its own Producer value.
#define SKPDF_CUSTOM_PRODUCER_KEY "ProductionLibrary"

// Producer string identifying this library and its milestone.
#define SKPDF_PRODUCER "Skia/PDF m" SKPDF_STRING(SK_MILESTONE)
22 
// All-zero DateTime.  Creation/modification dates that compare equal to this
// value are not written to the document.
static constexpr SkTime::DateTime kZeroTime = {0, 0, 0, 0, 0, 0, 0, 0};
24 
25 static bool operator!=(const SkTime::DateTime& u, const SkTime::DateTime& v) {
26     return u.fTimeZoneMinutes != v.fTimeZoneMinutes ||
27            u.fYear != v.fYear ||
28            u.fMonth != v.fMonth ||
29            u.fDayOfWeek != v.fDayOfWeek ||
30            u.fDay != v.fDay ||
31            u.fHour != v.fHour ||
32            u.fMinute != v.fMinute ||
33            u.fSecond != v.fSecond;
34 }
35 
36 static SkString pdf_date(const SkTime::DateTime& dt) {
37     int timeZoneMinutes = SkToInt(dt.fTimeZoneMinutes);
38     char timezoneSign = timeZoneMinutes >= 0 ? '+' : '-';
39     int timeZoneHours = SkTAbs(timeZoneMinutes) / 60;
40     timeZoneMinutes = SkTAbs(timeZoneMinutes) % 60;
41     return SkStringPrintf(
42             "D:%04u%02u%02u%02u%02u%02u%c%02d'%02d'",
43             static_cast<unsigned>(dt.fYear), static_cast<unsigned>(dt.fMonth),
44             static_cast<unsigned>(dt.fDay), static_cast<unsigned>(dt.fHour),
45             static_cast<unsigned>(dt.fMinute),
46             static_cast<unsigned>(dt.fSecond), timezoneSign, timeZoneHours,
47             timeZoneMinutes);
48 }
49 
// Returns true when every one of the |len| bytes at |src| can be emitted
// unchanged, i.e. no byte lies in 24..31 and no byte is 127 or above.
// See Table D.2 (PDFDocEncoding Character Set) in the PDF32000_2008 spec.
static bool utf8_is_pdfdocencoding(const char* src, size_t len) {
    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(src);
    for (size_t i = 0; i < len; ++i) {
        const uint8_t b = bytes[i];
        const bool inRejectedControlRange = b >= 24 && b <= 31;
        const bool beyondPrintableAscii = b >= 127;
        if (inRejectedControlRange || beyondPrintableAscii) {
            return false;
        }
    }
    return true;
}
61 
// Stores |value| at *ptr as two bytes, high byte first, and advances *ptr
// past them.
void write_utf16be(char** ptr, uint16_t value) {
    char* dst = *ptr;
    dst[0] = (value >> 8);
    dst[1] = (value & 0xFF);
    *ptr = dst + 2;
}
66 
67 // Please Note:  This "abuses" the SkString, which "should" only hold UTF8.
68 // But the SkString is written as if it is really just a ref-counted array of
69 // chars, so this works, as long as we handle endiness and conversions ourselves.
70 //
71 // Input:  UTF-8
72 // Output  UTF-16-BE
73 static SkString to_utf16be(const char* src, size_t len) {
74     SkString ret;
75     const char* const end = src + len;
76     size_t n = 1;  // BOM
77     for (const char* ptr = src; ptr < end;) {
78         SkUnichar u = SkUTF::NextUTF8(&ptr, end);
79         if (u < 0) {
80             break;
81         }
82         n += SkUTF::ToUTF16(u);
83     }
84     ret.resize(2 * n);
85     char* out = ret.writable_str();
86     write_utf16be(&out, 0xFEFF);  // BOM
87     for (const char* ptr = src; ptr < end;) {
88         SkUnichar u = SkUTF::NextUTF8(&ptr, end);
89         if (u < 0) {
90             break;
91         }
92         uint16_t utf16[2];
93         size_t l = SkUTF::ToUTF16(u, utf16);
94         write_utf16be(&out, utf16[0]);
95         if (l == 2) {
96             write_utf16be(&out, utf16[1]);
97         }
98     }
99     SkASSERT(out == ret.writable_str() + 2 * n);
100     return ret;
101 }
102 
103 // Input:  UTF-8
104 // Output  UTF-16-BE OR PDFDocEncoding (if that encoding is identical to ASCII encoding).
105 //
106 // See sections 14.3.3 (Document Information Dictionary) and 7.9.2.2 (Text String Type)
107 // of the PDF32000_2008 spec.
108 static SkString convert(const SkString& s) {
109     return utf8_is_pdfdocencoding(s.c_str(), s.size()) ? s : to_utf16be(s.c_str(), s.size());
110 }
111 static SkString convert(const char* src) {
112     size_t len = strlen(src);
113     return utf8_is_pdfdocencoding(src, len) ? SkString(src, len) : to_utf16be(src, len);
114 }
115 
116 namespace {
117 static const struct {
118     const char* const key;
119     SkString SkPDF::Metadata::*const valuePtr;
120 } gMetadataKeys[] = {
121         {"Title", &SkPDF::Metadata::fTitle},
122         {"Author", &SkPDF::Metadata::fAuthor},
123         {"Subject", &SkPDF::Metadata::fSubject},
124         {"Keywords", &SkPDF::Metadata::fKeywords},
125         {"Creator", &SkPDF::Metadata::fCreator},
126 };
127 }  // namespace
128 
129 std::unique_ptr<SkPDFObject> SkPDFMetadata::MakeDocumentInformationDict(
130         const SkPDF::Metadata& metadata) {
131     auto dict = SkPDFMakeDict();
132     for (const auto keyValuePtr : gMetadataKeys) {
133         const SkString& value = metadata.*(keyValuePtr.valuePtr);
134         if (value.size() > 0) {
135             dict->insertString(keyValuePtr.key, convert(value));
136         }
137     }
138     if (metadata.fProducer.isEmpty()) {
139         dict->insertString("Producer", convert(SKPDF_PRODUCER));
140     } else {
141         dict->insertString("Producer", convert(metadata.fProducer));
142         dict->insertString(SKPDF_CUSTOM_PRODUCER_KEY, convert(SKPDF_PRODUCER));
143     }
144     if (metadata.fCreation != kZeroTime) {
145         dict->insertString("CreationDate", pdf_date(metadata.fCreation));
146     }
147     if (metadata.fModified != kZeroTime) {
148         dict->insertString("ModDate", pdf_date(metadata.fModified));
149     }
150     return std::move(dict);
151 }
152 
153 SkUUID SkPDFMetadata::CreateUUID(const SkPDF::Metadata& metadata) {
154     // The main requirement is for the UUID to be unique; the exact
155     // format of the data that will be hashed is not important.
156     SkMD5 md5;
157     const char uuidNamespace[] = "org.skia.pdf\n";
158     md5.writeText(uuidNamespace);
159     double msec = SkTime::GetMSecs();
160     md5.write(&msec, sizeof(msec));
161     SkTime::DateTime dateTime;
162     SkTime::GetDateTime(&dateTime);
163     md5.write(&dateTime, sizeof(dateTime));
164     md5.write(&metadata.fCreation, sizeof(metadata.fCreation));
165     md5.write(&metadata.fModified, sizeof(metadata.fModified));
166 
167     for (const auto keyValuePtr : gMetadataKeys) {
168         md5.writeText(keyValuePtr.key);
169         md5.write("\037", 1);
170         const SkString& value = metadata.*(keyValuePtr.valuePtr);
171         md5.write(value.c_str(), value.size());
172         md5.write("\036", 1);
173     }
174     SkMD5::Digest digest;
175     md5.finish(digest);
176     // See RFC 4122, page 6-7.
177     digest.data[6] = (digest.data[6] & 0x0F) | 0x30;
178     digest.data[8] = (digest.data[6] & 0x3F) | 0x80;
179     static_assert(sizeof(digest) == sizeof(SkUUID), "uuid_size");
180     SkUUID uuid;
181     memcpy(&uuid, &digest, sizeof(digest));
182     return uuid;
183 }
184 
185 std::unique_ptr<SkPDFObject> SkPDFMetadata::MakePdfId(const SkUUID& doc,
186                                             const SkUUID& instance) {
187     // /ID [ <81b14aafa313db63dbd6f981e49f94f4>
188     //       <81b14aafa313db63dbd6f981e49f94f4> ]
189     auto array = SkPDFMakeArray();
190     static_assert(sizeof(SkUUID) == 16, "uuid_size");
191     array->appendString(
192             SkString(reinterpret_cast<const char*>(&doc), sizeof(SkUUID)));
193     array->appendString(
194             SkString(reinterpret_cast<const char*>(&instance), sizeof(SkUUID)));
195     return std::move(array);
196 }
197 
198 // Convert a block of memory to hexadecimal.  Input and output pointers will be
199 // moved to end of the range.
200 static void hexify(const uint8_t** inputPtr, char** outputPtr, int count) {
201     SkASSERT(inputPtr && *inputPtr);
202     SkASSERT(outputPtr && *outputPtr);
203     while (count-- > 0) {
204         uint8_t value = *(*inputPtr)++;
205         *(*outputPtr)++ = SkHexadecimalDigits::gLower[value >> 4];
206         *(*outputPtr)++ = SkHexadecimalDigits::gLower[value & 0xF];
207     }
208 }
209 
210 static SkString uuid_to_string(const SkUUID& uuid) {
211     //  8-4-4-4-12
212     char buffer[36];  // [32 + 4]
213     char* ptr = buffer;
214     const uint8_t* data = uuid.fData;
215     hexify(&data, &ptr, 4);
216     *ptr++ = '-';
217     hexify(&data, &ptr, 2);
218     *ptr++ = '-';
219     hexify(&data, &ptr, 2);
220     *ptr++ = '-';
221     hexify(&data, &ptr, 2);
222     *ptr++ = '-';
223     hexify(&data, &ptr, 6);
224     SkASSERT(ptr == buffer + 36);
225     SkASSERT(data == uuid.fData + 16);
226     return SkString(buffer, 36);
227 }
228 
229 namespace {
230 class PDFXMLObject final : public SkPDFObject {
231 public:
232     PDFXMLObject(SkString xml) : fXML(std::move(xml)) {}
233     void emitObject(SkWStream* stream) const override {
234         SkPDFDict dict("Metadata");
235         dict.insertName("Subtype", "XML");
236         dict.insertInt("Length", fXML.size());
237         dict.emitObject(stream);
238         static const char streamBegin[] = " stream\n";
239         stream->writeText(streamBegin);
240         // Do not compress this.  The standard requires that a
241         // program that does not understand PDF can grep for
242         // "<?xpacket" and extract the entire XML.
243         stream->write(fXML.c_str(), fXML.size());
244         static const char streamEnd[] = "\nendstream";
245         stream->writeText(streamEnd);
246     }
247 
248 private:
249     const SkString fXML;
250 };
251 }  // namespace
252 
253 static int count_xml_escape_size(const SkString& input) {
254     int extra = 0;
255     for (size_t i = 0; i < input.size(); ++i) {
256         if (input[i] == '&') {
257             extra += 4;  // strlen("&amp;") - strlen("&")
258         } else if (input[i] == '<') {
259             extra += 3;  // strlen("&lt;") - strlen("<")
260         }
261     }
262     return extra;
263 }
264 
265 const SkString escape_xml(const SkString& input,
266                           const char* before = nullptr,
267                           const char* after = nullptr) {
268     if (input.size() == 0) {
269         return input;
270     }
271     // "&" --> "&amp;" and  "<" --> "&lt;"
272     // text is assumed to be in UTF-8
273     // all strings are xml content, not attribute values.
274     size_t beforeLen = before ? strlen(before) : 0;
275     size_t afterLen = after ? strlen(after) : 0;
276     int extra = count_xml_escape_size(input);
277     SkString output(input.size() + extra + beforeLen + afterLen);
278     char* out = output.writable_str();
279     if (before) {
280         strncpy(out, before, beforeLen);
281         out += beforeLen;
282     }
283     static const char kAmp[] = "&amp;";
284     static const char kLt[] = "&lt;";
285     for (size_t i = 0; i < input.size(); ++i) {
286         if (input[i] == '&') {
287             strncpy(out, kAmp, strlen(kAmp));
288             out += strlen(kAmp);
289         } else if (input[i] == '<') {
290             strncpy(out, kLt, strlen(kLt));
291             out += strlen(kLt);
292         } else {
293             *out++ = input[i];
294         }
295     }
296     if (after) {
297         strncpy(out, after, afterLen);
298         out += afterLen;
299     }
300     // Validate that we haven't written outside of our string.
301     SkASSERT(out == &output.writable_str()[output.size()]);
302     *out = '\0';
303     return output;
304 }
305 
306 SkPDFIndirectReference SkPDFMetadata::MakeXMPObject(
307         const SkPDF::Metadata& metadata,
308         const SkUUID& doc,
309         const SkUUID& instance,
310         SkPDFDocument* docPtr) {
311     static const char templateString[] =
312             "<?xpacket begin=\"\" id=\"W5M0MpCehiHzreSzNTczkc9d\"?>\n"
313             "<x:xmpmeta xmlns:x=\"adobe:ns:meta/\"\n"
314             " x:xmptk=\"Adobe XMP Core 5.4-c005 78.147326, "
315             "2012/08/23-13:03:03\">\n"
316             "<rdf:RDF "
317             "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n"
318             "<rdf:Description rdf:about=\"\"\n"
319             " xmlns:xmp=\"http://ns.adobe.com/xap/1.0/\"\n"
320             " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n"
321             " xmlns:xmpMM=\"http://ns.adobe.com/xap/1.0/mm/\"\n"
322             " xmlns:pdf=\"http://ns.adobe.com/pdf/1.3/\"\n"
323             " xmlns:pdfaid=\"http://www.aiim.org/pdfa/ns/id/\">\n"
324             "<pdfaid:part>2</pdfaid:part>\n"
325             "<pdfaid:conformance>B</pdfaid:conformance>\n"
326             "%s"  // ModifyDate
327             "%s"  // CreateDate
328             "%s"  // xmp:CreatorTool
329             "<dc:format>application/pdf</dc:format>\n"
330             "%s"  // dc:title
331             "%s"  // dc:description
332             "%s"  // author
333             "%s"  // keywords
334             "<xmpMM:DocumentID>uuid:%s</xmpMM:DocumentID>\n"
335             "<xmpMM:InstanceID>uuid:%s</xmpMM:InstanceID>\n"
336             "%s"  // pdf:Producer
337             "%s"  // pdf:Keywords
338             "</rdf:Description>\n"
339             "</rdf:RDF>\n"
340             "</x:xmpmeta>\n"  // Note:  the standard suggests 4k of padding.
341             "<?xpacket end=\"w\"?>\n";
342 
343     SkString creationDate;
344     SkString modificationDate;
345     if (metadata.fCreation != kZeroTime) {
346         SkString tmp;
347         metadata.fCreation.toISO8601(&tmp);
348         SkASSERT(0 == count_xml_escape_size(tmp));
349         // YYYY-mm-ddTHH:MM:SS[+|-]ZZ:ZZ; no need to escape
350         creationDate = SkStringPrintf("<xmp:CreateDate>%s</xmp:CreateDate>\n",
351                                       tmp.c_str());
352     }
353     if (metadata.fModified != kZeroTime) {
354         SkString tmp;
355         metadata.fModified.toISO8601(&tmp);
356         SkASSERT(0 == count_xml_escape_size(tmp));
357         modificationDate = SkStringPrintf(
358                 "<xmp:ModifyDate>%s</xmp:ModifyDate>\n", tmp.c_str());
359     }
360     SkString title =
361             escape_xml(metadata.fTitle,
362                        "<dc:title><rdf:Alt><rdf:li xml:lang=\"x-default\">",
363                        "</rdf:li></rdf:Alt></dc:title>\n");
364     SkString author =
365             escape_xml(metadata.fAuthor, "<dc:creator><rdf:Bag><rdf:li>",
366                        "</rdf:li></rdf:Bag></dc:creator>\n");
367     // TODO: in theory, XMP can support multiple authors.  Split on a delimiter?
368     SkString subject = escape_xml(
369             metadata.fSubject,
370             "<dc:description><rdf:Alt><rdf:li xml:lang=\"x-default\">",
371             "</rdf:li></rdf:Alt></dc:description>\n");
372     SkString keywords1 =
373             escape_xml(metadata.fKeywords, "<dc:subject><rdf:Bag><rdf:li>",
374                        "</rdf:li></rdf:Bag></dc:subject>\n");
375     SkString keywords2 = escape_xml(metadata.fKeywords, "<pdf:Keywords>",
376                                     "</pdf:Keywords>\n");
377     // TODO: in theory, keywords can be a list too.
378 
379     SkString producer("<pdf:Producer>" SKPDF_PRODUCER "</pdf:Producer>\n");
380     if (!metadata.fProducer.isEmpty()) {
381         // TODO: register a developer prefix to make
382         // <skia:SKPDF_CUSTOM_PRODUCER_KEY> a real XML tag.
383         producer = escape_xml(
384                 metadata.fProducer, "<pdf:Producer>",
385                 "</pdf:Producer>\n<!-- <skia:" SKPDF_CUSTOM_PRODUCER_KEY ">"
386                 SKPDF_PRODUCER "</skia:" SKPDF_CUSTOM_PRODUCER_KEY "> -->\n");
387     }
388 
389     SkString creator = escape_xml(metadata.fCreator, "<xmp:CreatorTool>",
390                                   "</xmp:CreatorTool>\n");
391     SkString documentID = uuid_to_string(doc);  // no need to escape
392     SkASSERT(0 == count_xml_escape_size(documentID));
393     SkString instanceID = uuid_to_string(instance);
394     SkASSERT(0 == count_xml_escape_size(instanceID));
395 
396 
397     auto value = SkStringPrintf(
398             templateString, modificationDate.c_str(), creationDate.c_str(),
399             creator.c_str(), title.c_str(), subject.c_str(), author.c_str(),
400             keywords1.c_str(), documentID.c_str(), instanceID.c_str(),
401             producer.c_str(), keywords2.c_str());
402 
403     std::unique_ptr<SkPDFDict> dict = SkPDFMakeDict("Metadata");
404     dict->insertName("Subtype", "XML");
405     return SkPDFStreamOut(std::move(dict),
406                           SkMemoryStream::MakeCopy(value.c_str(), value.size()),
407                           docPtr, false);
408 }
409 
// Undefine the helper macros defined at the top of this file.
#undef SKPDF_CUSTOM_PRODUCER_KEY
#undef SKPDF_PRODUCER
#undef SKPDF_STRING
#undef SKPDF_STRING_IMPL
414