1 /*
2  * Copyright 2013 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkPdfConfig.h"
9 #include "SkPdfDiffEncoder.h"
10 #include "SkPdfNativeObject.h"
11 #include "SkPdfNativeTokenizer.h"
12 #include "SkPdfUtils.h"
13 
14 // TODO(edisonn): mac builder does not find the header ... but from headers is ok
15 //#include "SkPdfStreamCommonDictionary_autogen.h"
16 //#include "SkPdfImageDictionary_autogen.h"
17 #include "SkPdfHeaders_autogen.h"
18 
19 
20 // TODO(edisonn): Perf, Make this function run faster.
21 // There could be 0s between start and end.
22 // needle will not contain 0s.
strrstrk(char * hayStart,char * hayEnd,const char * needle)23 static char* strrstrk(char* hayStart, char* hayEnd, const char* needle) {
24     size_t needleLen = strlen(needle);
25     if ((isPdfWhiteSpaceOrPdfDelimiter(*(hayStart+needleLen)) || (hayStart+needleLen == hayEnd)) &&
26             strncmp(hayStart, needle, needleLen) == 0) {
27         return hayStart;
28     }
29 
30     hayStart++;
31 
32     while (hayStart < hayEnd) {
33         if (isPdfWhiteSpaceOrPdfDelimiter(*(hayStart-1)) &&
34                 (isPdfWhiteSpaceOrPdfDelimiter(*(hayStart+needleLen)) ||
35                       (hayStart+needleLen == hayEnd)) &&
36                 strncmp(hayStart, needle, needleLen) == 0) {
37             return hayStart;
38         }
39         hayStart++;
40     }
41     return NULL;
42 }
43 
skipPdfWhiteSpaces(const unsigned char * start,const unsigned char * end)44 const unsigned char* skipPdfWhiteSpaces(const unsigned char* start, const unsigned char* end) {
45     while (start < end && (isPdfWhiteSpace(*start) || *start == kComment_PdfDelimiter)) {
46         TRACE_COMMENT(*start);
47         if (*start == kComment_PdfDelimiter) {
48             // skip the comment until end of line
49             while (start < end && !isPdfEOL(*start)) {
50                 start++;
51                 TRACE_COMMENT(*start);
52             }
53         } else {
54             start++;
55         }
56     }
57     return start;
58 }
59 
endOfPdfToken(const unsigned char * start,const unsigned char * end)60 const unsigned char* endOfPdfToken(const unsigned char* start, const unsigned char* end) {
61     SkASSERT(!isPdfWhiteSpace(*start));
62 
63     if (start < end && isPdfDelimiter(*start)) {
64         TRACE_TK(*start);
65         start++;
66         return start;
67     }
68 
69     while (start < end && !isPdfWhiteSpaceOrPdfDelimiter(*start)) {
70         TRACE_TK(*start);
71         start++;
72     }
73     return start;
74 }
75 
76 // The parsing should end with a ].
readArray(const unsigned char * start,const unsigned char * end,SkPdfNativeObject * array,SkPdfAllocator * allocator,SkPdfNativeDoc * doc)77 static const unsigned char* readArray(const unsigned char* start, const unsigned char* end,
78                                       SkPdfNativeObject* array,
79                                       SkPdfAllocator* allocator, SkPdfNativeDoc* doc) {
80     SkPdfNativeObject::makeEmptyArray(array);
81     // PUT_TRACK_STREAM(array, start, start)
82 
83     if (allocator == NULL) {
84         // TODO(edisonn): report/warning error/assert
85         return end;
86     }
87 
88     while (start < end) {
89         // skip white spaces
90         start = skipPdfWhiteSpaces(start, end);
91 
92         const unsigned char* endOfToken = endOfPdfToken(start, end);
93 
94         if (endOfToken == start) {
95             // TODO(edisonn): report error in pdf file (end of stream with ] for end of aray
96             return start;
97         }
98 
99         if (endOfToken == start + 1 && *start == kClosedSquareBracket_PdfDelimiter) {
100             return endOfToken;
101         }
102 
103         SkPdfNativeObject* newObj = allocator->allocObject();
104         start = nextObject(start, end, newObj, allocator, doc);
105         // TODO(edisonn): perf/memory: put the variables on the stack, and flush them on the array
106         // only when we are sure they are not references!
107         if (newObj->isKeywordReference() && array->size() >= 2 &&
108                 array->objAtAIndex(SkToInt(array->size() - 1))->isInteger() &&
109                 array->objAtAIndex(SkToInt(array->size() - 2))->isInteger()) {
110             SkPdfNativeObject* gen = array->removeLastInArray();
111             SkPdfNativeObject* id = array->removeLastInArray();
112 
113             SkPdfNativeObject::resetAndMakeReference((unsigned int)id->intValue(),
114                                                      (unsigned int)gen->intValue(), newObj);
115             // newObj  PUT_TRACK_PARAMETERS_OBJ2(id, newObj) - store end, as now
116         }
117         array->appendInArray(newObj);
118     }
119     // TODO(edisonn): report not reached, we should never get here
120     // TODO(edisonn): there might be a bug here, enable an assert and run it on files
121     // or it might be that the files were actually corrupted
122     return start;
123 }
124 
readString(const unsigned char * start,const unsigned char * end,unsigned char * out)125 static const unsigned char* readString(const unsigned char* start, const unsigned char* end,
126                                        unsigned char* out) {
127     const unsigned char* in = start;
128     bool hasOut = (out != NULL);
129 
130     int openRoundBrackets = 1;
131     while (in < end) {
132         openRoundBrackets += ((*in) == kOpenedRoundBracket_PdfDelimiter);
133         openRoundBrackets -= ((*in) == kClosedRoundBracket_PdfDelimiter);
134         if (openRoundBrackets == 0) {
135             in++;   // consumed )
136             break;
137         }
138 
139         if (*in == kEscape_PdfSpecial) {
140             if (in + 1 < end) {
141                 switch (in[1]) {
142                     case 'n':
143                         if (hasOut) { *out = kLF_PdfWhiteSpace; }
144                         out++;
145                         in += 2;
146                         break;
147 
148                     case 'r':
149                         if (hasOut) { *out = kCR_PdfWhiteSpace; }
150                         out++;
151                         in += 2;
152                         break;
153 
154                     case 't':
155                         if (hasOut) { *out = kHT_PdfWhiteSpace; }
156                         out++;
157                         in += 2;
158                         break;
159 
160                     case 'b':
161                         // TODO(edisonn): any special meaning to backspace?
162                         if (hasOut) { *out = kBackspace_PdfSpecial; }
163                         out++;
164                         in += 2;
165                         break;
166 
167                     case 'f':
168                         if (hasOut) { *out = kFF_PdfWhiteSpace; }
169                         out++;
170                         in += 2;
171                         break;
172 
173                     case kOpenedRoundBracket_PdfDelimiter:
174                         if (hasOut) { *out = kOpenedRoundBracket_PdfDelimiter; }
175                         out++;
176                         in += 2;
177                         break;
178 
179                     case kClosedRoundBracket_PdfDelimiter:
180                         if (hasOut) { *out = kClosedRoundBracket_PdfDelimiter; }
181                         out++;
182                         in += 2;
183                         break;
184 
185                     case kEscape_PdfSpecial:
186                         if (hasOut) { *out = kEscape_PdfSpecial; }
187                         out++;
188                         in += 2;
189                         break;
190 
191                     case '0':
192                     case '1':
193                     case '2':
194                     case '3':
195                     case '4':
196                     case '5':
197                     case '6':
198                     case '7': {
199                             //read octals
200                             in++;   // consume backslash
201 
202                             int code = 0;
203                             int i = 0;
204                             while (in < end && *in >= '0' && *in < '8') {
205                                 code = (code << 3) + ((*in) - '0');  // code * 8 + d
206                                 i++;
207                                 in++;
208                                 if (i == 3) {
209                                     if (hasOut) { *out = code & 0xff; }
210                                     out++;
211                                     i = 0;
212                                 }
213                             }
214                             if (i > 0) {
215                                 if (hasOut) { *out = code & 0xff; }
216                                 out++;
217                             }
218                         }
219                         break;
220 
221                     default:
222                         // Per spec, backslash is ignored if escaped ch is unknown
223                         in++;
224                         break;
225                 }
226             } else {
227                 in++;
228             }
229         } else {
230             if (hasOut) { *out = *in; }
231             in++;
232             out++;
233         }
234     }
235 
236     if (hasOut) {
237         return in;  // consumed already ) at the end of the string
238     } else {
239         // return where the string would end if we reuse the string
240         return start + (out - (const unsigned char*)NULL);
241     }
242 }
243 
readStringLength(const unsigned char * start,const unsigned char * end)244 static size_t readStringLength(const unsigned char* start, const unsigned char* end) {
245     return readString(start, end, NULL) - start;
246 }
247 
readString(const unsigned char * start,const unsigned char * end,SkPdfNativeObject * str,SkPdfAllocator * allocator)248 static const unsigned char* readString(const unsigned char* start, const unsigned char* end,
249                                        SkPdfNativeObject* str, SkPdfAllocator* allocator) {
250     if (!allocator) {
251         // TODO(edisonn): report error/warn/assert
252         return end;
253     }
254 
255     size_t outLength = readStringLength(start, end);
256     unsigned char* out = (unsigned char*)allocator->alloc(outLength);
257     const unsigned char* now = readString(start, end, out);
258     SkPdfNativeObject::makeString(out, out + outLength, str);
259     //  PUT_TRACK_STREAM(str, start, now)
260     TRACE_STRING(out, out + outLength);
261     return now;  // consumed already ) at the end of the string
262 }
263 
readHexString(const unsigned char * start,const unsigned char * end,unsigned char * out)264 static const unsigned char* readHexString(const unsigned char* start, const unsigned char* end,
265                                           unsigned char* out) {
266     bool hasOut = (out != NULL);
267     const unsigned char* in = start;
268 
269     unsigned char code = 0;
270 
271     while (in < end) {
272         while (in < end && isPdfWhiteSpace(*in)) {
273             in++;
274         }
275 
276         if (*in == kClosedInequityBracket_PdfDelimiter) {
277             in++;  // consume >
278             // normal exit
279             break;
280         }
281 
282         if (in >= end) {
283             // end too soon
284             break;
285         }
286 
287         switch (*in) {
288             case '0':
289             case '1':
290             case '2':
291             case '3':
292             case '4':
293             case '5':
294             case '6':
295             case '7':
296             case '8':
297             case '9':
298                 code = (*in - '0') << 4;
299                 break;
300 
301             case 'a':
302             case 'b':
303             case 'c':
304             case 'd':
305             case 'e':
306             case 'f':
307                 code = (*in - 'a' + 10) << 4;
308                 break;
309 
310             case 'A':
311             case 'B':
312             case 'C':
313             case 'D':
314             case 'E':
315             case 'F':
316                 code = (*in - 'A' + 10) << 4;
317                 break;
318 
319             // TODO(edisonn): spec does not say how to handle this error
320             default:
321                 break;
322         }
323 
324         in++;  // advance
325 
326         while (in < end && isPdfWhiteSpace(*in)) {
327             in++;
328         }
329 
330         // TODO(edisonn): report error
331         if (in >= end) {
332             if (hasOut) { *out = code; }
333             out++;
334             break;
335         }
336 
337         if (*in == kClosedInequityBracket_PdfDelimiter) {
338             if (hasOut) { *out = code; }
339             out++;
340             in++;
341             break;
342         }
343 
344         switch (*in) {
345             case '0':
346             case '1':
347             case '2':
348             case '3':
349             case '4':
350             case '5':
351             case '6':
352             case '7':
353             case '8':
354             case '9':
355                 code += (*in - '0');
356                 break;
357 
358             case 'a':
359             case 'b':
360             case 'c':
361             case 'd':
362             case 'e':
363             case 'f':
364                 code += (*in - 'a' + 10);
365                 break;
366 
367             case 'A':
368             case 'B':
369             case 'C':
370             case 'D':
371             case 'E':
372             case 'F':
373                 code += (*in - 'A' + 10);
374                 break;
375 
376             // TODO(edisonn): spec does not say how to handle this error
377             default:
378                 break;
379         }
380 
381         if (hasOut) { *out = code; }
382         out++;
383         in++;
384     }
385 
386     if (hasOut) {
387         return in;  // consumed already ) at the end of the string
388     } else {
389         // return where the string would end if we reuse the string
390         return start + (out - (const unsigned char*)NULL);
391     }
392 }
393 
readHexStringLength(const unsigned char * start,const unsigned char * end)394 static size_t readHexStringLength(const unsigned char* start, const unsigned char* end) {
395     return readHexString(start, end, NULL) - start;
396 }
397 
readHexString(const unsigned char * start,const unsigned char * end,SkPdfNativeObject * str,SkPdfAllocator * allocator)398 static const unsigned char* readHexString(const unsigned char* start, const unsigned char* end, SkPdfNativeObject* str, SkPdfAllocator* allocator) {
399     if (!allocator) {
400         // TODO(edisonn): report error/warn/assert
401         return end;
402     }
403     size_t outLength = readHexStringLength(start, end);
404     unsigned char* out = (unsigned char*)allocator->alloc(outLength);
405     const unsigned char* now = readHexString(start, end, out);
406     SkPdfNativeObject::makeHexString(out, out + outLength, str);
407     // str PUT_TRACK_STREAM(start, now)
408     TRACE_HEXSTRING(out, out + outLength);
409     return now;  // consumed already > at the end of the string
410 }
411 
412 // TODO(edisonn): add version parameter, before PDF 1.2 name could not have special characters.
readName(const unsigned char * start,const unsigned char * end,unsigned char * out)413 static const unsigned char* readName(const unsigned char* start, const unsigned char* end,
414                                      unsigned char* out) {
415     bool hasOut = (out != NULL);
416     const unsigned char* in = start;
417 
418     unsigned char code = 0;
419 
420     while (in < end) {
421         if (isPdfWhiteSpaceOrPdfDelimiter(*in)) {
422             break;
423         }
424 
425         if (*in == '#' && in + 2 < end) {
426             in++;
427             switch (*in) {
428                 case '0':
429                 case '1':
430                 case '2':
431                 case '3':
432                 case '4':
433                 case '5':
434                 case '6':
435                 case '7':
436                 case '8':
437                 case '9':
438                     code = (*in - '0') << 4;
439                     break;
440 
441                 case 'a':
442                 case 'b':
443                 case 'c':
444                 case 'd':
445                 case 'e':
446                 case 'f':
447                     code = (*in - 'a' + 10) << 4;
448                     break;
449 
450                 case 'A':
451                 case 'B':
452                 case 'C':
453                 case 'D':
454                 case 'E':
455                 case 'F':
456                     code = (*in - 'A' + 10) << 4;
457                     break;
458 
459                 // TODO(edisonn): spec does not say how to handle this error
460                 default:
461                     break;
462             }
463 
464             in++;  // advance
465 
466             switch (*in) {
467                 case '0':
468                 case '1':
469                 case '2':
470                 case '3':
471                 case '4':
472                 case '5':
473                 case '6':
474                 case '7':
475                 case '8':
476                 case '9':
477                     code += (*in - '0');
478                     break;
479 
480                 case 'a':
481                 case 'b':
482                 case 'c':
483                 case 'd':
484                 case 'e':
485                 case 'f':
486                     code += (*in - 'a' + 10);
487                     break;
488 
489                 case 'A':
490                 case 'B':
491                 case 'C':
492                 case 'D':
493                 case 'E':
494                 case 'F':
495                     code += (*in - 'A' + 10);
496                     break;
497 
498                 // TODO(edisonn): spec does not say how to handle this error
499                 default:
500                     break;
501             }
502 
503             if (hasOut) { *out = code; }
504             out++;
505             in++;
506         } else {
507             if (hasOut) { *out = *in; }
508             out++;
509             in++;
510         }
511     }
512 
513     if (hasOut) {
514         return in;  // consumed already ) at the end of the string
515     } else {
516         // return where the string would end if we reuse the string
517         return start + (out - (const unsigned char*)NULL);
518     }
519 }
520 
readNameLength(const unsigned char * start,const unsigned char * end)521 static size_t readNameLength(const unsigned char* start, const unsigned char* end) {
522     return readName(start, end, NULL) - start;
523 }
524 
readName(const unsigned char * start,const unsigned char * end,SkPdfNativeObject * name,SkPdfAllocator * allocator)525 static const unsigned char* readName(const unsigned char* start, const unsigned char* end,
526                                      SkPdfNativeObject* name, SkPdfAllocator* allocator) {
527     if (!allocator) {
528         // TODO(edisonn): report error/warn/assert
529         return end;
530     }
531     size_t outLength = readNameLength(start, end);
532     unsigned char* out = (unsigned char*)allocator->alloc(outLength);
533     const unsigned char* now = readName(start, end, out);
534     SkPdfNativeObject::makeName(out, out + outLength, name);
535     //PUT_TRACK_STREAM(start, now)
536     TRACE_NAME(out, out + outLength);
537     return now;
538 }
539 
// TODO(edisonn): the pdf spec lets Length be an indirect object defined after the stream.
// That makes for an interesting scenario, where the stream itself contains endstream, together
// with a reference object with the length, but the real length object would be somewhere else.
// It could confuse the parser.
544 /*example:
545 
546 7 0 obj
547 << /length 8 0 R>>
548 stream
549 ...............
550 endstream
551 8 0 obj #we are in stream actually, not a real object
552 << 10 >> #we are in stream actually, not a real object
553 endobj
554 endstream
555 8 0 obj #real obj
556 << 100 >> #real obj
557 endobj
558 and it could get worse, with multiple object like this
559 */
560 
561 // right now implement the silly algorithm that assumes endstream is finishing the stream
562 
readStream(const unsigned char * start,const unsigned char * end,SkPdfNativeObject * dict,SkPdfNativeDoc * doc)563 static const unsigned char* readStream(const unsigned char* start, const unsigned char* end,
564                                        SkPdfNativeObject* dict, SkPdfNativeDoc* doc) {
565     start = skipPdfWhiteSpaces(start, end);
566     if (!(  start[0] == 's' &&
567             start[1] == 't' &&
568             start[2] == 'r' &&
569             start[3] == 'e' &&
570             start[4] == 'a' &&
571             start[5] == 'm')) {
572         // no stream. return.
573         return start;
574     }
575 
576     start += 6; // strlen("stream")
577     if (start[0] == kCR_PdfWhiteSpace && start[1] == kLF_PdfWhiteSpace) {
578         start += 2;
579     } else if (start[0] == kLF_PdfWhiteSpace) {
580         start += 1;
581     } else if (isPdfWhiteSpace(start[0])) {
582         start += 1;
583     } else {
584         // TODO(edisonn): warn it should be isPdfDelimiter(start[0])) ?
585     }
586 
587     SkPdfStreamCommonDictionary* stream = (SkPdfStreamCommonDictionary*) dict;
588     // TODO(edisonn): load Length
589     int64_t length = -1;
590 
591     // TODO(edisonn): very basic implementation
592     if (stream->has_Length() && stream->Length(doc) > 0) {
593         length = stream->Length(doc);
594     }
595 
596     // TODO(edisonn): load external streams
597     // TODO(edisonn): look at the last filter, to determine how to deal with possible parsing
598     // issues. The last filter can have special rules to terminate a stream, which we could
599     // use to determine end of stream.
600 
601     if (length >= 0) {
602         const unsigned char* endstream = start + length;
603 
604         if (endstream[0] == kCR_PdfWhiteSpace && endstream[1] == kLF_PdfWhiteSpace) {
605             endstream += 2;
606         } else if (endstream[0] == kLF_PdfWhiteSpace) {
607             endstream += 1;
608         }
609 
610         if (strncmp((const char*)endstream, "endstream", strlen("endstream")) != 0) {
611             length = -1;
612         }
613     }
614 
615     if (length < 0) {
616         // scan the buffer, until we find first endstream
617         // TODO(edisonn): all buffers must have a 0 at the end now,
618         const unsigned char* endstream = (const unsigned char*)strrstrk((char*)start, (char*)end,
619                                                                         "endstream");
620 
621         if (endstream) {
622             length = endstream - start;
623             if (*(endstream-1) == kLF_PdfWhiteSpace) length--;
624             if (*(endstream-2) == kCR_PdfWhiteSpace) length--;
625         }
626     }
627     if (length >= 0) {
628         const unsigned char* endstream = start + length;
629 
630         if (endstream[0] == kCR_PdfWhiteSpace && endstream[1] == kLF_PdfWhiteSpace) {
631             endstream += 2;
632         } else if (endstream[0] == kLF_PdfWhiteSpace) {
633             endstream += 1;
634         }
635 
636         // TODO(edisonn): verify the next bytes are "endstream"
637 
638         endstream += strlen("endstream");
639         // TODO(edisonn): Assert? report error/warning?
640         dict->addStream(start, (size_t)length);
641         return endstream;
642     }
643     return start;
644 }
645 
readInlineImageStream(const unsigned char * start,const unsigned char * end,SkPdfImageDictionary * inlineImage,SkPdfNativeDoc * doc)646 static const unsigned char* readInlineImageStream(const unsigned char* start,
647                                                   const unsigned char* end,
648                                                   SkPdfImageDictionary* inlineImage,
649                                                   SkPdfNativeDoc* doc) {
650     // We already processed ID keyword, and we should be positioned immediately after it
651 
652     // TODO(edisonn): security: either make all streams to have extra 2 bytes at the end,
653     // instead of this if.
654     //if (end - start <= 2) {
655     //    // TODO(edisonn): warning?
656     //    return end; // but can we have a pixel image encoded in 1-2 bytes?
657     //}
658 
659     if (start[0] == kCR_PdfWhiteSpace && start[1] == kLF_PdfWhiteSpace) {
660         start += 2;
661     } else if (start[0] == kLF_PdfWhiteSpace) {
662         start += 1;
663     } else if (isPdfWhiteSpace(start[0])) {
664         start += 1;
665     } else {
666         SkASSERT(isPdfDelimiter(start[0]));
667         // TODO(edisonn): warning?
668     }
669 
670     const unsigned char* endstream = (const unsigned char*)strrstrk((char*)start, (char*)end, "EI");
671     const unsigned char* endEI = endstream ? endstream + 2 : NULL;  // 2 == strlen("EI")
672 
673     if (endstream) {
674         size_t length = endstream - start;
675         if (*(endstream-1) == kLF_PdfWhiteSpace) length--;
676         if (*(endstream-2) == kCR_PdfWhiteSpace) length--;
677         inlineImage->addStream(start, (size_t)length);
678     } else {
679         // TODO(edisonn): report error in inline image stream (ID-EI) section
680         // TODO(edisonn): based on filter, try to ignore a missing EI, and read data properly
681         return end;
682     }
683     return endEI;
684 }
685 
readDictionary(const unsigned char * start,const unsigned char * end,SkPdfNativeObject * dict,SkPdfAllocator * allocator,SkPdfNativeDoc * doc)686 static const unsigned char* readDictionary(const unsigned char* start, const unsigned char* end,
687                                            SkPdfNativeObject* dict,
688                                            SkPdfAllocator* allocator, SkPdfNativeDoc* doc) {
689     if (allocator == NULL) {
690         // TODO(edisonn): report/warning error
691         return end;
692     }
693     SkPdfNativeObject::makeEmptyDictionary(dict);
694     // PUT_TRACK_STREAM(dict, start, start)
695 
696     start = skipPdfWhiteSpaces(start, end);
697     SkPdfAllocator tmpStorage;  // keys will be stored in dict, we can free them after set.
698 
699     while (start < end && *start == kNamed_PdfDelimiter) {
700         SkPdfNativeObject key;
701         //*start = '\0';
702         start++;
703         start = readName(start, end, &key, &tmpStorage);
704         start = skipPdfWhiteSpaces(start, end);
705 
706         if (start < end) {
707             SkPdfNativeObject* value = allocator->allocObject();
708             start = nextObject(start, end, value, allocator, doc);
709 
710             start = skipPdfWhiteSpaces(start, end);
711 
712             if (start < end) {
713                 // We should have an indirect reference
714                 if (isPdfDigit(*start)) {
715                     SkPdfNativeObject generation;
716                     start = nextObject(start, end, &generation, allocator, doc);
717 
718                     SkPdfNativeObject keywordR;
719                     start = nextObject(start, end, &keywordR, allocator, doc);
720 
721                     if (value->isInteger() && generation.isInteger() &&
722                             keywordR.isKeywordReference()) {
723                         int64_t id = value->intValue();
724                         SkPdfNativeObject::resetAndMakeReference(
725                                 (unsigned int)id,
726                                 (unsigned int)generation.intValue(),
727                                 value);
728                         //  PUT_TRACK_PARAMETERS_OBJ2(value, &generation)
729                         dict->set(&key, value);
730                     } else {
731                         // TODO(edisonn) error?, ignore it for now.
732                         dict->set(&key, value);
733                     }
734                 } else {
735                     // next elem is not a digit, but it might not be / either!
736                     dict->set(&key, value);
737                 }
738             } else {
739                 // /key >>
740                 dict->set(&key, value);
741                 return end;
742             }
743             start = skipPdfWhiteSpaces(start, end);
744         } else {
745             dict->set(&key, &SkPdfNativeObject::kNull);
746             return end;
747         }
748     }
749 
750     // now we should expect >>
751     start = skipPdfWhiteSpaces(start, end);
752     if (*start != kClosedInequityBracket_PdfDelimiter) {
753         // TODO(edisonn): report/warning
754     }
755 
756     start++;  // skip >
757     if (*start != kClosedInequityBracket_PdfDelimiter) {
758         // TODO(edisonn): report/warning
759     }
760 
761     start++;  // skip >
762 
763     //STORE_TRACK_PARAMETER_OFFSET_END(dict,start);
764 
765     start = readStream(start, end, dict, doc);
766 
767     return start;
768 }
769 
nextObject(const unsigned char * start,const unsigned char * end,SkPdfNativeObject * token,SkPdfAllocator * allocator,SkPdfNativeDoc * doc)770 const unsigned char* nextObject(const unsigned char* start, const unsigned char* end,
771                                 SkPdfNativeObject* token,
772                                 SkPdfAllocator* allocator, SkPdfNativeDoc* doc) {
773     const unsigned char* current;
774 
775     // skip white spaces
776     start = skipPdfWhiteSpaces(start, end);
777 
778     if (start >= end) {
779         return end;
780     }
781 
782     current = endOfPdfToken(start, end);
783 
784     // no token, len would be 0
785     if (current == start || current == end) {
786         return end;
787     }
788 
789     size_t tokenLen = current - start;
790 
791     if (tokenLen == 1) {
792         // start array
793         switch (*start) {
794             case kOpenedSquareBracket_PdfDelimiter:
795                 return readArray(current, end, token, allocator, doc);
796 
797             case kOpenedRoundBracket_PdfDelimiter:
798                 return readString(start + 1, end, token, allocator);
799 
800             case kOpenedInequityBracket_PdfDelimiter:
801                 if (end > start + 1 && start[1] == kOpenedInequityBracket_PdfDelimiter) {
802                     // TODO(edisonn): pass here the length somehow?
803                     return readDictionary(start + 2, end, token, allocator, doc);  // skip <<
804                 } else {
805                     return readHexString(start + 1, end, token, allocator);  // skip <
806                 }
807 
808             case kNamed_PdfDelimiter:
809                 return readName(start + 1, end, token, allocator);
810 
811             // TODO(edisonn): what to do curly brackets?
812             case kOpenedCurlyBracket_PdfDelimiter:
813             default:
814                 break;
815         }
816 
817         SkASSERT(!isPdfWhiteSpace(*start));
818         if (isPdfDelimiter(*start)) {
819             // TODO(edisonn): how unexpected stream ] } > ) will be handled?
820             // for now ignore, and it will become a keyword to be ignored
821         }
822     }
823 
824     if (tokenLen == 4 && start[0] == 'n' && start[1] == 'u' && start[2] == 'l' && start[3] == 'l') {
825         SkPdfNativeObject::makeNull(token);
826         // PUT_TRACK_STREAM(start, start + 4)
827         return current;
828     }
829 
830     if (tokenLen == 4 && start[0] == 't' && start[1] == 'r' && start[2] == 'u' && start[3] == 'e') {
831         SkPdfNativeObject::makeBoolean(true, token);
832         // PUT_TRACK_STREAM(start, start + 4)
833         return current;
834     }
835 
836     // TODO(edisonn): again, make all buffers have 5 extra bytes
837     if (tokenLen == 5 && start[0] == 'f' &&
838                          start[1] == 'a' &&
839                          start[2] == 'l' &&
840                          start[3] == 's' &&
841                          start[4] == 'e') {
842         SkPdfNativeObject::makeBoolean(false, token);
843         // PUT_TRACK_STREAM(start, start + 5)
844         return current;
845     }
846 
847     if (isPdfNumeric(*start)) {
848         SkPdfNativeObject::makeNumeric(start, current, token);
849         //  PUT_TRACK_STREAM(start, current)
850     } else {
851         SkPdfNativeObject::makeKeyword(start, current, token);
852         // PUT_TRACK_STREAM(start, current)
853     }
854     return current;
855 }
856 
allocBlock()857 SkPdfNativeObject* SkPdfAllocator::allocBlock() {
858     fSizeInBytes += BUFFER_SIZE * sizeof(SkPdfNativeObject);
859     return new SkPdfNativeObject[BUFFER_SIZE];
860 }
861 
~SkPdfAllocator()862 SkPdfAllocator::~SkPdfAllocator() {
863     for (int i = 0 ; i < fHandles.count(); i++) {
864         free(fHandles[i]);
865     }
866     for (int i = 0 ; i < fHistory.count(); i++) {
867         for (int j = 0 ; j < BUFFER_SIZE; j++) {
868             fHistory[i][j].reset();
869         }
870         delete[] fHistory[i];
871     }
872     for (int j = 0 ; j < BUFFER_SIZE; j++) {
873         fCurrent[j].reset();
874     }
875     delete[] fCurrent;
876 }
877 
allocObject()878 SkPdfNativeObject* SkPdfAllocator::allocObject() {
879     if (fCurrentUsed >= BUFFER_SIZE) {
880         fHistory.push(fCurrent);
881         fCurrent = allocBlock();
882         fCurrentUsed = 0;
883         fSizeInBytes += sizeof(SkPdfNativeObject*);
884     }
885     fCurrentUsed++;
886     return &fCurrent[fCurrentUsed - 1];
887 }
888 
889 // TODO(edisonn): perf: do no copy the buffers, but reuse them, and mark cache the result,
890 // so there is no need of a second pass
SkPdfNativeTokenizer(SkPdfNativeObject * objWithStream,SkPdfAllocator * allocator,SkPdfNativeDoc * doc)891 SkPdfNativeTokenizer::SkPdfNativeTokenizer(SkPdfNativeObject* objWithStream,
892                                            SkPdfAllocator* allocator,
893                                            SkPdfNativeDoc* doc)
894             : fDoc(doc)
895             , fAllocator(allocator)
896             , fUncompressedStream(NULL)
897             , fUncompressedStreamEnd(NULL)
898             , fEmpty(false)
899             , fHasPutBack(false) {
900     const unsigned char* buffer = NULL;
901     size_t len = 0;
902     objWithStream->GetFilteredStreamRef(&buffer, &len);
903     // TODO(edisonn): really bad hack, find end of object (endobj might be in a comment!)
904     // we need to do now for perf, and our generated pdfs do not have comments,
905     // but we need to remove this hack for pdfs in the wild
906     char* endobj = strrstrk((char*)buffer, (char*)buffer + len, "endobj");
907     if (endobj) {
908         len = endobj - (char*)buffer + strlen("endobj");
909     }
910     fUncompressedStreamStart = fUncompressedStream = buffer;
911     fUncompressedStreamEnd = fUncompressedStream + len;
912 }
913 
SkPdfNativeTokenizer(const unsigned char * buffer,int len,SkPdfAllocator * allocator,SkPdfNativeDoc * doc)914 SkPdfNativeTokenizer::SkPdfNativeTokenizer(const unsigned char* buffer, int len,
915                                            SkPdfAllocator* allocator,
916                                            SkPdfNativeDoc* doc) : fDoc(doc)
917                                                                 , fAllocator(allocator)
918                                                                 , fEmpty(false)
919                                                                 , fHasPutBack(false) {
920     // TODO(edisonn): really bad hack, find end of object (endobj might be in a comment!)
921     // we need to do now for perf, and our generated pdfs do not have comments,
922     // but we need to remove this hack for pdfs in the wild
923     char* endobj = strrstrk((char*)buffer, (char*)buffer + len, "endobj");
924     if (endobj) {
925         len = SkToInt(endobj - (char*)buffer + strlen("endobj"));
926     }
927     fUncompressedStreamStart = fUncompressedStream = buffer;
928     fUncompressedStreamEnd = fUncompressedStream + len;
929 }
930 
~SkPdfNativeTokenizer()931 SkPdfNativeTokenizer::~SkPdfNativeTokenizer() {
932 }
933 
readTokenCore(PdfToken * token)934 bool SkPdfNativeTokenizer::readTokenCore(PdfToken* token) {
935 #ifdef PDF_TRACE_READ_TOKEN
936     static int read_op = 0;
937 #endif
938 
939     token->fKeyword = NULL;
940     token->fObject = NULL;
941 
942     fUncompressedStream = skipPdfWhiteSpaces(fUncompressedStream, fUncompressedStreamEnd);
943     if (fUncompressedStream >= fUncompressedStreamEnd) {
944         fEmpty = true;
945         return false;
946     }
947 
948     SkPdfNativeObject obj;
949     fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, &obj, fAllocator, fDoc);
950     //  PUT_TRACK_STREAM_ARGS_EXPL2(fStreamId, fUncompressedStreamStart)
951 
952     // If it is a keyword, we will only get the pointer of the string.
953     if (obj.type() == SkPdfNativeObject::kKeyword_PdfObjectType) {
954         token->fKeyword = obj.c_str();
955         token->fKeywordLength = obj.lenstr();
956         token->fType = kKeyword_TokenType;
957     } else {
958         SkPdfNativeObject* pobj = fAllocator->allocObject();
959         *pobj = obj;
960         token->fObject = pobj;
961         token->fType = kObject_TokenType;
962     }
963 
964 #ifdef PDF_TRACE_READ_TOKEN
965     read_op++;
966 #if 0
967     if (548 == read_op) {
968         printf("break;\n");
969     }
970 #endif
971     printf("%i READ %s %s\n", read_op, token->fType == kKeyword_TokenType ? "Keyword" : "Object",
972            token->fKeyword ? SkString(token->fKeyword, token->fKeywordLength).c_str() :
973                              token->fObject->toString().c_str());
974 #endif
975 
976     return true;
977 }
978 
PutBack(PdfToken token)979 void SkPdfNativeTokenizer::PutBack(PdfToken token) {
980     SkASSERT(!fHasPutBack);
981     fHasPutBack = true;
982     fPutBack = token;
983 #ifdef PDF_TRACE_READ_TOKEN
984     printf("PUT_BACK %s %s\n", token.fType == kKeyword_TokenType ? "Keyword" : "Object",
985            token.fKeyword ? SkString(token.fKeyword, token.fKeywordLength).c_str() :
986                             token.fObject->toString().c_str());
987 #endif
988 }
989 
readToken(PdfToken * token,bool writeDiff)990 bool SkPdfNativeTokenizer::readToken(PdfToken* token, bool writeDiff) {
991     if (fHasPutBack) {
992         *token = fPutBack;
993         fHasPutBack = false;
994 #ifdef PDF_TRACE_READ_TOKEN
995         printf("READ_BACK %s %s\n", token->fType == kKeyword_TokenType ? "Keyword" : "Object",
996                token->fKeyword ? SkString(token->fKeyword, token->fKeywordLength).c_str() :
997                                  token->fObject->toString().c_str());
998 #endif
999         if (writeDiff) {
1000             SkPdfDiffEncoder::WriteToFile(token);
1001         }
1002         return true;
1003     }
1004 
1005     if (fEmpty) {
1006 #ifdef PDF_TRACE_READ_TOKEN
1007         printf("EMPTY TOKENIZER\n");
1008 #endif
1009         return false;
1010     }
1011 
1012     const bool result = readTokenCore(token);
1013     if (result && writeDiff) {
1014         SkPdfDiffEncoder::WriteToFile(token);
1015     }
1016     return result;
1017 }
1018 
1019 #define DECLARE_PDF_NAME(longName) SkPdfName longName((char*)#longName)
1020 
1021 // keys
1022 DECLARE_PDF_NAME(BitsPerComponent);
1023 DECLARE_PDF_NAME(ColorSpace);
1024 DECLARE_PDF_NAME(Decode);
1025 DECLARE_PDF_NAME(DecodeParms);
1026 DECLARE_PDF_NAME(Filter);
1027 DECLARE_PDF_NAME(Height);
1028 DECLARE_PDF_NAME(ImageMask);
1029 DECLARE_PDF_NAME(Intent); // PDF 1.1 - the key, or the abBreviations?
1030 DECLARE_PDF_NAME(Interpolate);
1031 DECLARE_PDF_NAME(Width);
1032 
1033 // values
1034 DECLARE_PDF_NAME(DeviceGray);
1035 DECLARE_PDF_NAME(DeviceRGB);
1036 DECLARE_PDF_NAME(DeviceCMYK);
1037 DECLARE_PDF_NAME(Indexed);
1038 DECLARE_PDF_NAME(ASCIIHexDecode);
1039 DECLARE_PDF_NAME(ASCII85Decode);
1040 DECLARE_PDF_NAME(LZWDecode);
1041 DECLARE_PDF_NAME(FlateDecode);  // PDF 1.2
1042 DECLARE_PDF_NAME(RunLengthDecode);
1043 DECLARE_PDF_NAME(CCITTFaxDecode);
1044 DECLARE_PDF_NAME(DCTDecode);
1045 
// Returns the address of `longName` from the enclosing function when `obj`
// is the name spelled `shortName`; otherwise falls through to the next statement.
//
// The body is wrapped in do { } while (0) so that the expansion together with
// the caller's trailing ';' forms exactly one statement, which keeps the macro
// safe inside an unbraced if/else. `obj` is parenthesized so that any pointer
// expression can be passed. Always terminate the invocation with ';'.
#define HANDLE_NAME_ABBR(obj,longName,shortName) \
    do { if ((obj)->isName(#shortName)) return &longName; } while (0)
1047 
1048 
inlineImageKeyAbbreviationExpand(SkPdfNativeObject * key)1049 static SkPdfNativeObject* inlineImageKeyAbbreviationExpand(SkPdfNativeObject* key) {
1050     if (!key || !key->isName()) {
1051         return key;
1052     }
1053 
1054     // TODO(edisonn): use autogenerated code!
1055     HANDLE_NAME_ABBR(key, BitsPerComponent, BPC);
1056     HANDLE_NAME_ABBR(key, ColorSpace, CS);
1057     HANDLE_NAME_ABBR(key, Decode, D);
1058     HANDLE_NAME_ABBR(key, DecodeParms, DP);
1059     HANDLE_NAME_ABBR(key, Filter, F);
1060     HANDLE_NAME_ABBR(key, Height, H);
1061     HANDLE_NAME_ABBR(key, ImageMask, IM);
1062 //    HANDLE_NAME_ABBR(key, Intent, );
1063     HANDLE_NAME_ABBR(key, Interpolate, I);
1064     HANDLE_NAME_ABBR(key, Width, W);
1065 
1066     return key;
1067 }
1068 
inlineImageValueAbbreviationExpand(SkPdfNativeObject * value)1069 static SkPdfNativeObject* inlineImageValueAbbreviationExpand(SkPdfNativeObject* value) {
1070     if (!value || !value->isName()) {
1071         return value;
1072     }
1073 
1074     // TODO(edisonn): use autogenerated code!
1075     HANDLE_NAME_ABBR(value, DeviceGray, G);
1076     HANDLE_NAME_ABBR(value, DeviceRGB, RGB);
1077     HANDLE_NAME_ABBR(value, DeviceCMYK, CMYK);
1078     HANDLE_NAME_ABBR(value, Indexed, I);
1079     HANDLE_NAME_ABBR(value, ASCIIHexDecode, AHx);
1080     HANDLE_NAME_ABBR(value, ASCII85Decode, A85);
1081     HANDLE_NAME_ABBR(value, LZWDecode, LZW);
1082     HANDLE_NAME_ABBR(value, FlateDecode, Fl);  // (PDF 1.2)
1083     HANDLE_NAME_ABBR(value, RunLengthDecode, RL);
1084     HANDLE_NAME_ABBR(value, CCITTFaxDecode, CCF);
1085     HANDLE_NAME_ABBR(value, DCTDecode, DCT);
1086 
1087     return value;
1088 }
1089 
readInlineImage()1090 SkPdfImageDictionary* SkPdfNativeTokenizer::readInlineImage() {
1091     // BI already processed
1092     fUncompressedStream = skipPdfWhiteSpaces(fUncompressedStream, fUncompressedStreamEnd);
1093     if (fUncompressedStream >= fUncompressedStreamEnd) {
1094         return NULL;
1095     }
1096 
1097     SkPdfImageDictionary* inlineImage = (SkPdfImageDictionary*)fAllocator->allocObject();
1098     SkPdfNativeObject::makeEmptyDictionary(inlineImage);
1099     //  PUT_TRACK_STREAM_ARGS_EXPL(fStreamId, fUncompressedStream - fUncompressedStreamStart,
1100     //                             fUncompressedStream - fUncompressedStreamStart)
1101 
1102     while (fUncompressedStream < fUncompressedStreamEnd) {
1103         SkPdfNativeObject* key = fAllocator->allocObject();
1104         fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, key,
1105                                          fAllocator, fDoc);
1106         // PUT_TRACK_STREAM_ARGS_EXPL2(fStreamId, fUncompressedStreamStart)s
1107 
1108         if (key->isKeyword() && key->lenstr() == 2 &&
1109                     key->c_str()[0] == 'I' && key->c_str()[1] == 'D') { // ID
1110             fUncompressedStream = readInlineImageStream(fUncompressedStream, fUncompressedStreamEnd,
1111                                                         inlineImage, fDoc);
1112             return inlineImage;
1113         } else {
1114             SkPdfNativeObject* obj = fAllocator->allocObject();
1115             fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, obj,
1116                                              fAllocator, fDoc);
1117             //  PUT_TRACK_STREAM_ARGS_EXPL2(fStreamId, fUncompressedStreamStart)s
1118             // TODO(edisonn): perf maybe we should not expand abBreviation like this
1119             inlineImage->set(inlineImageKeyAbbreviationExpand(key),
1120                              inlineImageValueAbbreviationExpand(obj));
1121         }
1122     }
1123     // TODO(edisonn): report end of data with inline image without an EI
1124     return inlineImage;
1125 }
1126