1/*
2 * Copyright 2013 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkPdfConfig.h"
9#include "SkPdfDiffEncoder.h"
10#include "SkPdfNativeObject.h"
11#include "SkPdfNativeTokenizer.h"
12#include "SkPdfUtils.h"
13
14// TODO(edisonn): mac builder does not find the header ... but from headers is ok
15//#include "SkPdfStreamCommonDictionary_autogen.h"
16//#include "SkPdfImageDictionary_autogen.h"
17#include "SkPdfHeaders_autogen.h"
18
19
20// TODO(edisonn): Perf, Make this function run faster.
21// There could be 0s between start and end.
22// needle will not contain 0s.
23static char* strrstrk(char* hayStart, char* hayEnd, const char* needle) {
24    int needleLen = strlen(needle);
25    if ((isPdfWhiteSpaceOrPdfDelimiter(*(hayStart+needleLen)) || (hayStart+needleLen == hayEnd)) &&
26            strncmp(hayStart, needle, needleLen) == 0) {
27        return hayStart;
28    }
29
30    hayStart++;
31
32    while (hayStart < hayEnd) {
33        if (isPdfWhiteSpaceOrPdfDelimiter(*(hayStart-1)) &&
34                (isPdfWhiteSpaceOrPdfDelimiter(*(hayStart+needleLen)) ||
35                      (hayStart+needleLen == hayEnd)) &&
36                strncmp(hayStart, needle, needleLen) == 0) {
37            return hayStart;
38        }
39        hayStart++;
40    }
41    return NULL;
42}
43
44const unsigned char* skipPdfWhiteSpaces(const unsigned char* start, const unsigned char* end) {
45    while (start < end && (isPdfWhiteSpace(*start) || *start == kComment_PdfDelimiter)) {
46        TRACE_COMMENT(*start);
47        if (*start == kComment_PdfDelimiter) {
48            // skip the comment until end of line
49            while (start < end && !isPdfEOL(*start)) {
50                start++;
51                TRACE_COMMENT(*start);
52            }
53        } else {
54            start++;
55        }
56    }
57    return start;
58}
59
60const unsigned char* endOfPdfToken(const unsigned char* start, const unsigned char* end) {
61    SkASSERT(!isPdfWhiteSpace(*start));
62
63    if (start < end && isPdfDelimiter(*start)) {
64        TRACE_TK(*start);
65        start++;
66        return start;
67    }
68
69    while (start < end && !isPdfWhiteSpaceOrPdfDelimiter(*start)) {
70        TRACE_TK(*start);
71        start++;
72    }
73    return start;
74}
75
76// The parsing should end with a ].
77static const unsigned char* readArray(const unsigned char* start, const unsigned char* end,
78                                      SkPdfNativeObject* array,
79                                      SkPdfAllocator* allocator, SkPdfNativeDoc* doc) {
80    SkPdfNativeObject::makeEmptyArray(array);
81    // PUT_TRACK_STREAM(array, start, start)
82
83    if (allocator == NULL) {
84        // TODO(edisonn): report/warning error/assert
85        return end;
86    }
87
88    while (start < end) {
89        // skip white spaces
90        start = skipPdfWhiteSpaces(start, end);
91
92        const unsigned char* endOfToken = endOfPdfToken(start, end);
93
94        if (endOfToken == start) {
95            // TODO(edisonn): report error in pdf file (end of stream with ] for end of aray
96            return start;
97        }
98
99        if (endOfToken == start + 1 && *start == kClosedSquareBracket_PdfDelimiter) {
100            return endOfToken;
101        }
102
103        SkPdfNativeObject* newObj = allocator->allocObject();
104        start = nextObject(start, end, newObj, allocator, doc);
105        // TODO(edisonn): perf/memory: put the variables on the stack, and flush them on the array
106        // only when we are sure they are not references!
107        if (newObj->isKeywordReference() && array->size() >= 2 &&
108                array->objAtAIndex(array->size() - 1)->isInteger() &&
109                array->objAtAIndex(array->size() - 2)->isInteger()) {
110            SkPdfNativeObject* gen = array->removeLastInArray();
111            SkPdfNativeObject* id = array->removeLastInArray();
112
113            SkPdfNativeObject::resetAndMakeReference((unsigned int)id->intValue(),
114                                                     (unsigned int)gen->intValue(), newObj);
115            // newObj  PUT_TRACK_PARAMETERS_OBJ2(id, newObj) - store end, as now
116        }
117        array->appendInArray(newObj);
118    }
119    // TODO(edisonn): report not reached, we should never get here
120    // TODO(edisonn): there might be a bug here, enable an assert and run it on files
121    // or it might be that the files were actually corrupted
122    return start;
123}
124
125static const unsigned char* readString(const unsigned char* start, const unsigned char* end,
126                                       unsigned char* out) {
127    const unsigned char* in = start;
128    bool hasOut = (out != NULL);
129
130    int openRoundBrackets = 1;
131    while (in < end) {
132        openRoundBrackets += ((*in) == kOpenedRoundBracket_PdfDelimiter);
133        openRoundBrackets -= ((*in) == kClosedRoundBracket_PdfDelimiter);
134        if (openRoundBrackets == 0) {
135            in++;   // consumed )
136            break;
137        }
138
139        if (*in == kEscape_PdfSpecial) {
140            if (in + 1 < end) {
141                switch (in[1]) {
142                    case 'n':
143                        if (hasOut) { *out = kLF_PdfWhiteSpace; }
144                        out++;
145                        in += 2;
146                        break;
147
148                    case 'r':
149                        if (hasOut) { *out = kCR_PdfWhiteSpace; }
150                        out++;
151                        in += 2;
152                        break;
153
154                    case 't':
155                        if (hasOut) { *out = kHT_PdfWhiteSpace; }
156                        out++;
157                        in += 2;
158                        break;
159
160                    case 'b':
161                        // TODO(edisonn): any special meaning to backspace?
162                        if (hasOut) { *out = kBackspace_PdfSpecial; }
163                        out++;
164                        in += 2;
165                        break;
166
167                    case 'f':
168                        if (hasOut) { *out = kFF_PdfWhiteSpace; }
169                        out++;
170                        in += 2;
171                        break;
172
173                    case kOpenedRoundBracket_PdfDelimiter:
174                        if (hasOut) { *out = kOpenedRoundBracket_PdfDelimiter; }
175                        out++;
176                        in += 2;
177                        break;
178
179                    case kClosedRoundBracket_PdfDelimiter:
180                        if (hasOut) { *out = kClosedRoundBracket_PdfDelimiter; }
181                        out++;
182                        in += 2;
183                        break;
184
185                    case kEscape_PdfSpecial:
186                        if (hasOut) { *out = kEscape_PdfSpecial; }
187                        out++;
188                        in += 2;
189                        break;
190
191                    case '0':
192                    case '1':
193                    case '2':
194                    case '3':
195                    case '4':
196                    case '5':
197                    case '6':
198                    case '7': {
199                            //read octals
200                            in++;   // consume backslash
201
202                            int code = 0;
203                            int i = 0;
204                            while (in < end && *in >= '0' && *in < '8') {
205                                code = (code << 3) + ((*in) - '0');  // code * 8 + d
206                                i++;
207                                in++;
208                                if (i == 3) {
209                                    if (hasOut) { *out = code & 0xff; }
210                                    out++;
211                                    i = 0;
212                                }
213                            }
214                            if (i > 0) {
215                                if (hasOut) { *out = code & 0xff; }
216                                out++;
217                            }
218                        }
219                        break;
220
221                    default:
222                        // Per spec, backslash is ignored if escaped ch is unknown
223                        in++;
224                        break;
225                }
226            } else {
227                in++;
228            }
229        } else {
230            if (hasOut) { *out = *in; }
231            in++;
232            out++;
233        }
234    }
235
236    if (hasOut) {
237        return in;  // consumed already ) at the end of the string
238    } else {
239        // return where the string would end if we reuse the string
240        return start + (out - (const unsigned char*)NULL);
241    }
242}
243
244static int readStringLength(const unsigned char* start, const unsigned char* end) {
245    return readString(start, end, NULL) - start;
246}
247
248static const unsigned char* readString(const unsigned char* start, const unsigned char* end,
249                                       SkPdfNativeObject* str, SkPdfAllocator* allocator) {
250    if (!allocator) {
251        // TODO(edisonn): report error/warn/assert
252        return end;
253    }
254
255    int outLength = readStringLength(start, end);
256    unsigned char* out = (unsigned char*)allocator->alloc(outLength);
257    const unsigned char* now = readString(start, end, out);
258    SkPdfNativeObject::makeString(out, out + outLength, str);
259    //  PUT_TRACK_STREAM(str, start, now)
260    TRACE_STRING(out, out + outLength);
261    return now;  // consumed already ) at the end of the string
262}
263
264static const unsigned char* readHexString(const unsigned char* start, const unsigned char* end,
265                                          unsigned char* out) {
266    bool hasOut = (out != NULL);
267    const unsigned char* in = start;
268
269    unsigned char code = 0;
270
271    while (in < end) {
272        while (in < end && isPdfWhiteSpace(*in)) {
273            in++;
274        }
275
276        if (*in == kClosedInequityBracket_PdfDelimiter) {
277            in++;  // consume >
278            // normal exit
279            break;
280        }
281
282        if (in >= end) {
283            // end too soon
284            break;
285        }
286
287        switch (*in) {
288            case '0':
289            case '1':
290            case '2':
291            case '3':
292            case '4':
293            case '5':
294            case '6':
295            case '7':
296            case '8':
297            case '9':
298                code = (*in - '0') << 4;
299                break;
300
301            case 'a':
302            case 'b':
303            case 'c':
304            case 'd':
305            case 'e':
306            case 'f':
307                code = (*in - 'a' + 10) << 4;
308                break;
309
310            case 'A':
311            case 'B':
312            case 'C':
313            case 'D':
314            case 'E':
315            case 'F':
316                code = (*in - 'A' + 10) << 4;
317                break;
318
319            // TODO(edisonn): spec does not say how to handle this error
320            default:
321                break;
322        }
323
324        in++;  // advance
325
326        while (in < end && isPdfWhiteSpace(*in)) {
327            in++;
328        }
329
330        // TODO(edisonn): report error
331        if (in >= end) {
332            if (hasOut) { *out = code; }
333            out++;
334            break;
335        }
336
337        if (*in == kClosedInequityBracket_PdfDelimiter) {
338            if (hasOut) { *out = code; }
339            out++;
340            in++;
341            break;
342        }
343
344        switch (*in) {
345            case '0':
346            case '1':
347            case '2':
348            case '3':
349            case '4':
350            case '5':
351            case '6':
352            case '7':
353            case '8':
354            case '9':
355                code += (*in - '0');
356                break;
357
358            case 'a':
359            case 'b':
360            case 'c':
361            case 'd':
362            case 'e':
363            case 'f':
364                code += (*in - 'a' + 10);
365                break;
366
367            case 'A':
368            case 'B':
369            case 'C':
370            case 'D':
371            case 'E':
372            case 'F':
373                code += (*in - 'A' + 10);
374                break;
375
376            // TODO(edisonn): spec does not say how to handle this error
377            default:
378                break;
379        }
380
381        if (hasOut) { *out = code; }
382        out++;
383        in++;
384    }
385
386    if (hasOut) {
387        return in;  // consumed already ) at the end of the string
388    } else {
389        // return where the string would end if we reuse the string
390        return start + (out - (const unsigned char*)NULL);
391    }
392}
393
394static int readHexStringLength(const unsigned char* start, const unsigned char* end) {
395    return readHexString(start, end, NULL) - start;
396}
397
398static const unsigned char* readHexString(const unsigned char* start, const unsigned char* end, SkPdfNativeObject* str, SkPdfAllocator* allocator) {
399    if (!allocator) {
400        // TODO(edisonn): report error/warn/assert
401        return end;
402    }
403    int outLength = readHexStringLength(start, end);
404    unsigned char* out = (unsigned char*)allocator->alloc(outLength);
405    const unsigned char* now = readHexString(start, end, out);
406    SkPdfNativeObject::makeHexString(out, out + outLength, str);
407    // str PUT_TRACK_STREAM(start, now)
408    TRACE_HEXSTRING(out, out + outLength);
409    return now;  // consumed already > at the end of the string
410}
411
412// TODO(edisonn): add version parameter, before PDF 1.2 name could not have special characters.
413static const unsigned char* readName(const unsigned char* start, const unsigned char* end,
414                                     unsigned char* out) {
415    bool hasOut = (out != NULL);
416    const unsigned char* in = start;
417
418    unsigned char code = 0;
419
420    while (in < end) {
421        if (isPdfWhiteSpaceOrPdfDelimiter(*in)) {
422            break;
423        }
424
425        if (*in == '#' && in + 2 < end) {
426            in++;
427            switch (*in) {
428                case '0':
429                case '1':
430                case '2':
431                case '3':
432                case '4':
433                case '5':
434                case '6':
435                case '7':
436                case '8':
437                case '9':
438                    code = (*in - '0') << 4;
439                    break;
440
441                case 'a':
442                case 'b':
443                case 'c':
444                case 'd':
445                case 'e':
446                case 'f':
447                    code = (*in - 'a' + 10) << 4;
448                    break;
449
450                case 'A':
451                case 'B':
452                case 'C':
453                case 'D':
454                case 'E':
455                case 'F':
456                    code = (*in - 'A' + 10) << 4;
457                    break;
458
459                // TODO(edisonn): spec does not say how to handle this error
460                default:
461                    break;
462            }
463
464            in++;  // advance
465
466            switch (*in) {
467                case '0':
468                case '1':
469                case '2':
470                case '3':
471                case '4':
472                case '5':
473                case '6':
474                case '7':
475                case '8':
476                case '9':
477                    code += (*in - '0');
478                    break;
479
480                case 'a':
481                case 'b':
482                case 'c':
483                case 'd':
484                case 'e':
485                case 'f':
486                    code += (*in - 'a' + 10);
487                    break;
488
489                case 'A':
490                case 'B':
491                case 'C':
492                case 'D':
493                case 'E':
494                case 'F':
495                    code += (*in - 'A' + 10);
496                    break;
497
498                // TODO(edisonn): spec does not say how to handle this error
499                default:
500                    break;
501            }
502
503            if (hasOut) { *out = code; }
504            out++;
505            in++;
506        } else {
507            if (hasOut) { *out = *in; }
508            out++;
509            in++;
510        }
511    }
512
513    if (hasOut) {
514        return in;  // consumed already ) at the end of the string
515    } else {
516        // return where the string would end if we reuse the string
517        return start + (out - (const unsigned char*)NULL);
518    }
519}
520
521static int readNameLength(const unsigned char* start, const unsigned char* end) {
522    return readName(start, end, NULL) - start;
523}
524
525static const unsigned char* readName(const unsigned char* start, const unsigned char* end,
526                                     SkPdfNativeObject* name, SkPdfAllocator* allocator) {
527    if (!allocator) {
528        // TODO(edisonn): report error/warn/assert
529        return end;
530    }
531    int outLength = readNameLength(start, end);
532    unsigned char* out = (unsigned char*)allocator->alloc(outLength);
533    const unsigned char* now = readName(start, end, out);
534    SkPdfNativeObject::makeName(out, out + outLength, name);
535    //PUT_TRACK_STREAM(start, now)
536    TRACE_NAME(out, out + outLength);
537    return now;
538}
539
// TODO(edisonn): the PDF spec allows Length to be an indirect object that is defined after
// the stream. That makes for an interesting scenario, where the stream data itself contains
// "endstream" together with an object that looks like the referenced length, while the real
// length object is somewhere else. This could confuse the parser.
544/*example:
545
5467 0 obj
547<< /length 8 0 R>>
548stream
549...............
550endstream
5518 0 obj #we are in stream actually, not a real object
552<< 10 >> #we are in stream actually, not a real object
553endobj
554endstream
5558 0 obj #real obj
556<< 100 >> #real obj
557endobj
558and it could get worse, with multiple object like this
559*/
560
561// right now implement the silly algorithm that assumes endstream is finishing the stream
562
563static const unsigned char* readStream(const unsigned char* start, const unsigned char* end,
564                                       SkPdfNativeObject* dict, SkPdfNativeDoc* doc) {
565    start = skipPdfWhiteSpaces(start, end);
566    if (!(  start[0] == 's' &&
567            start[1] == 't' &&
568            start[2] == 'r' &&
569            start[3] == 'e' &&
570            start[4] == 'a' &&
571            start[5] == 'm')) {
572        // no stream. return.
573        return start;
574    }
575
576    start += 6; // strlen("stream")
577    if (start[0] == kCR_PdfWhiteSpace && start[1] == kLF_PdfWhiteSpace) {
578        start += 2;
579    } else if (start[0] == kLF_PdfWhiteSpace) {
580        start += 1;
581    } else if (isPdfWhiteSpace(start[0])) {
582        start += 1;
583    } else {
584        // TODO(edisonn): warn it should be isPdfDelimiter(start[0])) ?
585    }
586
587    SkPdfStreamCommonDictionary* stream = (SkPdfStreamCommonDictionary*) dict;
588    // TODO(edisonn): load Length
589    int64_t length = -1;
590
591    // TODO(edisonn): very basic implementation
592    if (stream->has_Length() && stream->Length(doc) > 0) {
593        length = stream->Length(doc);
594    }
595
596    // TODO(edisonn): load external streams
597    // TODO(edisonn): look at the last filter, to determine how to deal with possible parsing
598    // issues. The last filter can have special rules to terminate a stream, which we could
599    // use to determine end of stream.
600
601    if (length >= 0) {
602        const unsigned char* endstream = start + length;
603
604        if (endstream[0] == kCR_PdfWhiteSpace && endstream[1] == kLF_PdfWhiteSpace) {
605            endstream += 2;
606        } else if (endstream[0] == kLF_PdfWhiteSpace) {
607            endstream += 1;
608        }
609
610        if (strncmp((const char*)endstream, "endstream", strlen("endstream")) != 0) {
611            length = -1;
612        }
613    }
614
615    if (length < 0) {
616        // scan the buffer, until we find first endstream
617        // TODO(edisonn): all buffers must have a 0 at the end now,
618        const unsigned char* endstream = (const unsigned char*)strrstrk((char*)start, (char*)end,
619                                                                        "endstream");
620
621        if (endstream) {
622            length = endstream - start;
623            if (*(endstream-1) == kLF_PdfWhiteSpace) length--;
624            if (*(endstream-2) == kCR_PdfWhiteSpace) length--;
625        }
626    }
627    if (length >= 0) {
628        const unsigned char* endstream = start + length;
629
630        if (endstream[0] == kCR_PdfWhiteSpace && endstream[1] == kLF_PdfWhiteSpace) {
631            endstream += 2;
632        } else if (endstream[0] == kLF_PdfWhiteSpace) {
633            endstream += 1;
634        }
635
636        // TODO(edisonn): verify the next bytes are "endstream"
637
638        endstream += strlen("endstream");
639        // TODO(edisonn): Assert? report error/warning?
640        dict->addStream(start, (size_t)length);
641        return endstream;
642    }
643    return start;
644}
645
646static const unsigned char* readInlineImageStream(const unsigned char* start,
647                                                  const unsigned char* end,
648                                                  SkPdfImageDictionary* inlineImage,
649                                                  SkPdfNativeDoc* doc) {
650    // We already processed ID keyword, and we should be positioned immediately after it
651
652    // TODO(edisonn): security: either make all streams to have extra 2 bytes at the end,
653    // instead of this if.
654    //if (end - start <= 2) {
655    //    // TODO(edisonn): warning?
656    //    return end; // but can we have a pixel image encoded in 1-2 bytes?
657    //}
658
659    if (start[0] == kCR_PdfWhiteSpace && start[1] == kLF_PdfWhiteSpace) {
660        start += 2;
661    } else if (start[0] == kLF_PdfWhiteSpace) {
662        start += 1;
663    } else if (isPdfWhiteSpace(start[0])) {
664        start += 1;
665    } else {
666        SkASSERT(isPdfDelimiter(start[0]));
667        // TODO(edisonn): warning?
668    }
669
670    const unsigned char* endstream = (const unsigned char*)strrstrk((char*)start, (char*)end, "EI");
671    const unsigned char* endEI = endstream ? endstream + 2 : NULL;  // 2 == strlen("EI")
672
673    if (endstream) {
674        int length = endstream - start;
675        if (*(endstream-1) == kLF_PdfWhiteSpace) length--;
676        if (*(endstream-2) == kCR_PdfWhiteSpace) length--;
677        inlineImage->addStream(start, (size_t)length);
678    } else {
679        // TODO(edisonn): report error in inline image stream (ID-EI) section
680        // TODO(edisonn): based on filter, try to ignore a missing EI, and read data properly
681        return end;
682    }
683    return endEI;
684}
685
686static const unsigned char* readDictionary(const unsigned char* start, const unsigned char* end,
687                                           SkPdfNativeObject* dict,
688                                           SkPdfAllocator* allocator, SkPdfNativeDoc* doc) {
689    if (allocator == NULL) {
690        // TODO(edisonn): report/warning error
691        return end;
692    }
693    SkPdfNativeObject::makeEmptyDictionary(dict);
694    // PUT_TRACK_STREAM(dict, start, start)
695
696    start = skipPdfWhiteSpaces(start, end);
697    SkPdfAllocator tmpStorage;  // keys will be stored in dict, we can free them after set.
698
699    while (start < end && *start == kNamed_PdfDelimiter) {
700        SkPdfNativeObject key;
701        //*start = '\0';
702        start++;
703        start = readName(start, end, &key, &tmpStorage);
704        start = skipPdfWhiteSpaces(start, end);
705
706        if (start < end) {
707            SkPdfNativeObject* value = allocator->allocObject();
708            start = nextObject(start, end, value, allocator, doc);
709
710            start = skipPdfWhiteSpaces(start, end);
711
712            if (start < end) {
713                // We should have an indirect reference
714                if (isPdfDigit(*start)) {
715                    SkPdfNativeObject generation;
716                    start = nextObject(start, end, &generation, allocator, doc);
717
718                    SkPdfNativeObject keywordR;
719                    start = nextObject(start, end, &keywordR, allocator, doc);
720
721                    if (value->isInteger() && generation.isInteger() &&
722                            keywordR.isKeywordReference()) {
723                        int64_t id = value->intValue();
724                        SkPdfNativeObject::resetAndMakeReference(
725                                (unsigned int)id,
726                                (unsigned int)generation.intValue(),
727                                value);
728                        //  PUT_TRACK_PARAMETERS_OBJ2(value, &generation)
729                        dict->set(&key, value);
730                    } else {
731                        // TODO(edisonn) error?, ignore it for now.
732                        dict->set(&key, value);
733                    }
734                } else {
735                    // next elem is not a digit, but it might not be / either!
736                    dict->set(&key, value);
737                }
738            } else {
739                // /key >>
740                dict->set(&key, value);
741                return end;
742            }
743            start = skipPdfWhiteSpaces(start, end);
744        } else {
745            dict->set(&key, &SkPdfNativeObject::kNull);
746            return end;
747        }
748    }
749
750    // now we should expect >>
751    start = skipPdfWhiteSpaces(start, end);
752    if (*start != kClosedInequityBracket_PdfDelimiter) {
753        // TODO(edisonn): report/warning
754    }
755
756    start++;  // skip >
757    if (*start != kClosedInequityBracket_PdfDelimiter) {
758        // TODO(edisonn): report/warning
759    }
760
761    start++;  // skip >
762
763    //STORE_TRACK_PARAMETER_OFFSET_END(dict,start);
764
765    start = readStream(start, end, dict, doc);
766
767    return start;
768}
769
770const unsigned char* nextObject(const unsigned char* start, const unsigned char* end,
771                                SkPdfNativeObject* token,
772                                SkPdfAllocator* allocator, SkPdfNativeDoc* doc) {
773    const unsigned char* current;
774
775    // skip white spaces
776    start = skipPdfWhiteSpaces(start, end);
777
778    if (start >= end) {
779        return end;
780    }
781
782    current = endOfPdfToken(start, end);
783
784    // no token, len would be 0
785    if (current == start || current == end) {
786        return end;
787    }
788
789    int tokenLen = current - start;
790
791    if (tokenLen == 1) {
792        // start array
793        switch (*start) {
794            case kOpenedSquareBracket_PdfDelimiter:
795                return readArray(current, end, token, allocator, doc);
796
797            case kOpenedRoundBracket_PdfDelimiter:
798                return readString(start + 1, end, token, allocator);
799
800            case kOpenedInequityBracket_PdfDelimiter:
801                if (end > start + 1 && start[1] == kOpenedInequityBracket_PdfDelimiter) {
802                    // TODO(edisonn): pass here the length somehow?
803                    return readDictionary(start + 2, end, token, allocator, doc);  // skip <<
804                } else {
805                    return readHexString(start + 1, end, token, allocator);  // skip <
806                }
807
808            case kNamed_PdfDelimiter:
809                return readName(start + 1, end, token, allocator);
810
811            // TODO(edisonn): what to do curly brackets?
812            case kOpenedCurlyBracket_PdfDelimiter:
813            default:
814                break;
815        }
816
817        SkASSERT(!isPdfWhiteSpace(*start));
818        if (isPdfDelimiter(*start)) {
819            // TODO(edisonn): how unexpected stream ] } > ) will be handled?
820            // for now ignore, and it will become a keyword to be ignored
821        }
822    }
823
824    if (tokenLen == 4 && start[0] == 'n' && start[1] == 'u' && start[2] == 'l' && start[3] == 'l') {
825        SkPdfNativeObject::makeNull(token);
826        // PUT_TRACK_STREAM(start, start + 4)
827        return current;
828    }
829
830    if (tokenLen == 4 && start[0] == 't' && start[1] == 'r' && start[2] == 'u' && start[3] == 'e') {
831        SkPdfNativeObject::makeBoolean(true, token);
832        // PUT_TRACK_STREAM(start, start + 4)
833        return current;
834    }
835
836    // TODO(edisonn): again, make all buffers have 5 extra bytes
837    if (tokenLen == 5 && start[0] == 'f' &&
838                         start[1] == 'a' &&
839                         start[2] == 'l' &&
840                         start[3] == 's' &&
841                         start[4] == 'e') {
842        SkPdfNativeObject::makeBoolean(false, token);
843        // PUT_TRACK_STREAM(start, start + 5)
844        return current;
845    }
846
847    if (isPdfNumeric(*start)) {
848        SkPdfNativeObject::makeNumeric(start, current, token);
849        //  PUT_TRACK_STREAM(start, current)
850    } else {
851        SkPdfNativeObject::makeKeyword(start, current, token);
852        // PUT_TRACK_STREAM(start, current)
853    }
854    return current;
855}
856
857SkPdfNativeObject* SkPdfAllocator::allocBlock() {
858    fSizeInBytes += BUFFER_SIZE * sizeof(SkPdfNativeObject);
859    return new SkPdfNativeObject[BUFFER_SIZE];
860}
861
862SkPdfAllocator::~SkPdfAllocator() {
863    for (int i = 0 ; i < fHandles.count(); i++) {
864        free(fHandles[i]);
865    }
866    for (int i = 0 ; i < fHistory.count(); i++) {
867        for (int j = 0 ; j < BUFFER_SIZE; j++) {
868            fHistory[i][j].reset();
869        }
870        delete[] fHistory[i];
871    }
872    for (int j = 0 ; j < BUFFER_SIZE; j++) {
873        fCurrent[j].reset();
874    }
875    delete[] fCurrent;
876}
877
878SkPdfNativeObject* SkPdfAllocator::allocObject() {
879    if (fCurrentUsed >= BUFFER_SIZE) {
880        fHistory.push(fCurrent);
881        fCurrent = allocBlock();
882        fCurrentUsed = 0;
883        fSizeInBytes += sizeof(SkPdfNativeObject*);
884    }
885    fCurrentUsed++;
886    return &fCurrent[fCurrentUsed - 1];
887}
888
889// TODO(edisonn): perf: do no copy the buffers, but reuse them, and mark cache the result,
890// so there is no need of a second pass
891SkPdfNativeTokenizer::SkPdfNativeTokenizer(SkPdfNativeObject* objWithStream,
892                                           SkPdfAllocator* allocator,
893                                           SkPdfNativeDoc* doc)
894            : fDoc(doc)
895            , fAllocator(allocator)
896            , fUncompressedStream(NULL)
897            , fUncompressedStreamEnd(NULL)
898            , fEmpty(false)
899            , fHasPutBack(false) {
900    const unsigned char* buffer = NULL;
901    size_t len = 0;
902    objWithStream->GetFilteredStreamRef(&buffer, &len);
903    // TODO(edisonn): really bad hack, find end of object (endobj might be in a comment!)
904    // we need to do now for perf, and our generated pdfs do not have comments,
905    // but we need to remove this hack for pdfs in the wild
906    char* endobj = strrstrk((char*)buffer, (char*)buffer + len, "endobj");
907    if (endobj) {
908        len = endobj - (char*)buffer + strlen("endobj");
909    }
910    fUncompressedStreamStart = fUncompressedStream = buffer;
911    fUncompressedStreamEnd = fUncompressedStream + len;
912}
913
914SkPdfNativeTokenizer::SkPdfNativeTokenizer(const unsigned char* buffer, int len,
915                                           SkPdfAllocator* allocator,
916                                           SkPdfNativeDoc* doc) : fDoc(doc)
917                                                                , fAllocator(allocator)
918                                                                , fEmpty(false)
919                                                                , fHasPutBack(false) {
920    // TODO(edisonn): really bad hack, find end of object (endobj might be in a comment!)
921    // we need to do now for perf, and our generated pdfs do not have comments,
922    // but we need to remove this hack for pdfs in the wild
923    char* endobj = strrstrk((char*)buffer, (char*)buffer + len, "endobj");
924    if (endobj) {
925        len = endobj - (char*)buffer + strlen("endobj");
926    }
927    fUncompressedStreamStart = fUncompressedStream = buffer;
928    fUncompressedStreamEnd = fUncompressedStream + len;
929}
930
931SkPdfNativeTokenizer::~SkPdfNativeTokenizer() {
932}
933
934bool SkPdfNativeTokenizer::readTokenCore(PdfToken* token) {
935#ifdef PDF_TRACE_READ_TOKEN
936    static int read_op = 0;
937#endif
938
939    token->fKeyword = NULL;
940    token->fObject = NULL;
941
942    fUncompressedStream = skipPdfWhiteSpaces(fUncompressedStream, fUncompressedStreamEnd);
943    if (fUncompressedStream >= fUncompressedStreamEnd) {
944        fEmpty = true;
945        return false;
946    }
947
948    SkPdfNativeObject obj;
949    fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, &obj, fAllocator, fDoc);
950    //  PUT_TRACK_STREAM_ARGS_EXPL2(fStreamId, fUncompressedStreamStart)
951
952    // If it is a keyword, we will only get the pointer of the string.
953    if (obj.type() == SkPdfNativeObject::kKeyword_PdfObjectType) {
954        token->fKeyword = obj.c_str();
955        token->fKeywordLength = obj.lenstr();
956        token->fType = kKeyword_TokenType;
957    } else {
958        SkPdfNativeObject* pobj = fAllocator->allocObject();
959        *pobj = obj;
960        token->fObject = pobj;
961        token->fType = kObject_TokenType;
962    }
963
964#ifdef PDF_TRACE_READ_TOKEN
965    read_op++;
966#if 0
967    if (548 == read_op) {
968        printf("break;\n");
969    }
970#endif
971    printf("%i READ %s %s\n", read_op, token->fType == kKeyword_TokenType ? "Keyword" : "Object",
972           token->fKeyword ? SkString(token->fKeyword, token->fKeywordLength).c_str() :
973                             token->fObject->toString().c_str());
974#endif
975
976    return true;
977}
978
979void SkPdfNativeTokenizer::PutBack(PdfToken token) {
980    SkASSERT(!fHasPutBack);
981    fHasPutBack = true;
982    fPutBack = token;
983#ifdef PDF_TRACE_READ_TOKEN
984    printf("PUT_BACK %s %s\n", token.fType == kKeyword_TokenType ? "Keyword" : "Object",
985           token.fKeyword ? SkString(token.fKeyword, token.fKeywordLength).c_str() :
986                            token.fObject->toString().c_str());
987#endif
988}
989
990bool SkPdfNativeTokenizer::readToken(PdfToken* token, bool writeDiff) {
991    if (fHasPutBack) {
992        *token = fPutBack;
993        fHasPutBack = false;
994#ifdef PDF_TRACE_READ_TOKEN
995        printf("READ_BACK %s %s\n", token->fType == kKeyword_TokenType ? "Keyword" : "Object",
996               token->fKeyword ? SkString(token->fKeyword, token->fKeywordLength).c_str() :
997                                 token->fObject->toString().c_str());
998#endif
999        if (writeDiff) {
1000            SkPdfDiffEncoder::WriteToFile(token);
1001        }
1002        return true;
1003    }
1004
1005    if (fEmpty) {
1006#ifdef PDF_TRACE_READ_TOKEN
1007        printf("EMPTY TOKENIZER\n");
1008#endif
1009        return false;
1010    }
1011
1012    const bool result = readTokenCore(token);
1013    if (result && writeDiff) {
1014        SkPdfDiffEncoder::WriteToFile(token);
1015    }
1016    return result;
1017}
1018
// Defines a global SkPdfName variable called longName. The identifier is
// stringized and passed (cast to char*) to the SkPdfName constructor, so the
// variable name and the constructor argument text are the same.
#define DECLARE_PDF_NAME(longName) SkPdfName longName((char*)#longName)

// keys
// Full-length names that inlineImageKeyAbbreviationExpand() returns in place
// of an abbreviated key.
DECLARE_PDF_NAME(BitsPerComponent);
DECLARE_PDF_NAME(ColorSpace);
DECLARE_PDF_NAME(Decode);
DECLARE_PDF_NAME(DecodeParms);
DECLARE_PDF_NAME(Filter);
DECLARE_PDF_NAME(Height);
DECLARE_PDF_NAME(ImageMask);
DECLARE_PDF_NAME(Intent); // PDF 1.1 - the key, or the abbreviations?
DECLARE_PDF_NAME(Interpolate);
DECLARE_PDF_NAME(Width);

// values
// Full-length names that inlineImageValueAbbreviationExpand() returns in place
// of an abbreviated value.
DECLARE_PDF_NAME(DeviceGray);
DECLARE_PDF_NAME(DeviceRGB);
DECLARE_PDF_NAME(DeviceCMYK);
DECLARE_PDF_NAME(Indexed);
DECLARE_PDF_NAME(ASCIIHexDecode);
DECLARE_PDF_NAME(ASCII85Decode);
DECLARE_PDF_NAME(LZWDecode);
DECLARE_PDF_NAME(FlateDecode);  // PDF 1.2
DECLARE_PDF_NAME(RunLengthDecode);
DECLARE_PDF_NAME(CCITTFaxDecode);
DECLARE_PDF_NAME(DCTDecode);
1045
// Makes the enclosing function return the address of longName when obj is the
// name spelled shortName (the identifier is stringized); otherwise execution
// continues with the next statement.
// The body is wrapped in do { } while (0) so the macro behaves as a single
// statement: it can be used as the branch of an if/else and a following else
// cannot bind to the macro's internal if. It must be followed by a semicolon.
#define HANDLE_NAME_ABBR(obj,longName,shortName) \
    do { if ((obj)->isName(#shortName)) return &longName; } while (0)
1047
1048
1049static SkPdfNativeObject* inlineImageKeyAbbreviationExpand(SkPdfNativeObject* key) {
1050    if (!key || !key->isName()) {
1051        return key;
1052    }
1053
1054    // TODO(edisonn): use autogenerated code!
1055    HANDLE_NAME_ABBR(key, BitsPerComponent, BPC);
1056    HANDLE_NAME_ABBR(key, ColorSpace, CS);
1057    HANDLE_NAME_ABBR(key, Decode, D);
1058    HANDLE_NAME_ABBR(key, DecodeParms, DP);
1059    HANDLE_NAME_ABBR(key, Filter, F);
1060    HANDLE_NAME_ABBR(key, Height, H);
1061    HANDLE_NAME_ABBR(key, ImageMask, IM);
1062//    HANDLE_NAME_ABBR(key, Intent, );
1063    HANDLE_NAME_ABBR(key, Interpolate, I);
1064    HANDLE_NAME_ABBR(key, Width, W);
1065
1066    return key;
1067}
1068
1069static SkPdfNativeObject* inlineImageValueAbbreviationExpand(SkPdfNativeObject* value) {
1070    if (!value || !value->isName()) {
1071        return value;
1072    }
1073
1074    // TODO(edisonn): use autogenerated code!
1075    HANDLE_NAME_ABBR(value, DeviceGray, G);
1076    HANDLE_NAME_ABBR(value, DeviceRGB, RGB);
1077    HANDLE_NAME_ABBR(value, DeviceCMYK, CMYK);
1078    HANDLE_NAME_ABBR(value, Indexed, I);
1079    HANDLE_NAME_ABBR(value, ASCIIHexDecode, AHx);
1080    HANDLE_NAME_ABBR(value, ASCII85Decode, A85);
1081    HANDLE_NAME_ABBR(value, LZWDecode, LZW);
1082    HANDLE_NAME_ABBR(value, FlateDecode, Fl);  // (PDF 1.2)
1083    HANDLE_NAME_ABBR(value, RunLengthDecode, RL);
1084    HANDLE_NAME_ABBR(value, CCITTFaxDecode, CCF);
1085    HANDLE_NAME_ABBR(value, DCTDecode, DCT);
1086
1087    return value;
1088}
1089
1090SkPdfImageDictionary* SkPdfNativeTokenizer::readInlineImage() {
1091    // BI already processed
1092    fUncompressedStream = skipPdfWhiteSpaces(fUncompressedStream, fUncompressedStreamEnd);
1093    if (fUncompressedStream >= fUncompressedStreamEnd) {
1094        return NULL;
1095    }
1096
1097    SkPdfImageDictionary* inlineImage = (SkPdfImageDictionary*)fAllocator->allocObject();
1098    SkPdfNativeObject::makeEmptyDictionary(inlineImage);
1099    //  PUT_TRACK_STREAM_ARGS_EXPL(fStreamId, fUncompressedStream - fUncompressedStreamStart,
1100    //                             fUncompressedStream - fUncompressedStreamStart)
1101
1102    while (fUncompressedStream < fUncompressedStreamEnd) {
1103        SkPdfNativeObject* key = fAllocator->allocObject();
1104        fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, key,
1105                                         fAllocator, fDoc);
1106        // PUT_TRACK_STREAM_ARGS_EXPL2(fStreamId, fUncompressedStreamStart)s
1107
1108        if (key->isKeyword() && key->lenstr() == 2 &&
1109                    key->c_str()[0] == 'I' && key->c_str()[1] == 'D') { // ID
1110            fUncompressedStream = readInlineImageStream(fUncompressedStream, fUncompressedStreamEnd,
1111                                                        inlineImage, fDoc);
1112            return inlineImage;
1113        } else {
1114            SkPdfNativeObject* obj = fAllocator->allocObject();
1115            fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, obj,
1116                                             fAllocator, fDoc);
1117            //  PUT_TRACK_STREAM_ARGS_EXPL2(fStreamId, fUncompressedStreamStart)s
1118            // TODO(edisonn): perf maybe we should not expand abBreviation like this
1119            inlineImage->set(inlineImageKeyAbbreviationExpand(key),
1120                             inlineImageValueAbbreviationExpand(obj));
1121        }
1122    }
1123    // TODO(edisonn): report end of data with inline image without an EI
1124    return inlineImage;
1125}
1126