SkPdfNativeTokenizer.cpp revision 222382b30a176db9d9044d9df1ae14e0fbe27181
1
2#include "SkPdfNativeTokenizer.h"
3#include "SkPdfObject.h"
4#include "SkPdfConfig.h"
5
6#include "SkPdfStreamCommonDictionary_autogen.h"
7
8static unsigned char* skipPdfWhiteSpaces(unsigned char* start, unsigned char* end) {
9    while (start < end && isPdfWhiteSpace(*start)) {
10        if (*start == kComment_PdfDelimiter) {
11            // skip the comment until end of line
12            while (start < end && !isPdfEOL(*start)) {
13                *start = '\0';
14                start++;
15            }
16        } else {
17            *start = '\0';
18            start++;
19        }
20    }
21    return start;
22}
23
24// TODO(edisonn) '(' can be used, will it break the string a delimiter or space inside () ?
25static unsigned char* endOfPdfToken(unsigned char* start, unsigned char* end) {
26    //int opened brackets
27    //TODO(edisonn): what out for special chars, like \n, \032
28
29    SkASSERT(!isPdfWhiteSpace(*start));
30
31    if (start < end && isPdfDelimiter(*start)) {
32        start++;
33        return start;
34    }
35
36    while (start < end && !isPdfWhiteSpaceOrPdfDelimiter(*start)) {
37        start++;
38    }
39    return start;
40}
41
42// last elem has to be ]
43static unsigned char* readArray(unsigned char* start, unsigned char* end, SkPdfObject* array, SkPdfAllocator* allocator) {
44    while (start < end) {
45        // skip white spaces
46        start = skipPdfWhiteSpaces(start, end);
47
48        unsigned char* endOfToken = endOfPdfToken(start, end);
49
50        if (endOfToken == start) {
51            // TODO(edisonn): report error in pdf file (end of stream with ] for end of aray
52            return start;
53        }
54
55        if (endOfToken == start + 1 && *start == kClosedSquareBracket_PdfDelimiter) {
56            return endOfToken;
57        }
58
59        SkPdfObject* newObj = allocator->allocObject();
60        start = nextObject(start, end, newObj, allocator);
61        // TODO(edisonn): perf/memory: put the variables on the stack, and flush them on the array only when
62        // we are sure they are not references!
63        if (newObj->isKeywordReference() && array->size() >= 2 && array->objAtAIndex(array->size() - 1)->isInteger() && array->objAtAIndex(array->size() - 2)->isInteger()) {
64            SkPdfObject* gen = array->removeLastInArray();
65            SkPdfObject* id = array->removeLastInArray();
66            newObj->reset();
67            SkPdfObject::makeReference((unsigned int)id->intValue(), (unsigned int)gen->intValue(), newObj);
68        }
69        array->appendInArray(newObj);
70    }
71    // TODO(edisonn): report not reached, we should never get here
72    SkASSERT(false);
73    return start;
74}
75
76// When we read strings we will rewrite the string so we will reuse the memory
77// when we start to read the string, we already consumed the opened bracket
78static unsigned char* readString(unsigned char* start, unsigned char* end, SkPdfObject* str) {
79    unsigned char* out = start;
80    unsigned char* in = start;
81
82    int openRoundBrackets = 0;
83    while (in < end && (*in != kClosedRoundBracket_PdfDelimiter || openRoundBrackets > 0)) {
84        openRoundBrackets += ((*in) == kOpenedRoundBracket_PdfDelimiter);
85        openRoundBrackets -= ((*in) == kClosedRoundBracket_PdfDelimiter);
86        if (*in == kEscape_PdfSpecial) {
87            if (in + 1 < end) {
88                switch (in[1]) {
89                    case 'n':
90                        *out = kLF_PdfWhiteSpace;
91                        out++;
92                        in += 2;
93                        break;
94
95                    case 'r':
96                        *out = kCR_PdfWhiteSpace;
97                        out++;
98                        in += 2;
99                        break;
100
101                    case 't':
102                        *out = kHT_PdfWhiteSpace;
103                        out++;
104                        in += 2;
105                        break;
106
107                    case 'b':
108                        // TODO(edisonn): any special meaning to backspace?
109                        *out = kBackspace_PdfSpecial;
110                        out++;
111                        in += 2;
112                        break;
113
114                    case 'f':
115                        *out = kFF_PdfWhiteSpace;
116                        out++;
117                        in += 2;
118                        break;
119
120                    case kOpenedRoundBracket_PdfDelimiter:
121                        *out = kOpenedRoundBracket_PdfDelimiter;
122                        out++;
123                        in += 2;
124                        break;
125
126                    case kClosedRoundBracket_PdfDelimiter:
127                        *out = kClosedRoundBracket_PdfDelimiter;
128                        out++;
129                        in += 2;
130                        break;
131
132                    case kEscape_PdfSpecial:
133                        *out = kEscape_PdfSpecial;
134                        out++;
135                        in += 2;
136                        break;
137
138                    case '0':
139                    case '1':
140                    case '2':
141                    case '3':
142                    case '4':
143                    case '5':
144                    case '6':
145                    case '7': {
146                            //read octals
147                            in++;   // consume backslash
148
149                            int code = 0;
150                            int i = 0;
151                            while (in < end && *in >= '0' && *in < '8') {
152                                code = (code << 3) + ((*in) - '0');  // code * 8 + d
153                                i++;
154                                in++;
155                                if (i == 3) {
156                                    *out = code & 0xff;
157                                    out++;
158                                    i = 0;
159                                }
160                            }
161                            if (i > 0) {
162                                *out = code & 0xff;
163                                out++;
164                            }
165                        }
166                        break;
167
168                    default:
169                        // Per spec, backslash is ignored is escaped ch is unknown
170                        in++;
171                        break;
172                }
173            }
174        } else {
175            // TODO(edisonn): perf, avoid copy into itself, maybe first do a simple scan until found backslash ?
176            // we could have one look that first just inc current, and when we find the backslash
177            // we go to this loop
178            *in = *out;
179            in++;
180            out++;
181        }
182    }
183
184
185    SkPdfObject::makeString(start, out, str);
186    return in + 1;  // consume ) at the end of the string
187}
188
189static unsigned char* readHexString(unsigned char* start, unsigned char* end, SkPdfObject* str) {
190    unsigned char* out = start;
191    unsigned char* in = start;
192
193    unsigned char code = 0;
194
195    while (in < end) {
196        while (in < end && isPdfWhiteSpace(*in)) {
197            in++;
198        }
199
200        if (*in == kClosedInequityBracket_PdfDelimiter) {
201            *in = '\0';
202            in++;
203            // normal exit
204            break;
205        }
206
207        if (in >= end) {
208            // end too soon
209            break;
210        }
211
212        switch (*in) {
213            case '0':
214            case '1':
215            case '2':
216            case '3':
217            case '4':
218            case '5':
219            case '6':
220            case '7':
221            case '8':
222            case '9':
223                code = (*in - '0') << 4;
224                break;
225
226            case 'a':
227            case 'b':
228            case 'c':
229            case 'd':
230            case 'e':
231            case 'f':
232                code = (*in - 'a' + 10) << 4;
233                break;
234
235            case 'A':
236            case 'B':
237            case 'C':
238            case 'D':
239            case 'E':
240            case 'F':
241                code = (*in - 'A' + 10) << 4;
242                break;
243
244            // TODO(edisonn): spec does not say how to handle this error
245            default:
246                break;
247        }
248
249        in++;  // advance
250
251        while (in < end && isPdfWhiteSpace(*in)) {
252            in++;
253        }
254
255        // TODO(edisonn): report error
256        if (in >= end) {
257            *out = code;
258            out++;
259            break;
260        }
261
262        if (*in == kClosedInequityBracket_PdfDelimiter) {
263            *out = code;
264            out++;
265            break;
266        }
267
268        switch (*in) {
269            case '0':
270            case '1':
271            case '2':
272            case '3':
273            case '4':
274            case '5':
275            case '6':
276            case '7':
277            case '8':
278            case '9':
279                code += (*in - '0');
280                break;
281
282            case 'a':
283            case 'b':
284            case 'c':
285            case 'd':
286            case 'e':
287            case 'f':
288                code += (*in - 'a' + 10);
289                break;
290
291            case 'A':
292            case 'B':
293            case 'C':
294            case 'D':
295            case 'E':
296            case 'F':
297                code += (*in - 'A' + 10);
298                break;
299
300            // TODO(edisonn): spec does not say how to handle this error
301            default:
302                break;
303        }
304
305        *out = code;
306        out++;
307        in++;
308    }
309
310    if (out < in) {
311        *out = '\0';
312    }
313
314    SkPdfObject::makeHexString(start, out, str);
315    return in;  // consume > at the end of the string
316}
317
318// TODO(edisonn): before PDF 1.2 name could not have special characters, add version parameter
319static unsigned char* readName(unsigned char* start, unsigned char* end, SkPdfObject* name) {
320    unsigned char* out = start;
321    unsigned char* in = start;
322
323    unsigned char code = 0;
324
325    while (in < end) {
326        if (isPdfWhiteSpaceOrPdfDelimiter(*in)) {
327            break;
328        }
329
330        if (*in == '#' && in + 2 < end) {
331            in++;
332            switch (*in) {
333                case '0':
334                case '1':
335                case '2':
336                case '3':
337                case '4':
338                case '5':
339                case '6':
340                case '7':
341                case '8':
342                case '9':
343                    code = (*in - '0') << 4;
344                    break;
345
346                case 'a':
347                case 'b':
348                case 'c':
349                case 'd':
350                case 'e':
351                case 'f':
352                    code = (*in - 'a' + 10) << 4;
353                    break;
354
355                case 'A':
356                case 'B':
357                case 'C':
358                case 'D':
359                case 'E':
360                case 'F':
361                    code = (*in - 'A' + 10) << 4;
362                    break;
363
364                // TODO(edisonn): spec does not say how to handle this error
365                default:
366                    break;
367            }
368
369            in++;  // advance
370
371            switch (*in) {
372                case '0':
373                case '1':
374                case '2':
375                case '3':
376                case '4':
377                case '5':
378                case '6':
379                case '7':
380                case '8':
381                case '9':
382                    code += (*in - '0');
383                    break;
384
385                case 'a':
386                case 'b':
387                case 'c':
388                case 'd':
389                case 'e':
390                case 'f':
391                    code += (*in - 'a' + 10);
392                    break;
393
394                case 'A':
395                case 'B':
396                case 'C':
397                case 'D':
398                case 'E':
399                case 'F':
400                    code += (*in - 'A' + 10);
401                    break;
402
403                // TODO(edisonn): spec does not say how to handle this error
404                default:
405                    break;
406            }
407
408            *out = code;
409            out++;
410            in++;
411        } else {
412            *out = *in;
413            out++;
414            in++;
415        }
416    }
417
418    SkPdfObject::makeName(start, out, name);
419    return in;
420}
421
// TODO(edisonn): the PDF spec allows Length to be an indirect object defined after the stream.
// That makes for an interesting scenario, where the stream data itself contains "endstream"
// together with an object that looks like the referenced length, while the real length object
// is somewhere else. This could confuse the parser.
426/*example:
427
4287 0 obj
429<< /length 8 0 R>>
430stream
431...............
432endstream
4338 0 obj #we are in stream actually, not a real object
434<< 10 >> #we are in stream actually, not a real object
435endobj
436endstream
4378 0 obj #real obj
438<< 100 >> #real obj
439endobj
440and it could get worse, with multiple object like this
441*/
442
443// right now implement the silly algorithm that assumes endstream is finishing the stream
444
445
446static unsigned char* readStream(unsigned char* start, unsigned char* end, SkPdfObject* dict) {
447    start = skipPdfWhiteSpaces(start, end);
448    if (!(start[0] == 's' && start[1] == 't' && start[2] == 'r' && start[3] == 'e' && start[4] == 'a' && start[5] == 'm')) {
449        // no stream. return.
450        return start;
451    }
452
453    start += 6; // strlen("stream")
454    if (start[0] == kCR_PdfWhiteSpace && start[1] == kLF_PdfWhiteSpace) {
455        start += 2;
456    } else if (start[0] == kLF_PdfWhiteSpace) {
457        start += 1;
458    }
459
460    SkPdfStreamCommonDictionary* stream = (SkPdfStreamCommonDictionary*) dict;
461    // TODO(edisonn): load Length
462    int64_t length = -1;
463
464    // TODO(edisonn): very basic implementation
465    if (stream->has_Length() && stream->Length(NULL) > 0) {
466        length = stream->Length(NULL);
467    }
468
469    // TODO(edisonn): laod external streams
470    // TODO(edisonn): look at the last filter, to determione how to deal with possible issue
471
472    if (length < 0) {
473        // scan the buffer, until we find first endstream
474        // TODO(edisonn): all buffers must have a 0 at the end now,
475        // TODO(edisonn): hack (mark end of content with 0)
476        unsigned char lastCh = *end;
477        *end = '\0';
478        //SkASSERT(*end == '\0');
479        unsigned char* endstream = (unsigned char*)strstr((const char*)start, "endstream");
480        *end = lastCh;
481
482        if (endstream) {
483            length = endstream - start;
484            if (*(endstream-1) == kLF_PdfWhiteSpace) length--;
485            if (*(endstream-1) == kCR_PdfWhiteSpace) length--;
486        }
487    }
488    if (length >= 0) {
489        unsigned char* endstream = start + length;
490
491        if (endstream[0] == kCR_PdfWhiteSpace && endstream[1] == kLF_PdfWhiteSpace) {
492            endstream += 2;
493        } else if (endstream[0] == kLF_PdfWhiteSpace) {
494            endstream += 1;
495        }
496
497        // TODO(edisonn): verify the next bytes are "endstream"
498
499        endstream += strlen("endstream");
500        // TODO(edisonn): Assert? report error/warning?
501        dict->addStream(start, (size_t)length);
502        return endstream;
503    }
504    return start;
505}
506
507static unsigned char* readDictionary(unsigned char* start, unsigned char* end, SkPdfObject* dict, SkPdfAllocator* allocator) {
508    SkPdfObject::makeEmptyDictionary(dict);
509
510    start = skipPdfWhiteSpaces(start, end);
511
512    while (start < end && *start == kNamed_PdfDelimiter) {
513        SkPdfObject key;
514        *start = '\0';
515        start++;
516        start = readName(start, end, &key);
517        start = skipPdfWhiteSpaces(start, end);
518
519        if (start < end) {
520            SkPdfObject* value = allocator->allocObject();
521            start = nextObject(start, end, value, allocator);
522
523            start = skipPdfWhiteSpaces(start, end);
524
525            if (start < end) {
526                // seems we have an indirect reference
527                if (isPdfDigit(*start)) {
528                    SkPdfObject generation;
529                    start = nextObject(start, end, &generation, allocator);
530
531                    SkPdfObject keywordR;
532                    start = nextObject(start, end, &keywordR, allocator);
533
534                    if (value->isInteger() && generation.isInteger() && keywordR.isKeywordReference()) {
535                        int64_t id = value->intValue();
536                        value->reset();
537                        SkPdfObject::makeReference((unsigned int)id, (unsigned int)generation.intValue(), value);
538                        dict->set(&key, value);
539                    } else {
540                        // error, ignore
541                        dict->set(&key, value);
542                    }
543                } else {
544                    // next elem is not a digit, but it might not be / either!
545                    dict->set(&key, value);
546                }
547            } else {
548                // /key >>
549                dict->set(&key, value);
550                return end;
551            }
552            start = skipPdfWhiteSpaces(start, end);
553        } else {
554            dict->set(&key, &SkPdfObject::kNull);
555            return end;
556        }
557    }
558
559    // TODO(edisonn): options to ignore these errors
560
561    // now we should expect >>
562    start = skipPdfWhiteSpaces(start, end);
563    start = endOfPdfToken(start, end);  // >
564    start = endOfPdfToken(start, end);  // >
565
566    // TODO(edisonn): read stream ... put dict and stream in a struct, and have a pointer to struct ...
567    // or alocate 2 objects, and if there is no stream, free it to be used by someone else? or just leave it ?
568
569    start = readStream(start, end, dict);
570
571    return start;
572}
573
574unsigned char* nextObject(unsigned char* start, unsigned char* end, SkPdfObject* token, SkPdfAllocator* allocator) {
575    unsigned char* current;
576
577    // skip white spaces
578    start = skipPdfWhiteSpaces(start, end);
579
580    current = endOfPdfToken(start, end);
581
582    // no token, len would be 0
583    if (current == start) {
584        return NULL;
585    }
586
587    int tokenLen = current - start;
588
589    if (tokenLen == 1) {
590        // start array
591        switch (*start) {
592            case kOpenedSquareBracket_PdfDelimiter:
593                *start = '\0';
594                SkPdfObject::makeEmptyArray(token);
595                return readArray(current, end, token, allocator);
596
597            case kOpenedRoundBracket_PdfDelimiter:
598                *start = '\0';
599                return readString(start, end, token);
600
601            case kOpenedInequityBracket_PdfDelimiter:
602                *start = '\0';
603                if (end > start + 1 && start[1] == kOpenedInequityBracket_PdfDelimiter) {
604                    // TODO(edisonn): pass here the length somehow?
605                    return readDictionary(start + 2, end, token, allocator);  // skip <<
606                } else {
607                    return readHexString(start + 1, end, token);  // skip <
608                }
609
610            case kNamed_PdfDelimiter:
611                *start = '\0';
612                return readName(start + 1, end, token);
613
614            // TODO(edisonn): what to do curly brackets? read spec!
615            case kOpenedCurlyBracket_PdfDelimiter:
616            default:
617                break;
618        }
619
620        SkASSERT(!isPdfWhiteSpace(*start));
621        if (isPdfDelimiter(*start)) {
622            // TODO(edisonn): how stream ] } > ) will be handled?
623            // for now ignore, and it will become a keyword to be ignored
624        }
625    }
626
627    if (tokenLen == 4 && start[0] == 'n' && start[1] == 'u' && start[2] == 'l' && start[3] == 'l') {
628        SkPdfObject::makeNull(token);
629        return current;
630    }
631
632    if (tokenLen == 4 && start[0] == 't' && start[1] == 'r' && start[2] == 'u' && start[3] == 'e') {
633        SkPdfObject::makeBoolean(true, token);
634        return current;
635    }
636
637    if (tokenLen == 5 && start[0] == 'f' && start[1] == 'a' && start[2] == 'l' && start[3] == 's' && start[3] == 'e') {
638        SkPdfObject::makeBoolean(false, token);
639        return current;
640    }
641
642    if (isPdfNumeric(*start)) {
643        SkPdfObject::makeNumeric(start, current, token);
644    } else {
645        SkPdfObject::makeKeyword(start, current, token);
646    }
647    return current;
648}
649
650SkPdfObject* SkPdfAllocator::allocBlock() {
651    return new SkPdfObject[BUFFER_SIZE];
652}
653
654SkPdfAllocator::~SkPdfAllocator() {
655    for (int i = 0 ; i < fHandles.count(); i++) {
656        free(fHandles[i]);
657    }
658    for (int i = 0 ; i < fHistory.count(); i++) {
659        for (int j = 0 ; j < BUFFER_SIZE; j++) {
660            fHistory[i][j].reset();
661        }
662        delete[] fHistory[i];
663    }
664    for (int j = 0 ; j < BUFFER_SIZE; j++) {
665        fCurrent[j].reset();
666    }
667    delete[] fCurrent;
668}
669
670SkPdfObject* SkPdfAllocator::allocObject() {
671    if (fCurrentUsed >= BUFFER_SIZE) {
672        fHistory.push(fCurrent);
673        fCurrent = allocBlock();
674        fCurrentUsed = 0;
675    }
676    fCurrentUsed++;
677    return &fCurrent[fCurrentUsed - 1];
678}
679
680// TODO(edisonn): perf: do no copy the buffers, but use them, and mark cache the result, so there is no need of a second pass
681SkPdfNativeTokenizer::SkPdfNativeTokenizer(SkPdfObject* objWithStream, const SkPdfMapper* mapper, SkPdfAllocator* allocator) : fMapper(mapper), fAllocator(allocator), fUncompressedStream(NULL), fUncompressedStreamEnd(NULL), fEmpty(false), fHasPutBack(false) {
682    unsigned char* buffer = NULL;
683    size_t len = 0;
684    objWithStream->GetFilteredStreamRef(&buffer, &len, fAllocator);
685    // TODO(edisonn): hack, find end of object
686    char* endobj = strstr((char*)buffer, "endobj");
687    if (endobj) {
688        len = endobj - (char*)buffer + strlen("endobj");
689    }
690    fUncompressedStreamStart = fUncompressedStream = (unsigned char*)fAllocator->alloc(len);
691    fUncompressedStreamEnd = fUncompressedStream + len;
692    memcpy(fUncompressedStream, buffer, len);
693}
694
695SkPdfNativeTokenizer::SkPdfNativeTokenizer(unsigned char* buffer, int len, const SkPdfMapper* mapper, SkPdfAllocator* allocator) : fMapper(mapper), fAllocator(allocator), fEmpty(false), fHasPutBack(false) {
696    // TODO(edisonn): hack, find end of object
697    char* endobj = strstr((char*)buffer, "endobj");
698    if (endobj) {
699        len = endobj - (char*)buffer + strlen("endobj");
700    }
701    fUncompressedStreamStart = fUncompressedStream = (unsigned char*)fAllocator->alloc(len);
702    fUncompressedStreamEnd = fUncompressedStream + len;
703    memcpy(fUncompressedStream, buffer, len);
704}
705
// The destructor body is empty: nothing is released here.
// NOTE(review): the token buffer is obtained from fAllocator->alloc() in the constructors, so
// it is presumably owned and released by the allocator -- confirm.
SkPdfNativeTokenizer::~SkPdfNativeTokenizer() {
}
708
709bool SkPdfNativeTokenizer::readTokenCore(PdfToken* token) {
710    token->fKeyword = NULL;
711    token->fObject = NULL;
712
713    fUncompressedStream = skipPdfWhiteSpaces(fUncompressedStream, fUncompressedStreamEnd);
714    if (fUncompressedStream >= fUncompressedStreamEnd) {
715        return false;
716    }
717
718    SkPdfObject obj;
719    fUncompressedStream = nextObject(fUncompressedStream, fUncompressedStreamEnd, &obj, fAllocator);
720
721    // If it is a keyword, we will only get the pointer of the string
722    if (obj.type() == SkPdfObject::kKeyword_PdfObjectType) {
723        token->fKeyword = obj.c_str();
724        token->fKeywordLength = obj.len();
725        token->fType = kKeyword_TokenType;
726    } else {
727        SkPdfObject* pobj = fAllocator->allocObject();
728        *pobj = obj;
729        token->fObject = pobj;
730        token->fType = kObject_TokenType;
731    }
732
733#ifdef PDF_TRACE
734    static int read_op = 0;
735    read_op++;
736    if (548 == read_op) {
737        printf("break;\n");
738    }
739    printf("%i READ %s %s\n", read_op, token->fType == kKeyword_TokenType ? "Keyword" : "Object", token->fKeyword ? std::string(token->fKeyword, token->fKeywordLength).c_str() : token->fObject->toString().c_str());
740#endif
741
742    return true;
743}
744
745void SkPdfNativeTokenizer::PutBack(PdfToken token) {
746    SkASSERT(!fHasPutBack);
747    fHasPutBack = true;
748    fPutBack = token;
749#ifdef PDF_TRACE
750    printf("PUT_BACK %s %s\n", token.fType == kKeyword_TokenType ? "Keyword" : "Object", token.fKeyword ? std::string(token.fKeyword, token.fKeywordLength).c_str(): token.fObject->toString().c_str());
751#endif
752}
753
754bool SkPdfNativeTokenizer::readToken(PdfToken* token) {
755    if (fHasPutBack) {
756        *token = fPutBack;
757        fHasPutBack = false;
758#ifdef PDF_TRACE
759    printf("READ_BACK %s %s\n", token->fType == kKeyword_TokenType ? "Keyword" : "Object", token->fKeyword ? std::string(token->fKeyword, token->fKeywordLength).c_str() : token->fObject->toString().c_str());
760#endif
761        return true;
762    }
763
764    if (fEmpty) {
765#ifdef PDF_TRACE
766    printf("EMPTY TOKENIZER\n");
767#endif
768        return false;
769    }
770
771    return readTokenCore(token);
772}
773
774