1/*
2 * Copyright (C) 2008 Esmertec AG.
3 * Copyright (C) 2008 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#include <stdio.h>
19#include <stdlib.h>
20#include <setjmp.h>
21#include <assert.h>
22#include "wbxml_parser.h"
23#include "csp13_data.h"
24#ifdef SUPPORT_SYNCML
25#include "syncml_data.h"
26#endif
27
28#ifdef PLATFORM_ANDROID
29extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb,
30        size_t size, int (*compar)(const void *, const void *));
31#endif
32
33#define ARRAY_SIZE(a)   (sizeof(a) / sizeof(a[0]))
34
35//#define WBXML_DEBUG 1
36
37/* Major TODO items:
38   - Attribute value tokens (not used by IMPS CSP)
39   - EXT_* except EXT_T_0 (not used by IMPS CSP)
40   - PI (not used by IMPS CSP)
41   - cleanups
42
43   Other TODO:
44   - Support more public ID? Only IMPS is supported now.
45   - Support other charsets than UTF-8
46 */
47
48static int compareTokenData(const void * t1, const void * t2)
49{
50    return ((TokenData *)t1)->token - ((TokenData *)t2)->token;
51}
52
53static int compareAttrData(const void * t1, const void * t2)
54{
55    return ((AttrData *)t1)->token - ((AttrData *)t2)->token;
56}
57
58static bool isTagStart(int token)
59{
60    if (token == TOKEN_SWITCH_PAGE)
61        return true;
62
63    token &= 0x3f;
64    return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0);
65}
66
67static bool isAttrStart(int token)
68{
69    return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0) ||
70        (token > TOKEN_LITERAL_C && token < 0x80);
71}
72
73WbxmlParser::WbxmlParser(uint32_t transportEncoding) :
74    mTransportEncoding(transportEncoding)
75{
76    reset();
77}
78
79WbxmlParser::~WbxmlParser()
80{
81}
82
83void WbxmlParser::reset(void)
84{
85    mContentHandler = NULL;
86
87    mExternalChunk = NULL;
88    mExternalChunkLen = 0;
89    mLastChunk.clear();
90    mDataOffset = 0;
91    mIsDataEnd = false;
92
93    mStartElemStack.clear();
94    mStringTable.clear();
95
96    mCurrTagPage = mCurrAttrPage = 0;
97    mPublicId = 0;
98
99    mState = EXPECT_HEADER;
100    mLastError = ERROR_NO_ERROR;
101}
102
103void WbxmlParser::setContentHandler(WbxmlContentHandler * handler)
104{
105    mContentHandler = handler;
106}
107
108int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end)
109{
110    if (data == NULL) {
111        mLastError = ERROR_INVALID_DATA;
112        return WBXML_STATUS_ERROR;
113    }
114
115    // All temporary C++ varaibles must be declared before setjmp to make
116    // sure they get properly destructed after longjmp.
117    vector<Attribute> attribs;
118    Attribute attrib;
119    string tagName;
120    string characters;
121    string opaque;
122
123#ifdef WBXML_DEBUG
124    printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n",
125        dataLen, end, getReadPos(), availDataSize());
126#endif
127    appendData(data, dataLen, end);
128    volatile int readPos = getReadPos();
129    int setjmpRet;
130    switch (setjmpRet = setjmp(mJmpbuf)) {
131        case 0:
132            break;
133
134        case ERROR_NEED_MORE_DATA:
135            if (!mIsDataEnd) {
136#ifdef WBXML_DEBUG
137                printf("\nneed more data: readPos %d\n", readPos);
138#endif
139                setReadPos(readPos);
140                saveRemainingData();
141                return WBXML_STATUS_OK;
142            } else {
143#ifdef WBXML_DEBUG
144                printf("wbxml parser error: unexpected data end\n");
145#endif
146                mLastError = ERROR_NEED_MORE_DATA;
147                return WBXML_STATUS_ERROR;
148            }
149            break;
150
151        case ERROR_UNSUPPORTED_PUBID:
152        case ERROR_UNSUPPORTED_CHARSET:
153        case ERROR_INVALID_STRING_TABLE:
154        case ERROR_INVALID_STRING_TABLE_REFERENCE:
155        case ERROR_INVALID_EXT_TOKEN:
156        case ERROR_INVALID_MBUINT:
157        case ERROR_INVALID_ENTITY:
158        case ERROR_UNRECOGNIZED_TAG:
159        case ERROR_UNRECOGNIZED_ATTR:
160        case ERROR_MISSING_ATTR:
161        case ERROR_MISSING_TOKEN_END:
162#ifdef WBXML_DEBUG
163            printf("wbxml parser error %d\n", setjmpRet);
164#endif
165            mLastError = ParserError(setjmpRet);
166            return WBXML_STATUS_ERROR;
167            break;
168
169        case ERROR_NOT_SUPPORTED_YET:
170            printf("wbxml parser error: Not implemented feature.\n");
171            mLastError = ParserError(setjmpRet);
172            return WBXML_STATUS_ERROR;
173            break;
174
175        default:
176            printf("wbxml parser error: Impossible execution path.\n");
177            mLastError = ParserError(setjmpRet);
178            return WBXML_STATUS_ERROR;
179            break;
180    }
181
182    for (;;) {
183        // save readPos for error recovery
184        readPos = getReadPos();
185
186        switch (mState) {
187            case EXPECT_HEADER:
188                mDocVersion = readByte();
189
190                mPublicId = readMbuint32();
191                if (mPublicId != 0) {
192                    if (!selectTokenMapping(mPublicId)) {
193#ifdef WBXML_DEBUG
194                        printf("wbxml parser error: unsupported public id \n");
195#endif
196                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
197                    }
198                } else {
199                    mPublicId = -readMbuint32();
200                }
201                mCharset = readMbuint32();
202                if (!mCharset) {
203                    mCharset = mTransportEncoding;
204                    if (!mCharset) {
205                        mCharset = CHARSET_UTF8;
206                    }
207                }
208                // TODO: support more charsets other than UTF-8
209                if (mCharset != CHARSET_UTF8) {
210#ifdef WBXML_DEBUG
211                    printf("wbxml parser error: unsupported charset\n");
212#endif
213                    longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET);
214                }
215
216                // now advance to next state
217                if (mContentHandler) {
218                    mContentHandler->handlePublicId(mPublicId);
219                }
220                mState = EXPECT_STRING_TABLE;
221                break;
222
223            case EXPECT_STRING_TABLE:
224            {
225                uint32_t len = readMbuint32();
226                if (availDataSize() < len) {
227                    longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
228                }
229                mStringTable.clear();
230                // TODO: optimize this
231                while (len--) {
232                    mStringTable += readByte();
233                }
234                if (mStringTable.size()) {
235                    if (mStringTable[mStringTable.size() - 1] != 0) {
236                        // must have an ending \0
237                        //TODO:the byte array returned by SCTS does not contain '\0' at the
238                        //end,should this be fixed accordingly?
239#ifdef WBXML_DEBUG
240                        printf("wbxml parser error: invalid string table\n");
241#endif
242                        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE);
243                    }
244                }
245                mState = EXPECT_BODY_START;
246                if (mPublicId <= 0) {
247                    const char * s = mStringTable.c_str() + (-mPublicId);
248#ifdef SUPPORT_SYNCML
249                    if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) {
250                        mPublicId = PUBLICID_SYNCML_1_2;
251                    } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) {
252                        mPublicId = PUBLICID_SYNCML_1_1;
253                    } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) {
254                        mPublicId = PUBLICID_SYNCML_1_0;
255                    }
256#endif
257                    if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) {
258                        longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
259                    }
260                }
261                break;
262            }
263
264            case EXPECT_BODY_START:
265                //TODO: handle possible PIs
266                mState = EXPECT_ELEMENT_START;
267                break;
268
269            case EXPECT_ELEMENT_START:
270            {
271                int stag = readByte();
272                const char * name;
273                if ((stag & 0x3f) == TOKEN_LITERAL) {
274                    name = resolveStrTableRef();
275                } else {
276                    if (stag == TOKEN_SWITCH_PAGE) {
277                        mCurrTagPage = readByte();
278                        stag = readByte();
279                    }
280                    name = lookupTagName(stag);
281                }
282                if (name == NULL) {
283#ifdef WBXML_DEBUG
284                    printf("wbxml parser error: unrecognized tag\n");
285#endif
286                    longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG);
287                }
288                attribs.clear();
289                if (stag & 0x80) {
290                    // followed by 1 or more attributes
291                    while (peekByte() != TOKEN_END) {
292                        readAttribute(&attrib);
293                        attribs.push_back(attrib);
294                    }
295                    if (!attribs.size()) {
296#ifdef WBXML_DEBUG
297                        printf("wbxml parser error: missing attributes\n");
298#endif
299                        longjmp(mJmpbuf, ERROR_MISSING_ATTR);
300                    }
301                    // TOKEN_END
302                    readByte();
303                }
304                if (mContentHandler) {
305                    mContentHandler->startElement(name, attribs);
306                }
307                if (stag & 0x40) {
308                    mState = EXPECT_CONTENT;
309                } else {
310                    mState = ELEMENT_END;
311                }
312                tagName = name;
313                mStartElemStack.push_back(name);
314                break;
315            }
316
317            case EXPECT_CONTENT:
318            {
319                int byte = peekByte();
320                if (byte == TOKEN_SWITCH_PAGE) {
321                    readByte();
322                    mCurrTagPage = readByte();
323                    byte = peekByte();
324                }
325                if (isTagStart(byte) || byte == TOKEN_END) {
326                    if (characters.size() && mContentHandler) {
327                        mContentHandler->characters(characters.c_str(), characters.size());
328                        characters.clear();
329                    }
330                    if (byte == TOKEN_END) {
331                        mState = EXPECT_ELEMENT_END;
332                    } else {
333                        mState = EXPECT_ELEMENT_START;
334                    }
335                } else {
336                    // TODO: handle extension and pi
337                    switch (byte) {
338                        case TOKEN_ENTITY:
339                        case TOKEN_STR_I:
340                        case TOKEN_STR_T:
341                            readString(characters);
342                            break;
343
344                        case TOKEN_EXT_T_0:
345                        {
346                            readByte();
347                            uint32_t valueToken = readMbuint32();
348                            if (mPublicId == PUBLICID_IMPS_1_1
349                                    || mPublicId == PUBLICID_IMPS_1_2
350                                    || mPublicId == PUBLICID_IMPS_1_3) {
351                                TokenData t = {valueToken, NULL};
352                                const TokenData * res = (TokenData *)bsearch(&t,
353                                        csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens),
354                                        sizeof(csp13ExtValueTokens[0]), compareTokenData);
355                                if (res) {
356                                    characters.append(res->tagName);
357                                } else {
358                                    longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN);
359                                }
360                            } else {
361                                printf ("Token 0x%x\n", byte);
362                                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
363                            }
364                            break;
365                        }
366
367                        case TOKEN_OPAQUE:
368                        {
369                            readByte();
370                            uint32_t opaqueDataLen = readMbuint32();
371                            opaque.clear();
372                            while (opaqueDataLen--) {
373                                opaque += (char)readByte();
374                            }
375                            if (mContentHandler) {
376                                mContentHandler->opaque(opaque.c_str(), opaque.size());
377                            }
378                            break;
379                        }
380
381                        default:
382                            printf ("Token 0x%x\n", byte);
383                            longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
384                            break;
385                    }
386                }
387                break;
388            }
389
390            case EXPECT_ELEMENT_END:
391                if (readByte() != TOKEN_END) {
392#ifdef WBXML_DEBUG
393                    printf("wbxml parser error: TOKEN_END expected\n");
394#endif
395                    longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END);
396                }
397                mState = ELEMENT_END;
398                break;
399
400            case ELEMENT_END:
401                assert(!mStartElemStack.empty());
402
403                tagName = mStartElemStack.back();
404                mStartElemStack.pop_back();
405                if (mContentHandler) {
406                    mContentHandler->endElement(tagName.c_str());
407                }
408                if (mStartElemStack.empty()) {
409                    mState = EXPECT_BODY_END;
410                } else {
411                    mState = EXPECT_CONTENT;
412                }
413                break;
414
415            case EXPECT_BODY_END:
416                // TODO: handle possible PIs
417
418                // we're done
419                return WBXML_STATUS_OK;
420                break;
421        }
422    }
423}
424
425/*
426 * We don't make a copy of the data chunk for the current parse() until
427 * it returns.
428 * The remaining data will be saved in saveRemainingData() before parse()
429 * returns.
430 */
431void WbxmlParser::appendData(const char * data, uint32_t len, bool end)
432{
433    mExternalChunk = data;
434    mExternalChunkLen = len;
435    mIsDataEnd = end;
436}
437
438void WbxmlParser::saveRemainingData()
439{
440    if (mDataOffset > mLastChunk.size()) {
441        uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
442        assert(offsetToExtChunk <= mExternalChunkLen);
443        mLastChunk.assign(mExternalChunk + offsetToExtChunk,
444                mExternalChunkLen - offsetToExtChunk);
445        mDataOffset = 0;
446    } else {
447        mLastChunk.append(mExternalChunk, mExternalChunkLen);
448    }
449    mExternalChunk = NULL;
450    mExternalChunkLen = 0;
451}
452
453int WbxmlParser::readByte()
454{
455    if (mDataOffset < mLastChunk.size()) {
456#ifdef WBXML_DEBUG
457        printf ("rb 0x%x; ", (unsigned char)mLastChunk[mDataOffset]);
458#endif
459        return (unsigned char)mLastChunk[mDataOffset++];
460    } else {
461        uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
462        if (offsetToExtChunk < mExternalChunkLen) {
463            mDataOffset++;
464#ifdef WBXML_DEBUG
465            printf ("rb 0x%x; ", (unsigned char)mExternalChunk[offsetToExtChunk]);
466#endif
467            return (unsigned char)mExternalChunk[offsetToExtChunk];
468        }
469        longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
470    }
471}
472
473int WbxmlParser::peekByte()
474{
475    if (mDataOffset < mLastChunk.size()) {
476        return (unsigned char)mLastChunk[mDataOffset];
477    } else {
478        uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
479        if (offsetToExtChunk < mExternalChunkLen) {
480            return (unsigned char)mExternalChunk[offsetToExtChunk];
481        }
482        longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
483    }
484}
485
486uint32_t WbxmlParser::readMbuint32()
487{
488    uint32_t value = 0;
489    uint32_t byte;
490    do {
491        if ((value >> 25) != 0) {
492            // would go overflow. not a valid uint32.
493            longjmp(mJmpbuf, ERROR_INVALID_MBUINT);
494        }
495        byte = readByte();
496        value = (value << 7) | (byte & 0x7f);
497    } while (byte & 0x80);
498    return value;
499}
500
501/**
502 * Read STR_I | STR_T | ENTITY and *append* to str.
503 * Yes this looks ugly...
504 */
505void WbxmlParser::readString(string & str)
506{
507    int byte = readByte();
508    switch (byte) {
509        case TOKEN_STR_I:
510            //TODO: assuming UTF-8
511            while ((byte = readByte()) != 0) {
512                str += (char)byte;
513            }
514            break;
515
516        case TOKEN_ENTITY:
517        {
518            uint32_t ch = readMbuint32();
519            //TODO: assuming UTF-8 for now.
520            if (ch <= 0x7f) {
521                str += (char)ch;
522            } else if (ch <= 0x7ff) {
523                str += (char)((ch >> 6) | 0xc0);
524                str += (char)((ch & 0x3f) | 0x80);
525            } else if (ch <= 0xffff) {
526                str += (char)((ch >> 12) | 0xe0);
527                str += (char)(((ch >> 6) & 0x3f) | 0x80);
528                str += (char)((ch & 0x3f) | 0x80);
529            } else if (ch <= 0x10ffff) {
530                // 010000 - 10FFFF
531                str += (char)((ch >> 18) | 0xf0);
532                str += (char)(((ch >> 12) & 0x3f) | 0x80);
533                str += (char)(((ch >> 6) & 0x3f) | 0x80);
534                str += (char)((ch & 0x3f) | 0x80);
535            } else {
536                // not a valid UCS-4 character
537                longjmp(mJmpbuf, ERROR_INVALID_ENTITY);
538            }
539            break;
540        }
541
542        case TOKEN_STR_T:
543        {
544            const char * s = resolveStrTableRef();
545            str.append(s, strlen(s));
546            break;
547        }
548
549        default:
550            // impossible
551            printf ("Unknown token 0x%02x\n", byte);
552            longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
553            break;
554    }
555}
556
557const char * WbxmlParser::resolveStrTableRef(void)
558{
559    uint32_t offset = readMbuint32();
560    if (offset >= mStringTable.size()) {
561        longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
562    }
563    return mStringTable.c_str() + offset;
564}
565
566bool WbxmlParser::selectTokenMapping(int publicId)
567{
568    switch (publicId) {
569        case PUBLICID_IMPS_1_3:
570        case PUBLICID_IMPS_1_2:
571        case PUBLICID_IMPS_1_1:
572            mTagPages = csp13TagPages;
573            mNumTagPages = ARRAY_SIZE(csp13TagPages);
574            mAttrPages = csp13AttrPages;
575            mNumAttrPages = ARRAY_SIZE(csp13AttrPages);
576            break;
577
578#ifdef SUPPORT_SYNCML
579        case PUBLICID_SYNCML_1_0:
580        case PUBLICID_SYNCML_1_1:
581        case PUBLICID_SYNCML_1_2:
582        case PUBLICID_SYNCML_METINF_1_2:
583            mTagPages = syncmlTagPages;
584            mNumTagPages = ARRAY_SIZE(syncmlTagPages);
585            mAttrPages = NULL;
586            mNumAttrPages = 0;
587            break;
588
589        case PUBLICID_SYNCML_DEVINF_1_2:
590            mTagPages = syncmlDevInfTagPages;
591            mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages);
592            mAttrPages = NULL;
593            mNumAttrPages = 0;
594            break;
595#endif
596        default:
597            return false;
598    }
599    return true;
600}
601
602const char * WbxmlParser::lookupTagName(int tag) const
603{
604    tag = tag & 0x3f;
605
606    // TODO: optimize this
607    if (mCurrTagPage >= mNumTagPages) {
608        return NULL;
609    }
610    const TagCodePage * page = &mTagPages[mCurrTagPage];
611    if (page == NULL) {
612        return NULL;
613    }
614
615    TokenData t = {tag, NULL};
616    const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens,
617            sizeof(TokenData), compareTokenData);
618    if (res) {
619        return res->tagName;
620    }
621
622    return NULL;
623}
624
625const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const
626{
627    // TODO: optimize this
628    if (mCurrAttrPage >= mNumAttrPages) {
629        return NULL;
630    }
631    const AttrCodePage * page = &mAttrPages[mCurrAttrPage];
632    if (page == NULL) {
633        return NULL;
634    }
635
636    AttrData t = {token, NULL, NULL};
637    const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens,
638            sizeof(AttrData), compareAttrData);
639    if (res) {
640        if (prefix) {
641            *prefix = res->attrValuePrefix;
642        }
643        return res->attrName;
644    }
645
646    return NULL;
647}
648
649void WbxmlParser::readAttribute(Attribute * attrib)
650{
651    // attribute start: attrib start token, LITERAL or END
652    int attrStart = readByte();
653    const char * name;
654    const char * valuePrefix = NULL;
655
656    if (attrStart == TOKEN_LITERAL) {
657        name = resolveStrTableRef();
658    } else {
659        if (attrStart == TOKEN_SWITCH_PAGE) {
660            mCurrAttrPage = readByte();
661            attrStart = readByte();
662        }
663        name = lookupAttrName(attrStart, &valuePrefix);
664    }
665    if (name == NULL) {
666        longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR);
667    }
668    attrib->name = name;
669    attrib->value = "";
670    if (valuePrefix != NULL) {
671        attrib->value = valuePrefix;
672    }
673
674    // now attribute value: zero or more value, string, entity or extension tokens
675    for (;;) {
676        int valueToken = peekByte();
677        if (isAttrStart(valueToken) || valueToken == TOKEN_END) {
678            // An attribute start token, a LITERAL token or the END token
679            // indicates the end of an attribute value.
680            return;
681        }
682        switch (valueToken) {
683            case TOKEN_ENTITY:
684            case TOKEN_STR_I:
685            case TOKEN_STR_T:
686                readString(attrib->value);
687                break;
688
689            case TOKEN_EXT_I_0:
690            case TOKEN_EXT_I_1:
691            case TOKEN_EXT_I_2:
692            case TOKEN_EXT_0:
693            case TOKEN_EXT_1:
694            case TOKEN_EXT_2:
695                //TODO: document type specific
696                printf ("Unsupported Token 0x%x\n", valueToken);
697                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
698                break;
699
700            default:
701                //TODO
702                printf ("Unknown Token 0x%x\n", valueToken);
703                longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
704                break;
705        }
706    }
707}
708
709