1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3*/ 4 5#include <stddef.h> 6 7#ifdef COMPILED_FROM_DSP 8#include "winconfig.h" 9#elif defined(MACOS_CLASSIC) 10#include "macconfig.h" 11#elif defined(__amigaos__) 12#include "amigaconfig.h" 13#elif defined(__WATCOMC__) 14#include "watcomconfig.h" 15#else 16#ifdef HAVE_EXPAT_CONFIG_H 17#include <expat_config.h> 18#endif 19#endif /* ndef COMPILED_FROM_DSP */ 20 21#include "expat_external.h" 22#include "internal.h" 23#include "xmltok.h" 24#include "nametab.h" 25 26#ifdef XML_DTD 27#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 28#else 29#define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 30#endif 31 32#define VTABLE1 \ 33 { PREFIX(prologTok), PREFIX(contentTok), \ 34 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ 35 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 36 PREFIX(sameName), \ 37 PREFIX(nameMatchesAscii), \ 38 PREFIX(nameLength), \ 39 PREFIX(skipS), \ 40 PREFIX(getAtts), \ 41 PREFIX(charRefNumber), \ 42 PREFIX(predefinedEntityName), \ 43 PREFIX(updatePosition), \ 44 PREFIX(isPublicId) 45 46#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 47 48#define UCS2_GET_NAMING(pages, hi, lo) \ 49 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 50 51/* A 2 byte UTF-8 representation splits the characters 11 bits between 52 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 53 pages, 3 bits to add to that index and 5 bits to generate the mask. 54*/ 55#define UTF8_GET_NAMING2(pages, byte) \ 56 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 57 + ((((byte)[0]) & 3) << 1) \ 58 + ((((byte)[1]) >> 5) & 1)] \ 59 & (1 << (((byte)[1]) & 0x1F))) 60 61/* A 3 byte UTF-8 representation splits the characters 16 bits between 62 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 63 into pages, 3 bits to add to that index and 5 bits to generate the 64 mask. 65*/ 66#define UTF8_GET_NAMING3(pages, byte) \ 67 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 68 + ((((byte)[1]) >> 2) & 0xF)] \ 69 << 3) \ 70 + ((((byte)[1]) & 3) << 1) \ 71 + ((((byte)[2]) >> 5) & 1)] \ 72 & (1 << (((byte)[2]) & 0x1F))) 73 74#define UTF8_GET_NAMING(pages, p, n) \ 75 ((n) == 2 \ 76 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 77 : ((n) == 3 \ 78 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 79 : 0)) 80 81/* Detection of invalid UTF-8 sequences is based on Table 3.1B 82 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 83 with the additional restriction of not allowing the Unicode 84 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 85 Implementation details: 86 (A & 0x80) == 0 means A < 0x80 87 and 88 (A & 0xC0) == 0xC0 means A > 0xBF 89*/ 90 91#define UTF8_INVALID2(p) \ 92 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 93 94#define UTF8_INVALID3(p) \ 95 (((p)[2] & 0x80) == 0 \ 96 || \ 97 ((*p) == 0xEF && (p)[1] == 0xBF \ 98 ? \ 99 (p)[2] > 0xBD \ 100 : \ 101 ((p)[2] & 0xC0) == 0xC0) \ 102 || \ 103 ((*p) == 0xE0 \ 104 ? \ 105 (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 106 : \ 107 ((p)[1] & 0x80) == 0 \ 108 || \ 109 ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 110 111#define UTF8_INVALID4(p) \ 112 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ 113 || \ 114 ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ 115 || \ 116 ((*p) == 0xF0 \ 117 ? \ 118 (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 119 : \ 120 ((p)[1] & 0x80) == 0 \ 121 || \ 122 ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 123 124static int PTRFASTCALL 125isNever(const ENCODING *enc, const char *p) 126{ 127 return 0; 128} 129 130static int PTRFASTCALL 131utf8_isName2(const ENCODING *enc, const char *p) 132{ 133 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 134} 135 136static int PTRFASTCALL 137utf8_isName3(const ENCODING *enc, const char *p) 138{ 139 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 140} 141 142#define utf8_isName4 isNever 143 144static int PTRFASTCALL 145utf8_isNmstrt2(const ENCODING *enc, const char *p) 146{ 147 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 148} 149 150static int PTRFASTCALL 151utf8_isNmstrt3(const ENCODING *enc, const char *p) 152{ 153 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 154} 155 156#define utf8_isNmstrt4 isNever 157 158static int PTRFASTCALL 159utf8_isInvalid2(const ENCODING *enc, const char *p) 160{ 161 return UTF8_INVALID2((const unsigned char *)p); 162} 163 164static int PTRFASTCALL 165utf8_isInvalid3(const ENCODING *enc, const char *p) 166{ 167 return UTF8_INVALID3((const unsigned char *)p); 168} 169 170static int PTRFASTCALL 171utf8_isInvalid4(const ENCODING *enc, const char *p) 172{ 173 return UTF8_INVALID4((const unsigned char *)p); 174} 175 176struct normal_encoding { 177 ENCODING enc; 178 unsigned char type[256]; 179#ifdef XML_MIN_SIZE 180 int (PTRFASTCALL *byteType)(const ENCODING *, const char *); 181 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 182 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 183 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 184 int (PTRCALL *charMatches)(const ENCODING *, const char *, int); 185#endif /* XML_MIN_SIZE */ 186 int (PTRFASTCALL *isName2)(const ENCODING *, const char *); 187 int (PTRFASTCALL *isName3)(const ENCODING *, const char *); 188 int (PTRFASTCALL *isName4)(const ENCODING *, const char *); 189 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 190 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 191 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 192 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 193 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 194 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 195}; 196 197#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) 198 199#ifdef XML_MIN_SIZE 200 201#define STANDARD_VTABLE(E) \ 202 E ## byteType, \ 203 E ## isNameMin, \ 204 E ## isNmstrtMin, \ 205 E ## byteToAscii, \ 206 E ## charMatches, 207 208#else 209 210#define STANDARD_VTABLE(E) /* as nothing */ 211 212#endif 213 214#define NORMAL_VTABLE(E) \ 215 E ## isName2, \ 216 E ## isName3, \ 217 E ## isName4, \ 218 E ## isNmstrt2, \ 219 E ## isNmstrt3, \ 220 E ## isNmstrt4, \ 221 E ## isInvalid2, \ 222 E ## isInvalid3, \ 223 E ## isInvalid4 224 225static int FASTCALL checkCharRefNumber(int); 226 227#include "xmltok_impl.h" 228#include "ascii.h" 229 230#ifdef XML_MIN_SIZE 231#define sb_isNameMin isNever 232#define sb_isNmstrtMin isNever 233#endif 234 235#ifdef XML_MIN_SIZE 236#define MINBPC(enc) ((enc)->minBytesPerChar) 237#else 238/* minimum bytes per character */ 239#define MINBPC(enc) 1 240#endif 241 242#define SB_BYTE_TYPE(enc, p) \ 243 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 244 245#ifdef XML_MIN_SIZE 246static int PTRFASTCALL 247sb_byteType(const ENCODING *enc, const char *p) 248{ 249 return SB_BYTE_TYPE(enc, p); 250} 251#define BYTE_TYPE(enc, p) \ 252 (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 253#else 254#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 255#endif 256 257#ifdef XML_MIN_SIZE 258#define BYTE_TO_ASCII(enc, p) \ 259 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 260static int PTRFASTCALL 261sb_byteToAscii(const ENCODING *enc, const char *p) 262{ 263 return *p; 264} 265#else 266#define BYTE_TO_ASCII(enc, p) (*(p)) 267#endif 268 269#define IS_NAME_CHAR(enc, p, n) \ 270 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 271#define IS_NMSTRT_CHAR(enc, p, n) \ 272 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 273#define IS_INVALID_CHAR(enc, p, n) \ 274 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 275 276#ifdef XML_MIN_SIZE 277#define IS_NAME_CHAR_MINBPC(enc, p) \ 278 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 279#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 280 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 281#else 282#define IS_NAME_CHAR_MINBPC(enc, p) (0) 283#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 284#endif 285 286#ifdef XML_MIN_SIZE 287#define CHAR_MATCHES(enc, p, c) \ 288 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 289static int PTRCALL 290sb_charMatches(const ENCODING *enc, const char *p, int c) 291{ 292 return *p == c; 293} 294#else 295/* c is an ASCII character */ 296#define CHAR_MATCHES(enc, p, c) (*(p) == c) 297#endif 298 299#define PREFIX(ident) normal_ ## ident 300#define XML_TOK_IMPL_C 301#include "xmltok_impl.c" 302#undef XML_TOK_IMPL_C 303 304#undef MINBPC 305#undef BYTE_TYPE 306#undef BYTE_TO_ASCII 307#undef CHAR_MATCHES 308#undef IS_NAME_CHAR 309#undef IS_NAME_CHAR_MINBPC 310#undef IS_NMSTRT_CHAR 311#undef IS_NMSTRT_CHAR_MINBPC 312#undef IS_INVALID_CHAR 313 314enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 315 UTF8_cval1 = 0x00, 316 UTF8_cval2 = 0xc0, 317 UTF8_cval3 = 0xe0, 318 UTF8_cval4 = 0xf0 319}; 320 321static enum XML_Convert_Result PTRCALL 322utf8_toUtf8(const ENCODING *enc, 323 const char **fromP, const char *fromLim, 324 char **toP, const char *toLim) 325{ 326 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 327 char *to; 328 const char *from; 329 if (fromLim - *fromP > toLim - *toP) { 330 /* Avoid copying partial characters. */ 331 res = XML_CONVERT_OUTPUT_EXHAUSTED; 332 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) 333 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) 334 break; 335 } 336 for (to = *toP, from = *fromP; (from < fromLim) && (to < toLim); from++, to++) 337 *to = *from; 338 *fromP = from; 339 *toP = to; 340 341 if ((to == toLim) && (from < fromLim)) 342 return XML_CONVERT_OUTPUT_EXHAUSTED; 343 else 344 return res; 345} 346 347static enum XML_Convert_Result PTRCALL 348utf8_toUtf16(const ENCODING *enc, 349 const char **fromP, const char *fromLim, 350 unsigned short **toP, const unsigned short *toLim) 351{ 352 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 353 unsigned short *to = *toP; 354 const char *from = *fromP; 355 while (from < fromLim && to < toLim) { 356 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 357 case BT_LEAD2: 358 if (fromLim - from < 2) { 359 res = XML_CONVERT_INPUT_INCOMPLETE; 360 break; 361 } 362 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 363 from += 2; 364 break; 365 case BT_LEAD3: 366 if (fromLim - from < 3) { 367 res = XML_CONVERT_INPUT_INCOMPLETE; 368 break; 369 } 370 *to++ = (unsigned short)(((from[0] & 0xf) << 12) 371 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); 372 from += 3; 373 break; 374 case BT_LEAD4: 375 { 376 unsigned long n; 377 if (toLim - to < 2) { 378 res = XML_CONVERT_OUTPUT_EXHAUSTED; 379 goto after; 380 } 381 if (fromLim - from < 4) { 382 res = XML_CONVERT_INPUT_INCOMPLETE; 383 goto after; 384 } 385 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 386 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 387 n -= 0x10000; 388 to[0] = (unsigned short)((n >> 10) | 0xD800); 389 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 390 to += 2; 391 from += 4; 392 } 393 break; 394 default: 395 *to++ = *from++; 396 break; 397 } 398 } 399after: 400 *fromP = from; 401 *toP = to; 402 return res; 403} 404 405#ifdef XML_NS 406static const struct normal_encoding utf8_encoding_ns = { 407 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 408 { 409#include "asciitab.h" 410#include "utf8tab.h" 411 }, 412 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 413}; 414#endif 415 416static const struct normal_encoding utf8_encoding = { 417 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 418 { 419#define BT_COLON BT_NMSTRT 420#include "asciitab.h" 421#undef BT_COLON 422#include "utf8tab.h" 423 }, 424 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 425}; 426 427#ifdef XML_NS 428 429static const struct normal_encoding internal_utf8_encoding_ns = { 430 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 431 { 432#include "iasciitab.h" 433#include "utf8tab.h" 434 }, 435 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 436}; 437 438#endif 439 440static const struct normal_encoding internal_utf8_encoding = { 441 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 442 { 443#define BT_COLON BT_NMSTRT 444#include "iasciitab.h" 445#undef BT_COLON 446#include "utf8tab.h" 447 }, 448 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 449}; 450 451static enum XML_Convert_Result PTRCALL 452latin1_toUtf8(const ENCODING *enc, 453 const char **fromP, const char *fromLim, 454 char **toP, const char *toLim) 455{ 456 for (;;) { 457 unsigned char c; 458 if (*fromP == fromLim) 459 return XML_CONVERT_COMPLETED; 460 c = (unsigned char)**fromP; 461 if (c & 0x80) { 462 if (toLim - *toP < 2) 463 return XML_CONVERT_OUTPUT_EXHAUSTED; 464 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 465 *(*toP)++ = (char)((c & 0x3f) | 0x80); 466 (*fromP)++; 467 } 468 else { 469 if (*toP == toLim) 470 return XML_CONVERT_OUTPUT_EXHAUSTED; 471 *(*toP)++ = *(*fromP)++; 472 } 473 } 474} 475 476static enum XML_Convert_Result PTRCALL 477latin1_toUtf16(const ENCODING *enc, 478 const char **fromP, const char *fromLim, 479 unsigned short **toP, const unsigned short *toLim) 480{ 481 while (*fromP < fromLim && *toP < toLim) 482 *(*toP)++ = (unsigned char)*(*fromP)++; 483 484 if ((*toP == toLim) && (*fromP < fromLim)) 485 return XML_CONVERT_OUTPUT_EXHAUSTED; 486 else 487 return XML_CONVERT_COMPLETED; 488} 489 490#ifdef XML_NS 491 492static const struct normal_encoding latin1_encoding_ns = { 493 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 494 { 495#include "asciitab.h" 496#include "latin1tab.h" 497 }, 498 STANDARD_VTABLE(sb_) 499}; 500 501#endif 502 503static const struct normal_encoding latin1_encoding = { 504 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 505 { 506#define BT_COLON BT_NMSTRT 507#include "asciitab.h" 508#undef BT_COLON 509#include "latin1tab.h" 510 }, 511 STANDARD_VTABLE(sb_) 512}; 513 514static enum XML_Convert_Result PTRCALL 515ascii_toUtf8(const ENCODING *enc, 516 const char **fromP, const char *fromLim, 517 char **toP, const char *toLim) 518{ 519 while (*fromP < fromLim && *toP < toLim) 520 *(*toP)++ = *(*fromP)++; 521 522 if ((*toP == toLim) && (*fromP < fromLim)) 523 return XML_CONVERT_OUTPUT_EXHAUSTED; 524 else 525 return XML_CONVERT_COMPLETED; 526} 527 528#ifdef XML_NS 529 530static const struct normal_encoding ascii_encoding_ns = { 531 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 532 { 533#include "asciitab.h" 534/* BT_NONXML == 0 */ 535 }, 536 STANDARD_VTABLE(sb_) 537}; 538 539#endif 540 541static const struct normal_encoding ascii_encoding = { 542 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 543 { 544#define BT_COLON BT_NMSTRT 545#include "asciitab.h" 546#undef BT_COLON 547/* BT_NONXML == 0 */ 548 }, 549 STANDARD_VTABLE(sb_) 550}; 551 552static int PTRFASTCALL 553unicode_byte_type(char hi, char lo) 554{ 555 switch ((unsigned char)hi) { 556 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 557 return BT_LEAD4; 558 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 559 return BT_TRAIL; 560 case 0xFF: 561 switch ((unsigned char)lo) { 562 case 0xFF: 563 case 0xFE: 564 return BT_NONXML; 565 } 566 break; 567 } 568 return BT_NONASCII; 569} 570 571#define DEFINE_UTF16_TO_UTF8(E) \ 572static enum XML_Convert_Result PTRCALL \ 573E ## toUtf8(const ENCODING *enc, \ 574 const char **fromP, const char *fromLim, \ 575 char **toP, const char *toLim) \ 576{ \ 577 const char *from = *fromP; \ 578 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ 579 for (; from < fromLim; from += 2) { \ 580 int plane; \ 581 unsigned char lo2; \ 582 unsigned char lo = GET_LO(from); \ 583 unsigned char hi = GET_HI(from); \ 584 switch (hi) { \ 585 case 0: \ 586 if (lo < 0x80) { \ 587 if (*toP == toLim) { \ 588 *fromP = from; \ 589 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 590 } \ 591 *(*toP)++ = lo; \ 592 break; \ 593 } \ 594 /* fall through */ \ 595 case 0x1: case 0x2: case 0x3: \ 596 case 0x4: case 0x5: case 0x6: case 0x7: \ 597 if (toLim - *toP < 2) { \ 598 *fromP = from; \ 599 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 600 } \ 601 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 602 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 603 break; \ 604 default: \ 605 if (toLim - *toP < 3) { \ 606 *fromP = from; \ 607 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 608 } \ 609 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 610 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 611 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 612 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 613 break; \ 614 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 615 if (toLim - *toP < 4) { \ 616 *fromP = from; \ 617 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 618 } \ 619 if (fromLim - from < 4) { \ 620 *fromP = from; \ 621 return XML_CONVERT_INPUT_INCOMPLETE; \ 622 } \ 623 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 624 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 625 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 626 from += 2; \ 627 lo2 = GET_LO(from); \ 628 *(*toP)++ = (((lo & 0x3) << 4) \ 629 | ((GET_HI(from) & 0x3) << 2) \ 630 | (lo2 >> 6) \ 631 | 0x80); \ 632 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 633 break; \ 634 } \ 635 } \ 636 *fromP = from; \ 637 if (from < fromLim) \ 638 return XML_CONVERT_INPUT_INCOMPLETE; \ 639 else \ 640 return XML_CONVERT_COMPLETED; \ 641} 642 643#define DEFINE_UTF16_TO_UTF16(E) \ 644static enum XML_Convert_Result PTRCALL \ 645E ## toUtf16(const ENCODING *enc, \ 646 const char **fromP, const char *fromLim, \ 647 unsigned short **toP, const unsigned short *toLim) \ 648{ \ 649 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ 650 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ 651 /* Avoid copying first half only of surrogate */ \ 652 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 653 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ 654 fromLim -= 2; \ 655 res = XML_CONVERT_INPUT_INCOMPLETE; \ 656 } \ 657 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ 658 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 659 if ((*toP == toLim) && (*fromP < fromLim)) \ 660 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 661 else \ 662 return res; \ 663} 664 665#define SET2(ptr, ch) \ 666 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 667#define GET_LO(ptr) ((unsigned char)(ptr)[0]) 668#define GET_HI(ptr) ((unsigned char)(ptr)[1]) 669 670DEFINE_UTF16_TO_UTF8(little2_) 671DEFINE_UTF16_TO_UTF16(little2_) 672 673#undef SET2 674#undef GET_LO 675#undef GET_HI 676 677#define SET2(ptr, ch) \ 678 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 679#define GET_LO(ptr) ((unsigned char)(ptr)[1]) 680#define GET_HI(ptr) ((unsigned char)(ptr)[0]) 681 682DEFINE_UTF16_TO_UTF8(big2_) 683DEFINE_UTF16_TO_UTF16(big2_) 684 685#undef SET2 686#undef GET_LO 687#undef GET_HI 688 689#define LITTLE2_BYTE_TYPE(enc, p) \ 690 ((p)[1] == 0 \ 691 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 692 : unicode_byte_type((p)[1], (p)[0])) 693#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 694#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 695#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 696 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 697#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 698 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 699 700#ifdef XML_MIN_SIZE 701 702static int PTRFASTCALL 703little2_byteType(const ENCODING *enc, const char *p) 704{ 705 return LITTLE2_BYTE_TYPE(enc, p); 706} 707 708static int PTRFASTCALL 709little2_byteToAscii(const ENCODING *enc, const char *p) 710{ 711 return LITTLE2_BYTE_TO_ASCII(enc, p); 712} 713 714static int PTRCALL 715little2_charMatches(const ENCODING *enc, const char *p, int c) 716{ 717 return LITTLE2_CHAR_MATCHES(enc, p, c); 718} 719 720static int PTRFASTCALL 721little2_isNameMin(const ENCODING *enc, const char *p) 722{ 723 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); 724} 725 726static int PTRFASTCALL 727little2_isNmstrtMin(const ENCODING *enc, const char *p) 728{ 729 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); 730} 731 732#undef VTABLE 733#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 734 735#else /* not XML_MIN_SIZE */ 736 737#undef PREFIX 738#define PREFIX(ident) little2_ ## ident 739#define MINBPC(enc) 2 740/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 741#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 742#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 743#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) 744#define IS_NAME_CHAR(enc, p, n) 0 745#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) 746#define IS_NMSTRT_CHAR(enc, p, n) (0) 747#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) 748 749#define XML_TOK_IMPL_C 750#include "xmltok_impl.c" 751#undef XML_TOK_IMPL_C 752 753#undef MINBPC 754#undef BYTE_TYPE 755#undef BYTE_TO_ASCII 756#undef CHAR_MATCHES 757#undef IS_NAME_CHAR 758#undef IS_NAME_CHAR_MINBPC 759#undef IS_NMSTRT_CHAR 760#undef IS_NMSTRT_CHAR_MINBPC 761#undef IS_INVALID_CHAR 762 763#endif /* not XML_MIN_SIZE */ 764 765#ifdef XML_NS 766 767static const struct normal_encoding little2_encoding_ns = { 768 { VTABLE, 2, 0, 769#if BYTEORDER == 1234 770 1 771#else 772 0 773#endif 774 }, 775 { 776#include "asciitab.h" 777#include "latin1tab.h" 778 }, 779 STANDARD_VTABLE(little2_) 780}; 781 782#endif 783 784static const struct normal_encoding little2_encoding = { 785 { VTABLE, 2, 0, 786#if BYTEORDER == 1234 787 1 788#else 789 0 790#endif 791 }, 792 { 793#define BT_COLON BT_NMSTRT 794#include "asciitab.h" 795#undef BT_COLON 796#include "latin1tab.h" 797 }, 798 STANDARD_VTABLE(little2_) 799}; 800 801#if BYTEORDER != 4321 802 803#ifdef XML_NS 804 805static const struct normal_encoding internal_little2_encoding_ns = { 806 { VTABLE, 2, 0, 1 }, 807 { 808#include "iasciitab.h" 809#include "latin1tab.h" 810 }, 811 STANDARD_VTABLE(little2_) 812}; 813 814#endif 815 816static const struct normal_encoding internal_little2_encoding = { 817 { VTABLE, 2, 0, 1 }, 818 { 819#define BT_COLON BT_NMSTRT 820#include "iasciitab.h" 821#undef BT_COLON 822#include "latin1tab.h" 823 }, 824 STANDARD_VTABLE(little2_) 825}; 826 827#endif 828 829 830#define BIG2_BYTE_TYPE(enc, p) \ 831 ((p)[0] == 0 \ 832 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 833 : unicode_byte_type((p)[0], (p)[1])) 834#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 835#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 836#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ 837 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 838#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 839 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 840 841#ifdef XML_MIN_SIZE 842 843static int PTRFASTCALL 844big2_byteType(const ENCODING *enc, const char *p) 845{ 846 return BIG2_BYTE_TYPE(enc, p); 847} 848 849static int PTRFASTCALL 850big2_byteToAscii(const ENCODING *enc, const char *p) 851{ 852 return BIG2_BYTE_TO_ASCII(enc, p); 853} 854 855static int PTRCALL 856big2_charMatches(const ENCODING *enc, const char *p, int c) 857{ 858 return BIG2_CHAR_MATCHES(enc, p, c); 859} 860 861static int PTRFASTCALL 862big2_isNameMin(const ENCODING *enc, const char *p) 863{ 864 return BIG2_IS_NAME_CHAR_MINBPC(enc, p); 865} 866 867static int PTRFASTCALL 868big2_isNmstrtMin(const ENCODING *enc, const char *p) 869{ 870 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); 871} 872 873#undef VTABLE 874#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 875 876#else /* not XML_MIN_SIZE */ 877 878#undef PREFIX 879#define PREFIX(ident) big2_ ## ident 880#define MINBPC(enc) 2 881/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 882#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 883#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 884#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 885#define IS_NAME_CHAR(enc, p, n) 0 886#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 887#define IS_NMSTRT_CHAR(enc, p, n) (0) 888#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 889 890#define XML_TOK_IMPL_C 891#include "xmltok_impl.c" 892#undef XML_TOK_IMPL_C 893 894#undef MINBPC 895#undef BYTE_TYPE 896#undef BYTE_TO_ASCII 897#undef CHAR_MATCHES 898#undef IS_NAME_CHAR 899#undef IS_NAME_CHAR_MINBPC 900#undef IS_NMSTRT_CHAR 901#undef IS_NMSTRT_CHAR_MINBPC 902#undef IS_INVALID_CHAR 903 904#endif /* not XML_MIN_SIZE */ 905 906#ifdef XML_NS 907 908static const struct normal_encoding big2_encoding_ns = { 909 { VTABLE, 2, 0, 910#if BYTEORDER == 4321 911 1 912#else 913 0 914#endif 915 }, 916 { 917#include "asciitab.h" 918#include "latin1tab.h" 919 }, 920 STANDARD_VTABLE(big2_) 921}; 922 923#endif 924 925static const struct normal_encoding big2_encoding = { 926 { VTABLE, 2, 0, 927#if BYTEORDER == 4321 928 1 929#else 930 0 931#endif 932 }, 933 { 934#define BT_COLON BT_NMSTRT 935#include "asciitab.h" 936#undef BT_COLON 937#include "latin1tab.h" 938 }, 939 STANDARD_VTABLE(big2_) 940}; 941 942#if BYTEORDER != 1234 943 944#ifdef XML_NS 945 946static const struct normal_encoding internal_big2_encoding_ns = { 947 { VTABLE, 2, 0, 1 }, 948 { 949#include "iasciitab.h" 950#include "latin1tab.h" 951 }, 952 STANDARD_VTABLE(big2_) 953}; 954 955#endif 956 957static const struct normal_encoding internal_big2_encoding = { 958 { VTABLE, 2, 0, 1 }, 959 { 960#define BT_COLON BT_NMSTRT 961#include "iasciitab.h" 962#undef BT_COLON 963#include "latin1tab.h" 964 }, 965 STANDARD_VTABLE(big2_) 966}; 967 968#endif 969 970#undef PREFIX 971 972static int FASTCALL 973streqci(const char *s1, const char *s2) 974{ 975 for (;;) { 976 char c1 = *s1++; 977 char c2 = *s2++; 978 if (ASCII_a <= c1 && c1 <= ASCII_z) 979 c1 += ASCII_A - ASCII_a; 980 if (ASCII_a <= c2 && c2 <= ASCII_z) 981 c2 += ASCII_A - ASCII_a; 982 if (c1 != c2) 983 return 0; 984 if (!c1) 985 break; 986 } 987 return 1; 988} 989 990static void PTRCALL 991initUpdatePosition(const ENCODING *enc, const char *ptr, 992 const char *end, POSITION *pos) 993{ 994 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 995} 996 997static int 998toAscii(const ENCODING *enc, const char *ptr, const char *end) 999{ 1000 char buf[1]; 1001 char *p = buf; 1002 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1003 if (p == buf) 1004 return -1; 1005 else 1006 return buf[0]; 1007} 1008 1009static int FASTCALL 1010isSpace(int c) 1011{ 1012 switch (c) { 1013 case 0x20: 1014 case 0xD: 1015 case 0xA: 1016 case 0x9: 1017 return 1; 1018 } 1019 return 0; 1020} 1021 1022/* Return 1 if there's just optional white space or there's an S 1023 followed by name=val. 1024*/ 1025static int 1026parsePseudoAttribute(const ENCODING *enc, 1027 const char *ptr, 1028 const char *end, 1029 const char **namePtr, 1030 const char **nameEndPtr, 1031 const char **valPtr, 1032 const char **nextTokPtr) 1033{ 1034 int c; 1035 char open; 1036 if (ptr == end) { 1037 *namePtr = NULL; 1038 return 1; 1039 } 1040 if (!isSpace(toAscii(enc, ptr, end))) { 1041 *nextTokPtr = ptr; 1042 return 0; 1043 } 1044 do { 1045 ptr += enc->minBytesPerChar; 1046 } while (isSpace(toAscii(enc, ptr, end))); 1047 if (ptr == end) { 1048 *namePtr = NULL; 1049 return 1; 1050 } 1051 *namePtr = ptr; 1052 for (;;) { 1053 c = toAscii(enc, ptr, end); 1054 if (c == -1) { 1055 *nextTokPtr = ptr; 1056 return 0; 1057 } 1058 if (c == ASCII_EQUALS) { 1059 *nameEndPtr = ptr; 1060 break; 1061 } 1062 if (isSpace(c)) { 1063 *nameEndPtr = ptr; 1064 do { 1065 ptr += enc->minBytesPerChar; 1066 } while (isSpace(c = toAscii(enc, ptr, end))); 1067 if (c != ASCII_EQUALS) { 1068 *nextTokPtr = ptr; 1069 return 0; 1070 } 1071 break; 1072 } 1073 ptr += enc->minBytesPerChar; 1074 } 1075 if (ptr == *namePtr) { 1076 *nextTokPtr = ptr; 1077 return 0; 1078 } 1079 ptr += enc->minBytesPerChar; 1080 c = toAscii(enc, ptr, end); 1081 while (isSpace(c)) { 1082 ptr += enc->minBytesPerChar; 1083 c = toAscii(enc, ptr, end); 1084 } 1085 if (c != ASCII_QUOT && c != ASCII_APOS) { 1086 *nextTokPtr = ptr; 1087 return 0; 1088 } 1089 open = (char)c; 1090 ptr += enc->minBytesPerChar; 1091 *valPtr = ptr; 1092 for (;; ptr += enc->minBytesPerChar) { 1093 c = toAscii(enc, ptr, end); 1094 if (c == open) 1095 break; 1096 if (!(ASCII_a <= c && c <= ASCII_z) 1097 && !(ASCII_A <= c && c <= ASCII_Z) 1098 && !(ASCII_0 <= c && c <= ASCII_9) 1099 && c != ASCII_PERIOD 1100 && c != ASCII_MINUS 1101 && c != ASCII_UNDERSCORE) { 1102 *nextTokPtr = ptr; 1103 return 0; 1104 } 1105 } 1106 *nextTokPtr = ptr + enc->minBytesPerChar; 1107 return 1; 1108} 1109 1110static const char KW_version[] = { 1111 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0' 1112}; 1113 1114static const char KW_encoding[] = { 1115 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' 1116}; 1117 1118static const char KW_standalone[] = { 1119 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, 1120 ASCII_n, ASCII_e, '\0' 1121}; 1122 1123static const char KW_yes[] = { 1124 ASCII_y, ASCII_e, ASCII_s, '\0' 1125}; 1126 1127static const char KW_no[] = { 1128 ASCII_n, ASCII_o, '\0' 1129}; 1130 1131static int 1132doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, 1133 const char *, 1134 const char *), 1135 int isGeneralTextEntity, 1136 const ENCODING *enc, 1137 const char *ptr, 1138 const char *end, 1139 const char **badPtr, 1140 const char **versionPtr, 1141 const char **versionEndPtr, 1142 const char **encodingName, 1143 const ENCODING **encoding, 1144 int *standalone) 1145{ 1146 const char *val = NULL; 1147 const char *name = NULL; 1148 const char *nameEnd = NULL; 1149 ptr += 5 * enc->minBytesPerChar; 1150 end -= 2 * enc->minBytesPerChar; 1151 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1152 || !name) { 1153 *badPtr = ptr; 1154 return 0; 1155 } 1156 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1157 if (!isGeneralTextEntity) { 1158 *badPtr = name; 1159 return 0; 1160 } 1161 } 1162 else { 1163 if (versionPtr) 1164 *versionPtr = val; 1165 if (versionEndPtr) 1166 *versionEndPtr = ptr; 1167 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1168 *badPtr = ptr; 1169 return 0; 1170 } 1171 if (!name) { 1172 if (isGeneralTextEntity) { 1173 /* a TextDecl must have an EncodingDecl */ 1174 *badPtr = ptr; 1175 return 0; 1176 } 1177 return 1; 1178 } 1179 } 1180 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1181 int c = toAscii(enc, val, end); 1182 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { 1183 *badPtr = val; 1184 return 0; 1185 } 1186 if (encodingName) 1187 *encodingName = val; 1188 if (encoding) 1189 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1190 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1191 *badPtr = ptr; 1192 return 0; 1193 } 1194 if (!name) 1195 return 1; 1196 } 1197 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1198 || isGeneralTextEntity) { 1199 *badPtr = name; 1200 return 0; 1201 } 1202 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1203 if (standalone) 1204 *standalone = 1; 1205 } 1206 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1207 if (standalone) 1208 *standalone = 0; 1209 } 1210 else { 1211 *badPtr = val; 1212 return 0; 1213 } 1214 while (isSpace(toAscii(enc, ptr, end))) 1215 ptr += enc->minBytesPerChar; 1216 if (ptr != end) { 1217 *badPtr = ptr; 1218 return 0; 1219 } 1220 return 1; 1221} 1222 1223static int FASTCALL 1224checkCharRefNumber(int result) 1225{ 1226 switch (result >> 8) { 1227 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 1228 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 1229 return -1; 1230 case 0: 1231 if (latin1_encoding.type[result] == BT_NONXML) 1232 return -1; 1233 break; 1234 case 0xFF: 1235 if (result == 0xFFFE || result == 0xFFFF) 1236 return -1; 1237 break; 1238 } 1239 return result; 1240} 1241 1242int FASTCALL 1243XmlUtf8Encode(int c, char *buf) 1244{ 1245 enum { 1246 /* minN is minimum legal resulting value for N byte sequence */ 1247 min2 = 0x80, 1248 min3 = 0x800, 1249 min4 = 0x10000 1250 }; 1251 1252 if (c < 0) 1253 return 0; 1254 if (c < min2) { 1255 buf[0] = (char)(c | UTF8_cval1); 1256 return 1; 1257 } 1258 if (c < min3) { 1259 buf[0] = (char)((c >> 6) | UTF8_cval2); 1260 buf[1] = (char)((c & 0x3f) | 0x80); 1261 return 2; 1262 } 1263 if (c < min4) { 1264 buf[0] = (char)((c >> 12) | UTF8_cval3); 1265 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1266 buf[2] = (char)((c & 0x3f) | 0x80); 1267 return 3; 1268 } 1269 if (c < 0x110000) { 1270 buf[0] = (char)((c >> 18) | UTF8_cval4); 1271 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1272 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1273 buf[3] = (char)((c & 0x3f) | 0x80); 1274 return 4; 1275 } 1276 return 0; 1277} 1278 1279int FASTCALL 1280XmlUtf16Encode(int charNum, unsigned short *buf) 1281{ 1282 if (charNum < 0) 1283 return 0; 1284 if (charNum < 0x10000) { 1285 buf[0] = (unsigned short)charNum; 1286 return 1; 1287 } 1288 if (charNum < 0x110000) { 1289 charNum -= 0x10000; 1290 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1291 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1292 return 2; 1293 } 1294 return 0; 1295} 1296 1297struct unknown_encoding { 1298 struct normal_encoding normal; 1299 CONVERTER convert; 1300 void *userData; 1301 unsigned short utf16[256]; 1302 char utf8[256][4]; 1303}; 1304 1305#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) 1306 1307int 1308XmlSizeOfUnknownEncoding(void) 1309{ 1310 return sizeof(struct unknown_encoding); 1311} 1312 1313static int PTRFASTCALL 1314unknown_isName(const ENCODING *enc, const char *p) 1315{ 1316 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1317 int c = uenc->convert(uenc->userData, p); 1318 if (c & ~0xFFFF) 1319 return 0; 1320 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1321} 1322 1323static int PTRFASTCALL 1324unknown_isNmstrt(const ENCODING *enc, const char *p) 1325{ 1326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1327 int c = uenc->convert(uenc->userData, p); 1328 if (c & ~0xFFFF) 1329 return 0; 1330 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1331} 1332 1333static int PTRFASTCALL 1334unknown_isInvalid(const ENCODING *enc, const char *p) 1335{ 1336 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1337 int c = uenc->convert(uenc->userData, p); 1338 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1339} 1340 1341static enum XML_Convert_Result PTRCALL 1342unknown_toUtf8(const ENCODING *enc, 1343 const char **fromP, const char *fromLim, 1344 char **toP, const char *toLim) 1345{ 1346 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1347 char buf[XML_UTF8_ENCODE_MAX]; 1348 for (;;) { 1349 const char *utf8; 1350 int n; 1351 if (*fromP == fromLim) 1352 return XML_CONVERT_COMPLETED; 1353 utf8 = uenc->utf8[(unsigned char)**fromP]; 1354 n = *utf8++; 1355 if (n == 0) { 1356 int c = uenc->convert(uenc->userData, *fromP); 1357 n = XmlUtf8Encode(c, buf); 1358 if (n > toLim - *toP) 1359 return XML_CONVERT_OUTPUT_EXHAUSTED; 1360 utf8 = buf; 1361 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1362 - (BT_LEAD2 - 2)); 1363 } 1364 else { 1365 if (n > toLim - *toP) 1366 return XML_CONVERT_OUTPUT_EXHAUSTED; 1367 (*fromP)++; 1368 } 1369 do { 1370 *(*toP)++ = *utf8++; 1371 } while (--n != 0); 1372 } 1373} 1374 1375static enum XML_Convert_Result PTRCALL 1376unknown_toUtf16(const ENCODING *enc, 1377 const char **fromP, const char *fromLim, 1378 unsigned short **toP, const unsigned short *toLim) 1379{ 1380 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1381 while (*fromP < fromLim && *toP < toLim) { 1382 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1383 if (c == 0) { 1384 c = (unsigned short) 1385 uenc->convert(uenc->userData, *fromP); 1386 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1387 - (BT_LEAD2 - 2)); 1388 } 1389 else 1390 (*fromP)++; 1391 *(*toP)++ = c; 1392 } 1393 1394 if ((*toP == toLim) && (*fromP < fromLim)) 1395 return XML_CONVERT_OUTPUT_EXHAUSTED; 1396 else 1397 return XML_CONVERT_COMPLETED; 1398} 1399 1400ENCODING * 1401XmlInitUnknownEncoding(void *mem, 1402 int *table, 1403 CONVERTER convert, 1404 void *userData) 1405{ 1406 int i; 1407 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1408 for (i = 0; i < (int)sizeof(struct normal_encoding); i++) 1409 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1410 for (i = 0; i < 128; i++) 1411 if (latin1_encoding.type[i] != BT_OTHER 1412 && latin1_encoding.type[i] != BT_NONXML 1413 && table[i] != i) 1414 return 0; 1415 for (i = 0; i < 256; i++) { 1416 int c = table[i]; 1417 if (c == -1) { 1418 e->normal.type[i] = BT_MALFORM; 1419 /* This shouldn't really get used. */ 1420 e->utf16[i] = 0xFFFF; 1421 e->utf8[i][0] = 1; 1422 e->utf8[i][1] = 0; 1423 } 1424 else if (c < 0) { 1425 if (c < -4) 1426 return 0; 1427 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1428 e->utf8[i][0] = 0; 1429 e->utf16[i] = 0; 1430 } 1431 else if (c < 0x80) { 1432 if (latin1_encoding.type[c] != BT_OTHER 1433 && latin1_encoding.type[c] != BT_NONXML 1434 && c != i) 1435 return 0; 1436 e->normal.type[i] = latin1_encoding.type[c]; 1437 e->utf8[i][0] = 1; 1438 e->utf8[i][1] = (char)c; 1439 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1440 } 1441 else if (checkCharRefNumber(c) < 0) { 1442 e->normal.type[i] = BT_NONXML; 1443 /* This shouldn't really get used. */ 1444 e->utf16[i] = 0xFFFF; 1445 e->utf8[i][0] = 1; 1446 e->utf8[i][1] = 0; 1447 } 1448 else { 1449 if (c > 0xFFFF) 1450 return 0; 1451 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1452 e->normal.type[i] = BT_NMSTRT; 1453 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1454 e->normal.type[i] = BT_NAME; 1455 else 1456 e->normal.type[i] = BT_OTHER; 1457 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1458 e->utf16[i] = (unsigned short)c; 1459 } 1460 } 1461 e->userData = userData; 1462 e->convert = convert; 1463 if (convert) { 1464 e->normal.isName2 = unknown_isName; 1465 e->normal.isName3 = unknown_isName; 1466 e->normal.isName4 = unknown_isName; 1467 e->normal.isNmstrt2 = unknown_isNmstrt; 1468 e->normal.isNmstrt3 = unknown_isNmstrt; 1469 e->normal.isNmstrt4 = unknown_isNmstrt; 1470 e->normal.isInvalid2 = unknown_isInvalid; 1471 e->normal.isInvalid3 = unknown_isInvalid; 1472 e->normal.isInvalid4 = unknown_isInvalid; 1473 } 1474 e->normal.enc.utf8Convert = unknown_toUtf8; 1475 e->normal.enc.utf16Convert = unknown_toUtf16; 1476 return &(e->normal.enc); 1477} 1478 1479/* If this enumeration is changed, getEncodingIndex and encodings 1480must also be changed. */ 1481enum { 1482 UNKNOWN_ENC = -1, 1483 ISO_8859_1_ENC = 0, 1484 US_ASCII_ENC, 1485 UTF_8_ENC, 1486 UTF_16_ENC, 1487 UTF_16BE_ENC, 1488 UTF_16LE_ENC, 1489 /* must match encodingNames up to here */ 1490 NO_ENC 1491}; 1492 1493static const char KW_ISO_8859_1[] = { 1494 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, 1495 ASCII_MINUS, ASCII_1, '\0' 1496}; 1497static const char KW_US_ASCII[] = { 1498 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, 1499 '\0' 1500}; 1501static const char KW_UTF_8[] = { 1502 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' 1503}; 1504static const char KW_UTF_16[] = { 1505 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' 1506}; 1507static const char KW_UTF_16BE[] = { 1508 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, 1509 '\0' 1510}; 1511static const char KW_UTF_16LE[] = { 1512 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, 1513 '\0' 1514}; 1515 1516static int FASTCALL 1517getEncodingIndex(const char *name) 1518{ 1519 static const char * const encodingNames[] = { 1520 KW_ISO_8859_1, 1521 KW_US_ASCII, 1522 KW_UTF_8, 1523 KW_UTF_16, 1524 KW_UTF_16BE, 1525 KW_UTF_16LE, 1526 }; 1527 int i; 1528 if (name == NULL) 1529 return NO_ENC; 1530 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) 1531 if (streqci(name, encodingNames[i])) 1532 return i; 1533 return UNKNOWN_ENC; 1534} 1535 1536/* For binary compatibility, we store the index of the encoding 1537 specified at initialization in the isUtf16 member. 1538*/ 1539 1540#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1541#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1542 1543/* This is what detects the encoding. encodingTable maps from 1544 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1545 the external (protocol) specified encoding; state is 1546 XML_CONTENT_STATE if we're parsing an external text entity, and 1547 XML_PROLOG_STATE otherwise. 1548*/ 1549 1550 1551static int 1552initScan(const ENCODING * const *encodingTable, 1553 const INIT_ENCODING *enc, 1554 int state, 1555 const char *ptr, 1556 const char *end, 1557 const char **nextTokPtr) 1558{ 1559 const ENCODING **encPtr; 1560 1561 if (ptr >= end) 1562 return XML_TOK_NONE; 1563 encPtr = enc->encPtr; 1564 if (ptr + 1 == end) { 1565 /* only a single byte available for auto-detection */ 1566#ifndef XML_DTD /* FIXME */ 1567 /* a well-formed document entity must have more than one byte */ 1568 if (state != XML_CONTENT_STATE) 1569 return XML_TOK_PARTIAL; 1570#endif 1571 /* so we're parsing an external text entity... */ 1572 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1573 switch (INIT_ENC_INDEX(enc)) { 1574 case UTF_16_ENC: 1575 case UTF_16LE_ENC: 1576 case UTF_16BE_ENC: 1577 return XML_TOK_PARTIAL; 1578 } 1579 switch ((unsigned char)*ptr) { 1580 case 0xFE: 1581 case 0xFF: 1582 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1583 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1584 && state == XML_CONTENT_STATE) 1585 break; 1586 /* fall through */ 1587 case 0x00: 1588 case 0x3C: 1589 return XML_TOK_PARTIAL; 1590 } 1591 } 1592 else { 1593 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1594 case 0xFEFF: 1595 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1596 && state == XML_CONTENT_STATE) 1597 break; 1598 *nextTokPtr = ptr + 2; 1599 *encPtr = encodingTable[UTF_16BE_ENC]; 1600 return XML_TOK_BOM; 1601 /* 00 3C is handled in the default case */ 1602 case 0x3C00: 1603 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1604 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1605 && state == XML_CONTENT_STATE) 1606 break; 1607 *encPtr = encodingTable[UTF_16LE_ENC]; 1608 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1609 case 0xFFFE: 1610 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1611 && state == XML_CONTENT_STATE) 1612 break; 1613 *nextTokPtr = ptr + 2; 1614 *encPtr = encodingTable[UTF_16LE_ENC]; 1615 return XML_TOK_BOM; 1616 case 0xEFBB: 1617 /* Maybe a UTF-8 BOM (EF BB BF) */ 1618 /* If there's an explicitly specified (external) encoding 1619 of ISO-8859-1 or some flavour of UTF-16 1620 and this is an external text entity, 1621 don't look for the BOM, 1622 because it might be a legal data. 1623 */ 1624 if (state == XML_CONTENT_STATE) { 1625 int e = INIT_ENC_INDEX(enc); 1626 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC 1627 || e == UTF_16LE_ENC || e == UTF_16_ENC) 1628 break; 1629 } 1630 if (ptr + 2 == end) 1631 return XML_TOK_PARTIAL; 1632 if ((unsigned char)ptr[2] == 0xBF) { 1633 *nextTokPtr = ptr + 3; 1634 *encPtr = encodingTable[UTF_8_ENC]; 1635 return XML_TOK_BOM; 1636 } 1637 break; 1638 default: 1639 if (ptr[0] == '\0') { 1640 /* 0 isn't a legal data character. Furthermore a document 1641 entity can only start with ASCII characters. So the only 1642 way this can fail to be big-endian UTF-16 if it it's an 1643 external parsed general entity that's labelled as 1644 UTF-16LE. 1645 */ 1646 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1647 break; 1648 *encPtr = encodingTable[UTF_16BE_ENC]; 1649 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1650 } 1651 else if (ptr[1] == '\0') { 1652 /* We could recover here in the case: 1653 - parsing an external entity 1654 - second byte is 0 1655 - no externally specified encoding 1656 - no encoding declaration 1657 by assuming UTF-16LE. But we don't, because this would mean when 1658 presented just with a single byte, we couldn't reliably determine 1659 whether we needed further bytes. 1660 */ 1661 if (state == XML_CONTENT_STATE) 1662 break; 1663 *encPtr = encodingTable[UTF_16LE_ENC]; 1664 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1665 } 1666 break; 1667 } 1668 } 1669 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1670 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1671} 1672 1673 1674#define NS(x) x 1675#define ns(x) x 1676#define XML_TOK_NS_C 1677#include "xmltok_ns.c" 1678#undef XML_TOK_NS_C 1679#undef NS 1680#undef ns 1681 1682#ifdef XML_NS 1683 1684#define NS(x) x ## NS 1685#define ns(x) x ## _ns 1686 1687#define XML_TOK_NS_C 1688#include "xmltok_ns.c" 1689#undef XML_TOK_NS_C 1690 1691#undef NS 1692#undef ns 1693 1694ENCODING * 1695XmlInitUnknownEncodingNS(void *mem, 1696 int *table, 1697 CONVERTER convert, 1698 void *userData) 1699{ 1700 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1701 if (enc) 1702 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1703 return enc; 1704} 1705 1706#endif /* XML_NS */ 1707