1/** 2 * Test the UTF-8 decoding routines 3 * 4 * author: Daniel Veillard 5 * copy: see Copyright for the status of this software. 6 */ 7 8#include <stdio.h> 9#include <string.h> 10#include <libxml/parser.h> 11#include <libxml/parserInternals.h> 12 13#include "buf.h" 14 15int lastError; 16 17static void errorHandler(void *unused, xmlErrorPtr err) { 18 if ((unused == NULL) && (err != NULL) && (lastError == 0)) { 19 lastError = err->code; 20 } 21} 22 23char document1[100] = "<doc>XXXX</doc>"; 24char document2[100] = "<doc foo='XXXX'/>"; 25 26static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document, 27 int len, char *data, int forbid1, int forbid2) { 28 int i; 29 xmlDocPtr res; 30 31 for (i = 0;i <= 0xFF;i++) { 32 lastError = 0; 33 xmlCtxtReset(ctxt); 34 35 data[0] = i; 36 37 res = xmlReadMemory(document, len, "test", NULL, 0); 38 39 if ((i == forbid1) || (i == forbid2)) { 40 if ((lastError == 0) || (res != NULL)) 41 fprintf(stderr, 42 "Failed to detect invalid char for Byte 0x%02X: %c\n", 43 i, i); 44 } 45 46 else if ((i == '<') || (i == '&')) { 47 if ((lastError == 0) || (res != NULL)) 48 fprintf(stderr, 49 "Failed to detect illegal char %c for Byte 0x%02X\n", i, i); 50 } 51 else if (((i < 0x20) || (i >= 0x80)) && 52 (i != 0x9) && (i != 0xA) && (i != 0xD)) { 53 if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL)) 54 fprintf(stderr, 55 "Failed to detect invalid char for Byte 0x%02X\n", i); 56 } 57 else if (res == NULL) { 58 fprintf(stderr, 59 "Failed to parse valid char for Byte 0x%02X : %c\n", i, i); 60 } 61 if (res != NULL) 62 xmlFreeDoc(res); 63 } 64} 65 66static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document, 67 int len, char *data) { 68 int i, j; 69 xmlDocPtr res; 70 71 for (i = 0x80;i <= 0xFF;i++) { 72 for (j = 0;j <= 0xFF;j++) { 73 lastError = 0; 74 xmlCtxtReset(ctxt); 75 76 data[0] = i; 77 data[1] = j; 78 79 res = xmlReadMemory(document, len, "test", NULL, 0); 80 81 /* if first bit of first char is set, then second bit must too */ 82 if ((i & 0x80) && ((i & 0x40) == 0)) { 83 if ((lastError == 0) || (res != NULL)) 84 fprintf(stderr, 85 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 86 i, j); 87 } 88 89 /* 90 * if first bit of first char is set, then second char first 91 * bits must be 10 92 */ 93 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) { 94 if ((lastError == 0) || (res != NULL)) 95 fprintf(stderr, 96 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 97 i, j); 98 } 99 100 /* 101 * if using a 2 byte encoding then the value must be greater 102 * than 0x80, i.e. one of bits 5 to 1 of i must be set 103 */ 104 else if ((i & 0x80) && ((i & 0x1E) == 0)) { 105 if ((lastError == 0) || (res != NULL)) 106 fprintf(stderr, 107 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 108 i, j); 109 } 110 111 /* 112 * if third bit of first char is set, then the sequence would need 113 * at least 3 bytes, but we give only 2 ! 114 */ 115 else if ((i & 0xE0) == 0xE0) { 116 if ((lastError == 0) || (res != NULL)) 117 fprintf(stderr, 118 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n", 119 i, j); 120 } 121 122 /* 123 * We should see no error in remaning cases 124 */ 125 else if ((lastError != 0) || (res == NULL)) { 126 fprintf(stderr, 127 "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j); 128 } 129 if (res != NULL) 130 xmlFreeDoc(res); 131 } 132 } 133} 134 135/** 136 * testDocumentRanges: 137 * 138 * Test the correct UTF8 character parsing in context of XML documents 139 * Those are in-context injection tests checking the parser behaviour on 140 * edge case values at different point in content, beginning and end of 141 * CDATA in text or in attribute values. 142 */ 143 144static void testDocumentRanges(void) { 145 xmlParserCtxtPtr ctxt; 146 char *data; 147 148 /* 149 * Set up a parsing context using the first document as 150 * the current input source. 151 */ 152 ctxt = xmlNewParserCtxt(); 153 if (ctxt == NULL) { 154 fprintf(stderr, "Failed to allocate parser context\n"); 155 return; 156 } 157 158 printf("testing 1 byte char in document: 1"); 159 fflush(stdout); 160 data = &document1[5]; 161 data[0] = ' '; 162 data[1] = ' '; 163 data[2] = ' '; 164 data[3] = ' '; 165 /* test 1 byte injection at beginning of area */ 166 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1), 167 data, -1, -1); 168 printf(" 2"); 169 fflush(stdout); 170 data[0] = ' '; 171 data[1] = ' '; 172 data[2] = ' '; 173 data[3] = ' '; 174 /* test 1 byte injection at end of area */ 175 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1), 176 data + 3, -1, -1); 177 178 printf(" 3"); 179 fflush(stdout); 180 data = &document2[10]; 181 data[0] = ' '; 182 data[1] = ' '; 183 data[2] = ' '; 184 data[3] = ' '; 185 /* test 1 byte injection at beginning of area */ 186 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2), 187 data, '\'', -1); 188 printf(" 4"); 189 fflush(stdout); 190 data[0] = ' '; 191 data[1] = ' '; 192 data[2] = ' '; 193 data[3] = ' '; 194 /* test 1 byte injection at end of area */ 195 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2), 196 data + 3, '\'', -1); 197 printf(" done\n"); 198 199 printf("testing 2 byte char in document: 1"); 200 fflush(stdout); 201 data = &document1[5]; 202 data[0] = ' '; 203 data[1] = ' '; 204 data[2] = ' '; 205 data[3] = ' '; 206 /* test 2 byte injection at beginning of area */ 207 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1), 208 data); 209 printf(" 2"); 210 fflush(stdout); 211 data[0] = ' '; 212 data[1] = ' '; 213 data[2] = ' '; 214 data[3] = ' '; 215 /* test 2 byte injection at end of area */ 216 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1), 217 data + 2); 218 219 printf(" 3"); 220 fflush(stdout); 221 data = &document2[10]; 222 data[0] = ' '; 223 data[1] = ' '; 224 data[2] = ' '; 225 data[3] = ' '; 226 /* test 2 byte injection at beginning of area */ 227 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2), 228 data); 229 printf(" 4"); 230 fflush(stdout); 231 data[0] = ' '; 232 data[1] = ' '; 233 data[2] = ' '; 234 data[3] = ' '; 235 /* test 2 byte injection at end of area */ 236 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2), 237 data + 2); 238 printf(" done\n"); 239 240 xmlFreeParserCtxt(ctxt); 241} 242 243static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) { 244 int i = 0; 245 int len, c; 246 247 data[1] = 0; 248 data[2] = 0; 249 data[3] = 0; 250 for (i = 0;i <= 0xFF;i++) { 251 data[0] = i; 252 ctxt->charset = XML_CHAR_ENCODING_UTF8; 253 254 lastError = 0; 255 c = xmlCurrentChar(ctxt, &len); 256 if ((i == 0) || (i >= 0x80)) { 257 /* we must see an error there */ 258 if (lastError != XML_ERR_INVALID_CHAR) 259 fprintf(stderr, 260 "Failed to detect invalid char for Byte 0x%02X\n", i); 261 } else if (i == 0xD) { 262 if ((c != 0xA) || (len != 1)) 263 fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i); 264 } else if ((c != i) || (len != 1)) { 265 fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i); 266 } 267 } 268} 269 270static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) { 271 int i, j; 272 int len, c; 273 274 data[2] = 0; 275 data[3] = 0; 276 for (i = 0x80;i <= 0xFF;i++) { 277 for (j = 0;j <= 0xFF;j++) { 278 data[0] = i; 279 data[1] = j; 280 ctxt->charset = XML_CHAR_ENCODING_UTF8; 281 282 lastError = 0; 283 c = xmlCurrentChar(ctxt, &len); 284 285 /* if first bit of first char is set, then second bit must too */ 286 if ((i & 0x80) && ((i & 0x40) == 0)) { 287 if (lastError != XML_ERR_INVALID_CHAR) 288 fprintf(stderr, 289 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", 290 i, j); 291 } 292 293 /* 294 * if first bit of first char is set, then second char first 295 * bits must be 10 296 */ 297 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) { 298 if (lastError != XML_ERR_INVALID_CHAR) 299 fprintf(stderr, 300 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n", 301 i, j, c); 302 } 303 304 /* 305 * if using a 2 byte encoding then the value must be greater 306 * than 0x80, i.e. one of bits 5 to 1 of i must be set 307 */ 308 else if ((i & 0x80) && ((i & 0x1E) == 0)) { 309 if (lastError != XML_ERR_INVALID_CHAR) 310 fprintf(stderr, 311 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n", 312 i, j, c); 313 } 314 315 /* 316 * if third bit of first char is set, then the sequence would need 317 * at least 3 bytes, but we give only 2 ! 318 */ 319 else if ((i & 0xE0) == 0xE0) { 320 if (lastError != XML_ERR_INVALID_CHAR) 321 fprintf(stderr, 322 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n", 323 i, j); 324 } 325 326 /* 327 * We should see no error in remaning cases 328 */ 329 else if ((lastError != 0) || (len != 2)) { 330 fprintf(stderr, 331 "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j); 332 } 333 334 /* 335 * Finally check the value is right 336 */ 337 else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) { 338 fprintf(stderr, 339 "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n", 340 i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c); 341 } 342 } 343 } 344} 345 346static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) { 347 int i, j, k, K; 348 int len, c; 349 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF}; 350 int value; 351 352 data[3] = 0; 353 for (i = 0xE0;i <= 0xFF;i++) { 354 for (j = 0;j <= 0xFF;j++) { 355 for (k = 0;k < 6;k++) { 356 data[0] = i; 357 data[1] = j; 358 K = lows[k]; 359 data[2] = (char) K; 360 value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12); 361 ctxt->charset = XML_CHAR_ENCODING_UTF8; 362 363 lastError = 0; 364 c = xmlCurrentChar(ctxt, &len); 365 366 /* 367 * if fourth bit of first char is set, then the sequence would need 368 * at least 4 bytes, but we give only 3 ! 369 */ 370 if ((i & 0xF0) == 0xF0) { 371 if (lastError != XML_ERR_INVALID_CHAR) 372 fprintf(stderr, 373 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 374 i, j, K, data[3]); 375 } 376 377 /* 378 * The second and the third bytes must start with 10 379 */ 380 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) { 381 if (lastError != XML_ERR_INVALID_CHAR) 382 fprintf(stderr, 383 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n", 384 i, j, K); 385 } 386 387 /* 388 * if using a 3 byte encoding then the value must be greater 389 * than 0x800, i.e. one of bits 4 to 0 of i must be set or 390 * the 6th byte of data[1] must be set 391 */ 392 else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) { 393 if (lastError != XML_ERR_INVALID_CHAR) 394 fprintf(stderr, 395 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n", 396 i, j, K); 397 } 398 399 /* 400 * There are values in that range that are not allowed in XML-1.0 401 */ 402 else if (((value > 0xD7FF) && (value <0xE000)) || 403 ((value > 0xFFFD) && (value <0x10000))) { 404 if (lastError != XML_ERR_INVALID_CHAR) 405 fprintf(stderr, 406 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n", 407 value, i, j, K); 408 } 409 410 /* 411 * We should see no error in remaining cases 412 */ 413 else if ((lastError != 0) || (len != 3)) { 414 fprintf(stderr, 415 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n", 416 i, j, K); 417 } 418 419 /* 420 * Finally check the value is right 421 */ 422 else if (c != value) { 423 fprintf(stderr, 424 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n", 425 i, j, data[2], value, c); 426 } 427 } 428 } 429 } 430} 431 432static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) { 433 int i, j, k, K, l, L; 434 int len, c; 435 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF}; 436 int value; 437 438 data[4] = 0; 439 for (i = 0xF0;i <= 0xFF;i++) { 440 for (j = 0;j <= 0xFF;j++) { 441 for (k = 0;k < 6;k++) { 442 for (l = 0;l < 6;l++) { 443 data[0] = i; 444 data[1] = j; 445 K = lows[k]; 446 data[2] = (char) K; 447 L = lows[l]; 448 data[3] = (char) L; 449 value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) + 450 ((i & 0x7) << 18); 451 ctxt->charset = XML_CHAR_ENCODING_UTF8; 452 453 lastError = 0; 454 c = xmlCurrentChar(ctxt, &len); 455 456 /* 457 * if fifth bit of first char is set, then the sequence would need 458 * at least 5 bytes, but we give only 4 ! 459 */ 460 if ((i & 0xF8) == 0xF8) { 461 if (lastError != XML_ERR_INVALID_CHAR) 462 fprintf(stderr, 463 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 464 i, j, K, data[3]); 465 } 466 467 /* 468 * The second, third and fourth bytes must start with 10 469 */ 470 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) || 471 ((L & 0xC0) != 0x80)) { 472 if (lastError != XML_ERR_INVALID_CHAR) 473 fprintf(stderr, 474 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 475 i, j, K, L); 476 } 477 478 /* 479 * if using a 3 byte encoding then the value must be greater 480 * than 0x10000, i.e. one of bits 3 to 0 of i must be set or 481 * the 6 or 5th byte of j must be set 482 */ 483 else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) { 484 if (lastError != XML_ERR_INVALID_CHAR) 485 fprintf(stderr, 486 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 487 i, j, K, L); 488 } 489 490 /* 491 * There are values in that range that are not allowed in XML-1.0 492 */ 493 else if (((value > 0xD7FF) && (value <0xE000)) || 494 ((value > 0xFFFD) && (value <0x10000)) || 495 (value > 0x10FFFF)) { 496 if (lastError != XML_ERR_INVALID_CHAR) 497 fprintf(stderr, 498"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", 499 value, i, j, K, L); 500 } 501 502 /* 503 * We should see no error in remaining cases 504 */ 505 else if ((lastError != 0) || (len != 4)) { 506 fprintf(stderr, 507 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n", 508 i, j, K); 509 } 510 511 /* 512 * Finally check the value is right 513 */ 514 else if (c != value) { 515 fprintf(stderr, 516 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n", 517 i, j, data[2], value, c); 518 } 519 } 520 } 521 } 522 } 523} 524 525/** 526 * testCharRanges: 527 * 528 * Test the correct UTF8 character parsing in isolation i.e. 529 * not when parsing a full document, this is less expensive and we can 530 * cover the full range of UTF-8 chars accepted by XML-1.0 531 */ 532 533static void testCharRanges(void) { 534 char data[5]; 535 xmlParserCtxtPtr ctxt; 536 xmlParserInputBufferPtr buf; 537 xmlParserInputPtr input; 538 539 memset(data, 0, 5); 540 541 /* 542 * Set up a parsing context using the above data buffer as 543 * the current input source. 544 */ 545 ctxt = xmlNewParserCtxt(); 546 if (ctxt == NULL) { 547 fprintf(stderr, "Failed to allocate parser context\n"); 548 return; 549 } 550 buf = xmlParserInputBufferCreateStatic(data, sizeof(data), 551 XML_CHAR_ENCODING_NONE); 552 if (buf == NULL) { 553 fprintf(stderr, "Failed to allocate input buffer\n"); 554 goto error; 555 } 556 input = xmlNewInputStream(ctxt); 557 if (input == NULL) { 558 xmlFreeParserInputBuffer(buf); 559 goto error; 560 } 561 input->filename = NULL; 562 input->buf = buf; 563 input->cur = 564 input->base = xmlBufContent(input->buf->buffer); 565 input->end = input->base + 4; 566 inputPush(ctxt, input); 567 568 printf("testing char range: 1"); 569 fflush(stdout); 570 testCharRangeByte1(ctxt, data); 571 printf(" 2"); 572 fflush(stdout); 573 testCharRangeByte2(ctxt, data); 574 printf(" 3"); 575 fflush(stdout); 576 testCharRangeByte3(ctxt, data); 577 printf(" 4"); 578 fflush(stdout); 579 testCharRangeByte4(ctxt, data); 580 printf(" done\n"); 581 fflush(stdout); 582 583error: 584 xmlFreeParserCtxt(ctxt); 585} 586 587int main(void) { 588 589 /* 590 * this initialize the library and check potential ABI mismatches 591 * between the version it was compiled for and the actual shared 592 * library used. 593 */ 594 LIBXML_TEST_VERSION 595 596 /* 597 * Catch errors separately 598 */ 599 600 xmlSetStructuredErrorFunc(NULL, errorHandler); 601 602 /* 603 * Run the tests 604 */ 605 testCharRanges(); 606 testDocumentRanges(); 607 608 /* 609 * Cleanup function for the XML library. 610 */ 611 xmlCleanupParser(); 612 /* 613 * this is to debug memory for regression tests 614 */ 615 xmlMemoryDump(); 616 return(0); 617} 618