1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4****************************************************************************** 5* 6* Copyright (C) 2002-2016, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9****************************************************************************** 10* file name: ucnvbocu.cpp 11* encoding: US-ASCII 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2002mar27 16* created by: Markus W. Scherer 17* 18* This is an implementation of the Binary Ordered Compression for Unicode, 19* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ 20*/ 21 22#include "unicode/utypes.h" 23 24#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 25 26#include "unicode/ucnv.h" 27#include "unicode/ucnv_cb.h" 28#include "unicode/utf16.h" 29#include "putilimp.h" 30#include "ucnv_bld.h" 31#include "ucnv_cnv.h" 32#include "uassert.h" 33 34/* BOCU-1 constants and macros ---------------------------------------------- */ 35 36/* 37 * BOCU-1 encodes the code points of a Unicode string as 38 * a sequence of byte-encoded differences (slope detection), 39 * preserving lexical order. 40 * 41 * Optimize the difference-taking for runs of Unicode text within 42 * small scripts: 43 * 44 * Most small scripts are allocated within aligned 128-blocks of Unicode 45 * code points. Lexical order is preserved if the "previous code point" state 46 * is always moved into the middle of such a block. 47 * 48 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul 49 * areas into the middle of those areas. 50 * 51 * C0 control codes and space are encoded with their US-ASCII bytes. 52 * "prev" is reset for C0 controls but not for space. 53 */ 54 55/* initial value for "prev": middle of the ASCII range */ 56#define BOCU1_ASCII_PREV 0x40 57 58/* bounding byte values for differences */ 59#define BOCU1_MIN 0x21 60#define BOCU1_MIDDLE 0x90 61#define BOCU1_MAX_LEAD 0xfe 62#define BOCU1_MAX_TRAIL 0xff 63#define BOCU1_RESET 0xff 64 65/* number of lead bytes */ 66#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) 67 68/* adjust trail byte counts for the use of some C0 control byte values */ 69#define BOCU1_TRAIL_CONTROLS_COUNT 20 70#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) 71 72/* number of trail bytes */ 73#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) 74 75/* 76 * number of positive and negative single-byte codes 77 * (counting 0==BOCU1_MIDDLE among the positive ones) 78 */ 79#define BOCU1_SINGLE 64 80 81/* number of lead bytes for positive and negative 2/3/4-byte sequences */ 82#define BOCU1_LEAD_2 43 83#define BOCU1_LEAD_3 3 84#define BOCU1_LEAD_4 1 85 86/* The difference value range for single-byters. */ 87#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) 88#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) 89 90/* The difference value range for double-byters. */ 91#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 92#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 93 94/* The difference value range for 3-byters. */ 95#define BOCU1_REACH_POS_3 \ 96 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 97 98#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 99 100/* The lead byte start values. */ 101#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) 102#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) 103#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) 104 /* ==BOCU1_MAX_LEAD */ 105 106#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) 107#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) 108#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) 109 /* ==BOCU1_MIN+1 */ 110 111/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ 112#define BOCU1_LENGTH_FROM_LEAD(lead) \ 113 ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ 114 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ 115 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) 116 117/* The length of a byte sequence, according to its packed form. */ 118#define BOCU1_LENGTH_FROM_PACKED(packed) \ 119 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) 120 121/* 122 * 12 commonly used C0 control codes (and space) are only used to encode 123 * themselves directly, 124 * which makes BOCU-1 MIME-usable and reasonably safe for 125 * ASCII-oriented software. 126 * 127 * These controls are 128 * 0 NUL 129 * 130 * 7 BEL 131 * 8 BS 132 * 133 * 9 TAB 134 * a LF 135 * b VT 136 * c FF 137 * d CR 138 * 139 * e SO 140 * f SI 141 * 142 * 1a SUB 143 * 1b ESC 144 * 145 * The other 20 C0 controls are also encoded directly (to preserve order) 146 * but are also used as trail bytes in difference encoding 147 * (for better compression). 148 */ 149#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) 150 151/* 152 * Byte value map for control codes, 153 * from external byte values 0x00..0x20 154 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. 155 * External byte values that are illegal as trail bytes are mapped to -1. 156 */ 157static const int8_t 158bocu1ByteToTrail[BOCU1_MIN]={ 159/* 0 1 2 3 4 5 6 7 */ 160 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 161 162/* 8 9 a b c d e f */ 163 -1, -1, -1, -1, -1, -1, -1, -1, 164 165/* 10 11 12 13 14 15 16 17 */ 166 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 167 168/* 18 19 1a 1b 1c 1d 1e 1f */ 169 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, 170 171/* 20 */ 172 -1 173}; 174 175/* 176 * Byte value map for control codes, 177 * from trail byte values 0..19 (0..0x13) as used in the difference calculation 178 * to external byte values 0x00..0x20. 179 */ 180static const int8_t 181bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ 182/* 0 1 2 3 4 5 6 7 */ 183 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 184 185/* 8 9 a b c d e f */ 186 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 187 188/* 10 11 12 13 */ 189 0x1c, 0x1d, 0x1e, 0x1f 190}; 191 192/** 193 * Integer division and modulo with negative numerators 194 * yields negative modulo results and quotients that are one more than 195 * what we need here. 196 * This macro adjust the results so that the modulo-value m is always >=0. 197 * 198 * For positive n, the if() condition is always FALSE. 199 * 200 * @param n Number to be split into quotient and rest. 201 * Will be modified to contain the quotient. 202 * @param d Divisor. 203 * @param m Output variable for the rest (modulo result). 204 */ 205#define NEGDIVMOD(n, d, m) { \ 206 (m)=(n)%(d); \ 207 (n)/=(d); \ 208 if((m)<0) { \ 209 --(n); \ 210 (m)+=(d); \ 211 } \ 212} 213 214/* Faster versions of packDiff() for single-byte-encoded diff values. */ 215 216/** Is a diff value encodable in a single byte? */ 217#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) 218 219/** Encode a diff value in a single byte. */ 220#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) 221 222/** Is a diff value encodable in two bytes? */ 223#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) 224 225/* BOCU-1 implementation functions ------------------------------------------ */ 226 227#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) 228 229/** 230 * Compute the next "previous" value for differencing 231 * from the current code point. 232 * 233 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) 234 * @return "previous code point" state value 235 */ 236static inline int32_t 237bocu1Prev(int32_t c) { 238 /* compute new prev */ 239 if(/* 0x3040<=c && */ c<=0x309f) { 240 /* Hiragana is not 128-aligned */ 241 return 0x3070; 242 } else if(0x4e00<=c && c<=0x9fa5) { 243 /* CJK Unihan */ 244 return 0x4e00-BOCU1_REACH_NEG_2; 245 } else if(0xac00<=c /* && c<=0xd7a3 */) { 246 /* Korean Hangul */ 247 return (0xd7a3+0xac00)/2; 248 } else { 249 /* mostly small scripts */ 250 return BOCU1_SIMPLE_PREV(c); 251 } 252} 253 254/** Fast version of bocu1Prev() for most scripts. */ 255#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) 256 257/* 258 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. 259 * The UConverter fields are used as follows: 260 * 261 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 262 * 263 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 264 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) 265 */ 266 267/* BOCU-1-from-Unicode conversion functions --------------------------------- */ 268 269/** 270 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 271 * and return a packed integer with them. 272 * 273 * The encoding favors small absolute differences with short encodings 274 * to compress runs of same-script characters. 275 * 276 * Optimized version with unrolled loops and fewer floating-point operations 277 * than the standard packDiff(). 278 * 279 * @param diff difference value -0x10ffff..0x10ffff 280 * @return 281 * 0x010000zz for 1-byte sequence zz 282 * 0x0200yyzz for 2-byte sequence yy zz 283 * 0x03xxyyzz for 3-byte sequence xx yy zz 284 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) 285 */ 286static int32_t 287packDiff(int32_t diff) { 288 int32_t result, m; 289 290 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ 291 if(diff>=BOCU1_REACH_NEG_1) { 292 /* mostly positive differences, and single-byte negative ones */ 293#if 0 /* single-byte case handled in macros, see below */ 294 if(diff<=BOCU1_REACH_POS_1) { 295 /* single byte */ 296 return 0x01000000|(BOCU1_MIDDLE+diff); 297 } else 298#endif 299 if(diff<=BOCU1_REACH_POS_2) { 300 /* two bytes */ 301 diff-=BOCU1_REACH_POS_1+1; 302 result=0x02000000; 303 304 m=diff%BOCU1_TRAIL_COUNT; 305 diff/=BOCU1_TRAIL_COUNT; 306 result|=BOCU1_TRAIL_TO_BYTE(m); 307 308 result|=(BOCU1_START_POS_2+diff)<<8; 309 } else if(diff<=BOCU1_REACH_POS_3) { 310 /* three bytes */ 311 diff-=BOCU1_REACH_POS_2+1; 312 result=0x03000000; 313 314 m=diff%BOCU1_TRAIL_COUNT; 315 diff/=BOCU1_TRAIL_COUNT; 316 result|=BOCU1_TRAIL_TO_BYTE(m); 317 318 m=diff%BOCU1_TRAIL_COUNT; 319 diff/=BOCU1_TRAIL_COUNT; 320 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 321 322 result|=(BOCU1_START_POS_3+diff)<<16; 323 } else { 324 /* four bytes */ 325 diff-=BOCU1_REACH_POS_3+1; 326 327 m=diff%BOCU1_TRAIL_COUNT; 328 diff/=BOCU1_TRAIL_COUNT; 329 result=BOCU1_TRAIL_TO_BYTE(m); 330 331 m=diff%BOCU1_TRAIL_COUNT; 332 diff/=BOCU1_TRAIL_COUNT; 333 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 334 335 /* 336 * We know that / and % would deliver quotient 0 and rest=diff. 337 * Avoid division and modulo for performance. 338 */ 339 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; 340 341 result|=((uint32_t)BOCU1_START_POS_4)<<24; 342 } 343 } else { 344 /* two- to four-byte negative differences */ 345 if(diff>=BOCU1_REACH_NEG_2) { 346 /* two bytes */ 347 diff-=BOCU1_REACH_NEG_1; 348 result=0x02000000; 349 350 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 351 result|=BOCU1_TRAIL_TO_BYTE(m); 352 353 result|=(BOCU1_START_NEG_2+diff)<<8; 354 } else if(diff>=BOCU1_REACH_NEG_3) { 355 /* three bytes */ 356 diff-=BOCU1_REACH_NEG_2; 357 result=0x03000000; 358 359 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 360 result|=BOCU1_TRAIL_TO_BYTE(m); 361 362 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 363 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 364 365 result|=(BOCU1_START_NEG_3+diff)<<16; 366 } else { 367 /* four bytes */ 368 diff-=BOCU1_REACH_NEG_3; 369 370 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 371 result=BOCU1_TRAIL_TO_BYTE(m); 372 373 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 374 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 375 376 /* 377 * We know that NEGDIVMOD would deliver 378 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. 379 * Avoid division and modulo for performance. 380 */ 381 m=diff+BOCU1_TRAIL_COUNT; 382 result|=BOCU1_TRAIL_TO_BYTE(m)<<16; 383 384 result|=BOCU1_MIN<<24; 385 } 386 } 387 return result; 388} 389 390 391static void U_CALLCONV 392_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 393 UErrorCode *pErrorCode) { 394 UConverter *cnv; 395 const UChar *source, *sourceLimit; 396 uint8_t *target; 397 int32_t targetCapacity; 398 int32_t *offsets; 399 400 int32_t prev, c, diff; 401 402 int32_t sourceIndex, nextSourceIndex; 403 404 /* set up the local pointers */ 405 cnv=pArgs->converter; 406 source=pArgs->source; 407 sourceLimit=pArgs->sourceLimit; 408 target=(uint8_t *)pArgs->target; 409 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 410 offsets=pArgs->offsets; 411 412 /* get the converter state from UConverter */ 413 c=cnv->fromUChar32; 414 prev=(int32_t)cnv->fromUnicodeStatus; 415 if(prev==0) { 416 prev=BOCU1_ASCII_PREV; 417 } 418 419 /* sourceIndex=-1 if the current character began in the previous buffer */ 420 sourceIndex= c==0 ? 0 : -1; 421 nextSourceIndex=0; 422 423 /* conversion loop */ 424 if(c!=0 && targetCapacity>0) { 425 goto getTrail; 426 } 427 428fastSingle: 429 /* fast loop for single-byte differences */ 430 /* use only one loop counter variable, targetCapacity, not also source */ 431 diff=(int32_t)(sourceLimit-source); 432 if(targetCapacity>diff) { 433 targetCapacity=diff; 434 } 435 while(targetCapacity>0 && (c=*source)<0x3000) { 436 if(c<=0x20) { 437 if(c!=0x20) { 438 prev=BOCU1_ASCII_PREV; 439 } 440 *target++=(uint8_t)c; 441 *offsets++=nextSourceIndex++; 442 ++source; 443 --targetCapacity; 444 } else { 445 diff=c-prev; 446 if(DIFF_IS_SINGLE(diff)) { 447 prev=BOCU1_SIMPLE_PREV(c); 448 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 449 *offsets++=nextSourceIndex++; 450 ++source; 451 --targetCapacity; 452 } else { 453 break; 454 } 455 } 456 } 457 /* restore real values */ 458 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 459 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 460 461 /* regular loop for all cases */ 462 while(source<sourceLimit) { 463 if(targetCapacity>0) { 464 c=*source++; 465 ++nextSourceIndex; 466 467 if(c<=0x20) { 468 /* 469 * ISO C0 control & space: 470 * Encode directly for MIME compatibility, 471 * and reset state except for space, to not disrupt compression. 472 */ 473 if(c!=0x20) { 474 prev=BOCU1_ASCII_PREV; 475 } 476 *target++=(uint8_t)c; 477 *offsets++=sourceIndex; 478 --targetCapacity; 479 480 sourceIndex=nextSourceIndex; 481 continue; 482 } 483 484 if(U16_IS_LEAD(c)) { 485getTrail: 486 if(source<sourceLimit) { 487 /* test the following code unit */ 488 UChar trail=*source; 489 if(U16_IS_TRAIL(trail)) { 490 ++source; 491 ++nextSourceIndex; 492 c=U16_GET_SUPPLEMENTARY(c, trail); 493 } 494 } else { 495 /* no more input */ 496 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 497 break; 498 } 499 } 500 501 /* 502 * all other Unicode code points c==U+0021..U+10ffff 503 * are encoded with the difference c-prev 504 * 505 * a new prev is computed from c, 506 * placed in the middle of a 0x80-block (for most small scripts) or 507 * in the middle of the Unihan and Hangul blocks 508 * to statistically minimize the following difference 509 */ 510 diff=c-prev; 511 prev=BOCU1_PREV(c); 512 if(DIFF_IS_SINGLE(diff)) { 513 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 514 *offsets++=sourceIndex; 515 --targetCapacity; 516 sourceIndex=nextSourceIndex; 517 if(c<0x3000) { 518 goto fastSingle; 519 } 520 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 521 /* optimize 2-byte case */ 522 int32_t m; 523 524 if(diff>=0) { 525 diff-=BOCU1_REACH_POS_1+1; 526 m=diff%BOCU1_TRAIL_COUNT; 527 diff/=BOCU1_TRAIL_COUNT; 528 diff+=BOCU1_START_POS_2; 529 } else { 530 diff-=BOCU1_REACH_NEG_1; 531 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 532 diff+=BOCU1_START_NEG_2; 533 } 534 *target++=(uint8_t)diff; 535 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 536 *offsets++=sourceIndex; 537 *offsets++=sourceIndex; 538 targetCapacity-=2; 539 sourceIndex=nextSourceIndex; 540 } else { 541 int32_t length; /* will be 2..4 */ 542 543 diff=packDiff(diff); 544 length=BOCU1_LENGTH_FROM_PACKED(diff); 545 546 /* write the output character bytes from diff and length */ 547 /* from the first if in the loop we know that targetCapacity>0 */ 548 if(length<=targetCapacity) { 549 switch(length) { 550 /* each branch falls through to the next one */ 551 case 4: 552 *target++=(uint8_t)(diff>>24); 553 *offsets++=sourceIndex; 554 U_FALLTHROUGH; 555 case 3: 556 *target++=(uint8_t)(diff>>16); 557 *offsets++=sourceIndex; 558 U_FALLTHROUGH; 559 case 2: 560 *target++=(uint8_t)(diff>>8); 561 *offsets++=sourceIndex; 562 /* case 1: handled above */ 563 *target++=(uint8_t)diff; 564 *offsets++=sourceIndex; 565 U_FALLTHROUGH; 566 default: 567 /* will never occur */ 568 break; 569 } 570 targetCapacity-=length; 571 sourceIndex=nextSourceIndex; 572 } else { 573 uint8_t *charErrorBuffer; 574 575 /* 576 * We actually do this backwards here: 577 * In order to save an intermediate variable, we output 578 * first to the overflow buffer what does not fit into the 579 * regular target. 580 */ 581 /* we know that 1<=targetCapacity<length<=4 */ 582 length-=targetCapacity; 583 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 584 switch(length) { 585 /* each branch falls through to the next one */ 586 case 3: 587 *charErrorBuffer++=(uint8_t)(diff>>16); 588 U_FALLTHROUGH; 589 case 2: 590 *charErrorBuffer++=(uint8_t)(diff>>8); 591 U_FALLTHROUGH; 592 case 1: 593 *charErrorBuffer=(uint8_t)diff; 594 U_FALLTHROUGH; 595 default: 596 /* will never occur */ 597 break; 598 } 599 cnv->charErrorBufferLength=(int8_t)length; 600 601 /* now output what fits into the regular target */ 602 diff>>=8*length; /* length was reduced by targetCapacity */ 603 switch(targetCapacity) { 604 /* each branch falls through to the next one */ 605 case 3: 606 *target++=(uint8_t)(diff>>16); 607 *offsets++=sourceIndex; 608 U_FALLTHROUGH; 609 case 2: 610 *target++=(uint8_t)(diff>>8); 611 *offsets++=sourceIndex; 612 U_FALLTHROUGH; 613 case 1: 614 *target++=(uint8_t)diff; 615 *offsets++=sourceIndex; 616 U_FALLTHROUGH; 617 default: 618 /* will never occur */ 619 break; 620 } 621 622 /* target overflow */ 623 targetCapacity=0; 624 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 625 break; 626 } 627 } 628 } else { 629 /* target is full */ 630 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 631 break; 632 } 633 } 634 635 /* set the converter state back into UConverter */ 636 cnv->fromUChar32= c<0 ? -c : 0; 637 cnv->fromUnicodeStatus=(uint32_t)prev; 638 639 /* write back the updated pointers */ 640 pArgs->source=source; 641 pArgs->target=(char *)target; 642 pArgs->offsets=offsets; 643} 644 645/* 646 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. 647 * If a change is made in the original function, then either 648 * change this function the same way or 649 * re-copy the original function and remove the variables 650 * offsets, sourceIndex, and nextSourceIndex. 651 */ 652static void U_CALLCONV 653_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, 654 UErrorCode *pErrorCode) { 655 UConverter *cnv; 656 const UChar *source, *sourceLimit; 657 uint8_t *target; 658 int32_t targetCapacity; 659 660 int32_t prev, c, diff; 661 662 /* set up the local pointers */ 663 cnv=pArgs->converter; 664 source=pArgs->source; 665 sourceLimit=pArgs->sourceLimit; 666 target=(uint8_t *)pArgs->target; 667 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 668 669 /* get the converter state from UConverter */ 670 c=cnv->fromUChar32; 671 prev=(int32_t)cnv->fromUnicodeStatus; 672 if(prev==0) { 673 prev=BOCU1_ASCII_PREV; 674 } 675 676 /* conversion loop */ 677 if(c!=0 && targetCapacity>0) { 678 goto getTrail; 679 } 680 681fastSingle: 682 /* fast loop for single-byte differences */ 683 /* use only one loop counter variable, targetCapacity, not also source */ 684 diff=(int32_t)(sourceLimit-source); 685 if(targetCapacity>diff) { 686 targetCapacity=diff; 687 } 688 while(targetCapacity>0 && (c=*source)<0x3000) { 689 if(c<=0x20) { 690 if(c!=0x20) { 691 prev=BOCU1_ASCII_PREV; 692 } 693 *target++=(uint8_t)c; 694 } else { 695 diff=c-prev; 696 if(DIFF_IS_SINGLE(diff)) { 697 prev=BOCU1_SIMPLE_PREV(c); 698 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 699 } else { 700 break; 701 } 702 } 703 ++source; 704 --targetCapacity; 705 } 706 /* restore real values */ 707 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 708 709 /* regular loop for all cases */ 710 while(source<sourceLimit) { 711 if(targetCapacity>0) { 712 c=*source++; 713 714 if(c<=0x20) { 715 /* 716 * ISO C0 control & space: 717 * Encode directly for MIME compatibility, 718 * and reset state except for space, to not disrupt compression. 719 */ 720 if(c!=0x20) { 721 prev=BOCU1_ASCII_PREV; 722 } 723 *target++=(uint8_t)c; 724 --targetCapacity; 725 continue; 726 } 727 728 if(U16_IS_LEAD(c)) { 729getTrail: 730 if(source<sourceLimit) { 731 /* test the following code unit */ 732 UChar trail=*source; 733 if(U16_IS_TRAIL(trail)) { 734 ++source; 735 c=U16_GET_SUPPLEMENTARY(c, trail); 736 } 737 } else { 738 /* no more input */ 739 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 740 break; 741 } 742 } 743 744 /* 745 * all other Unicode code points c==U+0021..U+10ffff 746 * are encoded with the difference c-prev 747 * 748 * a new prev is computed from c, 749 * placed in the middle of a 0x80-block (for most small scripts) or 750 * in the middle of the Unihan and Hangul blocks 751 * to statistically minimize the following difference 752 */ 753 diff=c-prev; 754 prev=BOCU1_PREV(c); 755 if(DIFF_IS_SINGLE(diff)) { 756 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 757 --targetCapacity; 758 if(c<0x3000) { 759 goto fastSingle; 760 } 761 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 762 /* optimize 2-byte case */ 763 int32_t m; 764 765 if(diff>=0) { 766 diff-=BOCU1_REACH_POS_1+1; 767 m=diff%BOCU1_TRAIL_COUNT; 768 diff/=BOCU1_TRAIL_COUNT; 769 diff+=BOCU1_START_POS_2; 770 } else { 771 diff-=BOCU1_REACH_NEG_1; 772 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 773 diff+=BOCU1_START_NEG_2; 774 } 775 *target++=(uint8_t)diff; 776 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 777 targetCapacity-=2; 778 } else { 779 int32_t length; /* will be 2..4 */ 780 781 diff=packDiff(diff); 782 length=BOCU1_LENGTH_FROM_PACKED(diff); 783 784 /* write the output character bytes from diff and length */ 785 /* from the first if in the loop we know that targetCapacity>0 */ 786 if(length<=targetCapacity) { 787 switch(length) { 788 /* each branch falls through to the next one */ 789 case 4: 790 *target++=(uint8_t)(diff>>24); 791 U_FALLTHROUGH; 792 case 3: 793 *target++=(uint8_t)(diff>>16); 794 /* case 2: handled above */ 795 *target++=(uint8_t)(diff>>8); 796 /* case 1: handled above */ 797 *target++=(uint8_t)diff; 798 U_FALLTHROUGH; 799 default: 800 /* will never occur */ 801 break; 802 } 803 targetCapacity-=length; 804 } else { 805 uint8_t *charErrorBuffer; 806 807 /* 808 * We actually do this backwards here: 809 * In order to save an intermediate variable, we output 810 * first to the overflow buffer what does not fit into the 811 * regular target. 812 */ 813 /* we know that 1<=targetCapacity<length<=4 */ 814 length-=targetCapacity; 815 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 816 switch(length) { 817 /* each branch falls through to the next one */ 818 case 3: 819 *charErrorBuffer++=(uint8_t)(diff>>16); 820 U_FALLTHROUGH; 821 case 2: 822 *charErrorBuffer++=(uint8_t)(diff>>8); 823 U_FALLTHROUGH; 824 case 1: 825 *charErrorBuffer=(uint8_t)diff; 826 U_FALLTHROUGH; 827 default: 828 /* will never occur */ 829 break; 830 } 831 cnv->charErrorBufferLength=(int8_t)length; 832 833 /* now output what fits into the regular target */ 834 diff>>=8*length; /* length was reduced by targetCapacity */ 835 switch(targetCapacity) { 836 /* each branch falls through to the next one */ 837 case 3: 838 *target++=(uint8_t)(diff>>16); 839 U_FALLTHROUGH; 840 case 2: 841 *target++=(uint8_t)(diff>>8); 842 U_FALLTHROUGH; 843 case 1: 844 *target++=(uint8_t)diff; 845 U_FALLTHROUGH; 846 default: 847 /* will never occur */ 848 break; 849 } 850 851 /* target overflow */ 852 targetCapacity=0; 853 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 854 break; 855 } 856 } 857 } else { 858 /* target is full */ 859 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 860 break; 861 } 862 } 863 864 /* set the converter state back into UConverter */ 865 cnv->fromUChar32= c<0 ? -c : 0; 866 cnv->fromUnicodeStatus=(uint32_t)prev; 867 868 /* write back the updated pointers */ 869 pArgs->source=source; 870 pArgs->target=(char *)target; 871} 872 873/* BOCU-1-to-Unicode conversion functions ----------------------------------- */ 874 875/** 876 * Function for BOCU-1 decoder; handles multi-byte lead bytes. 877 * 878 * @param b lead byte; 879 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD 880 * @return (diff<<2)|count 881 */ 882static inline int32_t 883decodeBocu1LeadByte(int32_t b) { 884 int32_t diff, count; 885 886 if(b>=BOCU1_START_NEG_2) { 887 /* positive difference */ 888 if(b<BOCU1_START_POS_3) { 889 /* two bytes */ 890 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 891 count=1; 892 } else if(b<BOCU1_START_POS_4) { 893 /* three bytes */ 894 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; 895 count=2; 896 } else { 897 /* four bytes */ 898 diff=BOCU1_REACH_POS_3+1; 899 count=3; 900 } 901 } else { 902 /* negative difference */ 903 if(b>=BOCU1_START_NEG_3) { 904 /* two bytes */ 905 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 906 count=1; 907 } else if(b>BOCU1_MIN) { 908 /* three bytes */ 909 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; 910 count=2; 911 } else { 912 /* four bytes */ 913 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 914 count=3; 915 } 916 } 917 918 /* return the state for decoding the trail byte(s) */ 919 return (diff<<2)|count; 920} 921 922/** 923 * Function for BOCU-1 decoder; handles multi-byte trail bytes. 924 * 925 * @param count number of remaining trail bytes including this one 926 * @param b trail byte 927 * @return new delta for diff including b - <0 indicates an error 928 * 929 * @see decodeBocu1 930 */ 931static inline int32_t 932decodeBocu1TrailByte(int32_t count, int32_t b) { 933 if(b<=0x20) { 934 /* skip some C0 controls and make the trail byte range contiguous */ 935 b=bocu1ByteToTrail[b]; 936 /* b<0 for an illegal trail byte value will result in return<0 below */ 937#if BOCU1_MAX_TRAIL<0xff 938 } else if(b>BOCU1_MAX_TRAIL) { 939 return -99; 940#endif 941 } else { 942 b-=BOCU1_TRAIL_BYTE_OFFSET; 943 } 944 945 /* add trail byte into difference and decrement count */ 946 if(count==1) { 947 return b; 948 } else if(count==2) { 949 return b*BOCU1_TRAIL_COUNT; 950 } else /* count==3 */ { 951 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); 952 } 953} 954 955static void U_CALLCONV 956_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 957 UErrorCode *pErrorCode) { 958 UConverter *cnv; 959 const uint8_t *source, *sourceLimit; 960 UChar *target; 961 const UChar *targetLimit; 962 int32_t *offsets; 963 964 int32_t prev, count, diff, c; 965 966 int8_t byteIndex; 967 uint8_t *bytes; 968 969 int32_t sourceIndex, nextSourceIndex; 970 971 /* set up the local pointers */ 972 cnv=pArgs->converter; 973 source=(const uint8_t *)pArgs->source; 974 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 975 target=pArgs->target; 976 targetLimit=pArgs->targetLimit; 977 offsets=pArgs->offsets; 978 979 /* get the converter state from UConverter */ 980 prev=(int32_t)cnv->toUnicodeStatus; 981 if(prev==0) { 982 prev=BOCU1_ASCII_PREV; 983 } 984 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 985 count=diff&3; 986 diff>>=2; 987 988 byteIndex=cnv->toULength; 989 bytes=cnv->toUBytes; 990 991 /* sourceIndex=-1 if the current character began in the previous buffer */ 992 sourceIndex=byteIndex==0 ? 0 : -1; 993 nextSourceIndex=0; 994 995 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 996 if(count>0 && byteIndex>0 && target<targetLimit) { 997 goto getTrail; 998 } 999 1000fastSingle: 1001 /* fast loop for single-byte differences */ 1002 /* use count as the only loop counter variable */ 1003 diff=(int32_t)(sourceLimit-source); 1004 count=(int32_t)(pArgs->targetLimit-target); 1005 if(count>diff) { 1006 count=diff; 1007 } 1008 while(count>0) { 1009 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1010 c=prev+(c-BOCU1_MIDDLE); 1011 if(c<0x3000) { 1012 *target++=(UChar)c; 1013 *offsets++=nextSourceIndex++; 1014 prev=BOCU1_SIMPLE_PREV(c); 1015 } else { 1016 break; 1017 } 1018 } else if(c<=0x20) { 1019 if(c!=0x20) { 1020 prev=BOCU1_ASCII_PREV; 1021 } 1022 *target++=(UChar)c; 1023 *offsets++=nextSourceIndex++; 1024 } else { 1025 break; 1026 } 1027 ++source; 1028 --count; 1029 } 1030 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 1031 1032 /* decode a sequence of single and lead bytes */ 1033 while(source<sourceLimit) { 1034 if(target>=targetLimit) { 1035 /* target is full */ 1036 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1037 break; 1038 } 1039 1040 ++nextSourceIndex; 1041 c=*source++; 1042 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1043 /* Write a code point directly from a single-byte difference. */ 1044 c=prev+(c-BOCU1_MIDDLE); 1045 if(c<0x3000) { 1046 *target++=(UChar)c; 1047 *offsets++=sourceIndex; 1048 prev=BOCU1_SIMPLE_PREV(c); 1049 sourceIndex=nextSourceIndex; 1050 goto fastSingle; 1051 } 1052 } else if(c<=0x20) { 1053 /* 1054 * Direct-encoded C0 control code or space. 1055 * Reset prev for C0 control codes but not for space. 1056 */ 1057 if(c!=0x20) { 1058 prev=BOCU1_ASCII_PREV; 1059 } 1060 *target++=(UChar)c; 1061 *offsets++=sourceIndex; 1062 sourceIndex=nextSourceIndex; 1063 continue; 1064 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1065 /* Optimize two-byte case. */ 1066 if(c>=BOCU1_MIDDLE) { 1067 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1068 } else { 1069 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1070 } 1071 1072 /* trail byte */ 1073 ++nextSourceIndex; 1074 c=decodeBocu1TrailByte(1, *source++); 1075 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1076 bytes[0]=source[-2]; 1077 bytes[1]=source[-1]; 1078 byteIndex=2; 1079 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1080 break; 1081 } 1082 } else if(c==BOCU1_RESET) { 1083 /* only reset the state, no code point */ 1084 prev=BOCU1_ASCII_PREV; 1085 sourceIndex=nextSourceIndex; 1086 continue; 1087 } else { 1088 /* 1089 * For multi-byte difference lead bytes, set the decoder state 1090 * with the partial difference value from the lead byte and 1091 * with the number of trail bytes. 1092 */ 1093 bytes[0]=(uint8_t)c; 1094 byteIndex=1; 1095 1096 diff=decodeBocu1LeadByte(c); 1097 count=diff&3; 1098 diff>>=2; 1099getTrail: 1100 for(;;) { 1101 if(source>=sourceLimit) { 1102 goto endloop; 1103 } 1104 ++nextSourceIndex; 1105 c=bytes[byteIndex++]=*source++; 1106 1107 /* trail byte in any position */ 1108 c=decodeBocu1TrailByte(count, c); 1109 if(c<0) { 1110 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1111 goto endloop; 1112 } 1113 1114 diff+=c; 1115 if(--count==0) { 1116 /* final trail byte, deliver a code point */ 1117 byteIndex=0; 1118 c=prev+diff; 1119 if((uint32_t)c>0x10ffff) { 1120 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1121 goto endloop; 1122 } 1123 break; 1124 } 1125 } 1126 } 1127 1128 /* calculate the next prev and output c */ 1129 prev=BOCU1_PREV(c); 1130 if(c<=0xffff) { 1131 *target++=(UChar)c; 1132 *offsets++=sourceIndex; 1133 } else { 1134 /* output surrogate pair */ 1135 *target++=U16_LEAD(c); 1136 if(target<targetLimit) { 1137 *target++=U16_TRAIL(c); 1138 *offsets++=sourceIndex; 1139 *offsets++=sourceIndex; 1140 } else { 1141 /* target overflow */ 1142 *offsets++=sourceIndex; 1143 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1144 cnv->UCharErrorBufferLength=1; 1145 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1146 break; 1147 } 1148 } 1149 sourceIndex=nextSourceIndex; 1150 } 1151endloop: 1152 1153 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1154 /* set the converter state in UConverter to deal with the next character */ 1155 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1156 cnv->mode=0; 1157 } else { 1158 /* set the converter state back into UConverter */ 1159 cnv->toUnicodeStatus=(uint32_t)prev; 1160 cnv->mode=(diff<<2)|count; 1161 } 1162 cnv->toULength=byteIndex; 1163 1164 /* write back the updated pointers */ 1165 pArgs->source=(const char *)source; 1166 pArgs->target=target; 1167 pArgs->offsets=offsets; 1168 return; 1169} 1170 1171/* 1172 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. 1173 * If a change is made in the original function, then either 1174 * change this function the same way or 1175 * re-copy the original function and remove the variables 1176 * offsets, sourceIndex, and nextSourceIndex. 1177 */ 1178static void U_CALLCONV 1179_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, 1180 UErrorCode *pErrorCode) { 1181 UConverter *cnv; 1182 const uint8_t *source, *sourceLimit; 1183 UChar *target; 1184 const UChar *targetLimit; 1185 1186 int32_t prev, count, diff, c; 1187 1188 int8_t byteIndex; 1189 uint8_t *bytes; 1190 1191 /* set up the local pointers */ 1192 cnv=pArgs->converter; 1193 source=(const uint8_t *)pArgs->source; 1194 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1195 target=pArgs->target; 1196 targetLimit=pArgs->targetLimit; 1197 1198 /* get the converter state from UConverter */ 1199 prev=(int32_t)cnv->toUnicodeStatus; 1200 if(prev==0) { 1201 prev=BOCU1_ASCII_PREV; 1202 } 1203 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 1204 count=diff&3; 1205 diff>>=2; 1206 1207 byteIndex=cnv->toULength; 1208 bytes=cnv->toUBytes; 1209 1210 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 1211 if(count>0 && byteIndex>0 && target<targetLimit) { 1212 goto getTrail; 1213 } 1214 1215fastSingle: 1216 /* fast loop for single-byte differences */ 1217 /* use count as the only loop counter variable */ 1218 diff=(int32_t)(sourceLimit-source); 1219 count=(int32_t)(pArgs->targetLimit-target); 1220 if(count>diff) { 1221 count=diff; 1222 } 1223 while(count>0) { 1224 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1225 c=prev+(c-BOCU1_MIDDLE); 1226 if(c<0x3000) { 1227 *target++=(UChar)c; 1228 prev=BOCU1_SIMPLE_PREV(c); 1229 } else { 1230 break; 1231 } 1232 } else if(c<=0x20) { 1233 if(c!=0x20) { 1234 prev=BOCU1_ASCII_PREV; 1235 } 1236 *target++=(UChar)c; 1237 } else { 1238 break; 1239 } 1240 ++source; 1241 --count; 1242 } 1243 1244 /* decode a sequence of single and lead bytes */ 1245 while(source<sourceLimit) { 1246 if(target>=targetLimit) { 1247 /* target is full */ 1248 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1249 break; 1250 } 1251 1252 c=*source++; 1253 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1254 /* Write a code point directly from a single-byte difference. */ 1255 c=prev+(c-BOCU1_MIDDLE); 1256 if(c<0x3000) { 1257 *target++=(UChar)c; 1258 prev=BOCU1_SIMPLE_PREV(c); 1259 goto fastSingle; 1260 } 1261 } else if(c<=0x20) { 1262 /* 1263 * Direct-encoded C0 control code or space. 1264 * Reset prev for C0 control codes but not for space. 1265 */ 1266 if(c!=0x20) { 1267 prev=BOCU1_ASCII_PREV; 1268 } 1269 *target++=(UChar)c; 1270 continue; 1271 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1272 /* Optimize two-byte case. */ 1273 if(c>=BOCU1_MIDDLE) { 1274 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1275 } else { 1276 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1277 } 1278 1279 /* trail byte */ 1280 c=decodeBocu1TrailByte(1, *source++); 1281 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1282 bytes[0]=source[-2]; 1283 bytes[1]=source[-1]; 1284 byteIndex=2; 1285 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1286 break; 1287 } 1288 } else if(c==BOCU1_RESET) { 1289 /* only reset the state, no code point */ 1290 prev=BOCU1_ASCII_PREV; 1291 continue; 1292 } else { 1293 /* 1294 * For multi-byte difference lead bytes, set the decoder state 1295 * with the partial difference value from the lead byte and 1296 * with the number of trail bytes. 1297 */ 1298 bytes[0]=(uint8_t)c; 1299 byteIndex=1; 1300 1301 diff=decodeBocu1LeadByte(c); 1302 count=diff&3; 1303 diff>>=2; 1304getTrail: 1305 for(;;) { 1306 if(source>=sourceLimit) { 1307 goto endloop; 1308 } 1309 c=bytes[byteIndex++]=*source++; 1310 1311 /* trail byte in any position */ 1312 c=decodeBocu1TrailByte(count, c); 1313 if(c<0) { 1314 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1315 goto endloop; 1316 } 1317 1318 diff+=c; 1319 if(--count==0) { 1320 /* final trail byte, deliver a code point */ 1321 byteIndex=0; 1322 c=prev+diff; 1323 if((uint32_t)c>0x10ffff) { 1324 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1325 goto endloop; 1326 } 1327 break; 1328 } 1329 } 1330 } 1331 1332 /* calculate the next prev and output c */ 1333 prev=BOCU1_PREV(c); 1334 if(c<=0xffff) { 1335 *target++=(UChar)c; 1336 } else { 1337 /* output surrogate pair */ 1338 *target++=U16_LEAD(c); 1339 if(target<targetLimit) { 1340 *target++=U16_TRAIL(c); 1341 } else { 1342 /* target overflow */ 1343 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1344 cnv->UCharErrorBufferLength=1; 1345 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1346 break; 1347 } 1348 } 1349 } 1350endloop: 1351 1352 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1353 /* set the converter state in UConverter to deal with the next character */ 1354 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1355 cnv->mode=0; 1356 } else { 1357 /* set the converter state back into UConverter */ 1358 cnv->toUnicodeStatus=(uint32_t)prev; 1359 cnv->mode=(diff<<2)|count; 1360 } 1361 cnv->toULength=byteIndex; 1362 1363 /* write back the updated pointers */ 1364 pArgs->source=(const char *)source; 1365 pArgs->target=target; 1366 return; 1367} 1368 1369/* miscellaneous ------------------------------------------------------------ */ 1370 1371static const UConverterImpl _Bocu1Impl={ 1372 UCNV_BOCU1, 1373 1374 NULL, 1375 NULL, 1376 1377 NULL, 1378 NULL, 1379 NULL, 1380 1381 _Bocu1ToUnicode, 1382 _Bocu1ToUnicodeWithOffsets, 1383 _Bocu1FromUnicode, 1384 _Bocu1FromUnicodeWithOffsets, 1385 NULL, 1386 1387 NULL, 1388 NULL, 1389 NULL, 1390 NULL, 1391 ucnv_getCompleteUnicodeSet, 1392 1393 NULL, 1394 NULL 1395}; 1396 1397static const UConverterStaticData _Bocu1StaticData={ 1398 sizeof(UConverterStaticData), 1399 "BOCU-1", 1400 1214, /* CCSID for BOCU-1 */ 1401 UCNV_IBM, UCNV_BOCU1, 1402 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ 1403 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ 1404 FALSE, FALSE, 1405 0, 1406 0, 1407 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1408}; 1409 1410const UConverterSharedData _Bocu1Data= 1411 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); 1412 1413#endif 1414