1/* 2********************************************************************** 3* Copyright (C) 2002-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* file name: ucnv_u7.c 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* created on: 2002jul01 12* created by: Markus W. Scherer 13* 14* UTF-7 converter implementation. Used to be in ucnv_utf.c. 15*/ 16 17#include "unicode/utypes.h" 18 19#if !UCONFIG_NO_CONVERSION 20 21#include "unicode/ucnv.h" 22#include "ucnv_bld.h" 23#include "ucnv_cnv.h" 24#include "uassert.h" 25 26/* UTF-7 -------------------------------------------------------------------- */ 27 28/* 29 * UTF-7 is a stateful encoding of Unicode. 30 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt) 31 * It was intended for use in Internet email systems, using in its bytewise 32 * encoding only a subset of 7-bit US-ASCII. 33 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still 34 * occasionally used. 35 * 36 * For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII 37 * characters directly or in base64. Especially, the characters in set O 38 * as defined in the RFC (see below) may be encoded directly but are not 39 * allowed in, e.g., email headers. 40 * By default, the ICU UTF-7 converter encodes set O directly. 41 * By choosing the option "version=1", set O will be escaped instead. 42 * For example: 43 * utf7Converter=ucnv_open("UTF-7,version=1"); 44 * 45 * For details about email headers see RFC 2047. 46 */ 47 48/* 49 * Tests for US-ASCII characters belonging to character classes 50 * defined in UTF-7. 51 * 52 * Set D (directly encoded characters) consists of the following 53 * characters: the upper and lower case letters A through Z 54 * and a through z, the 10 digits 0-9, and the following nine special 55 * characters (note that "+" and "=" are omitted): 56 * '(),-./:? 57 * 58 * Set O (optional direct characters) consists of the following 59 * characters (note that "\" and "~" are omitted): 60 * !"#$%&*;<=>@[]^_`{|} 61 * 62 * According to the rules in RFC 2152, the byte values for the following 63 * US-ASCII characters are not used in UTF-7 and are therefore illegal: 64 * - all C0 control codes except for CR LF TAB 65 * - BACKSLASH 66 * - TILDE 67 * - DEL 68 * - all codes beyond US-ASCII, i.e. all >127 69 */ 70#define inSetD(c) \ 71 ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \ 72 (uint8_t)((c)-48)<10 || /* digits */ \ 73 (uint8_t)((c)-39)<3 || /* '() */ \ 74 (uint8_t)((c)-44)<4 || /* ,-./ */ \ 75 (c)==58 || (c)==63 /* :? */ \ 76 ) 77 78#define inSetO(c) \ 79 ((uint8_t)((c)-33)<6 || /* !"#$%& */ \ 80 (uint8_t)((c)-59)<4 || /* ;<=> */ \ 81 (uint8_t)((c)-93)<4 || /* ]^_` */ \ 82 (uint8_t)((c)-123)<3 || /* {|} */ \ 83 (c)==42 || (c)==64 || (c)==91 /* *@[ */ \ 84 ) 85 86#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9) 87#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9) 88 89#define PLUS 43 90#define MINUS 45 91#define BACKSLASH 92 92#define TILDE 126 93 94/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */ 95#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c)) 96 97/* encode directly sets D and O and CR LF SP TAB */ 98static const UBool encodeDirectlyMaximum[128]={ 99 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 100 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 105 106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 108 109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 111}; 112 113/* encode directly set D and CR LF SP TAB but not set O */ 114static const UBool encodeDirectlyRestricted[128]={ 115 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 116 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118 119 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 120 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 121 122 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 124 125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 127}; 128 129static const uint8_t 130toBase64[64]={ 131 /* A-Z */ 132 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 133 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 134 /* a-z */ 135 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 136 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 137 /* 0-9 */ 138 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 139 /* +/ */ 140 43, 47 141}; 142 143static const int8_t 144fromBase64[128]={ 145 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */ 146 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3, 147 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, 148 149 /* general punctuation with + and / and a special value (-2) for - */ 150 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63, 151 /* digits */ 152 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, 153 154 /* A-Z */ 155 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 156 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1, 157 158 /* a-z */ 159 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 160 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3 161}; 162 163/* 164 * converter status values: 165 * 166 * toUnicodeStatus: 167 * 24 inDirectMode (boolean) 168 * 23..16 base64Counter (-1..7) 169 * 15..0 bits (up to 14 bits incoming base64) 170 * 171 * fromUnicodeStatus: 172 * 31..28 version (0: set O direct 1: set O escaped) 173 * 24 inDirectMode (boolean) 174 * 23..16 base64Counter (0..2) 175 * 7..0 bits (6 bits outgoing base64) 176 * 177 */ 178 179static void 180_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) { 181 if(choice<=UCNV_RESET_TO_UNICODE) { 182 /* reset toUnicode */ 183 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */ 184 cnv->toULength=0; 185 } 186 if(choice!=UCNV_RESET_TO_UNICODE) { 187 /* reset fromUnicode */ 188 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 189 } 190} 191 192static void 193_UTF7Open(UConverter *cnv, 194 UConverterLoadArgs *pArgs, 195 UErrorCode *pErrorCode) { 196 if(UCNV_GET_VERSION(cnv)<=1) { 197 /* TODO(markus): Should just use cnv->options rather than copying the version number. */ 198 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28; 199 _UTF7Reset(cnv, UCNV_RESET_BOTH); 200 } else { 201 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 202 } 203} 204 205static void 206_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 207 UErrorCode *pErrorCode) { 208 UConverter *cnv; 209 const uint8_t *source, *sourceLimit; 210 UChar *target; 211 const UChar *targetLimit; 212 int32_t *offsets; 213 214 uint8_t *bytes; 215 uint8_t byteIndex; 216 217 int32_t length, targetCapacity; 218 219 /* UTF-7 state */ 220 uint16_t bits; 221 int8_t base64Counter; 222 UBool inDirectMode; 223 224 int8_t base64Value; 225 226 int32_t sourceIndex, nextSourceIndex; 227 228 uint8_t b; 229 /* set up the local pointers */ 230 cnv=pArgs->converter; 231 232 source=(const uint8_t *)pArgs->source; 233 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 234 target=pArgs->target; 235 targetLimit=pArgs->targetLimit; 236 offsets=pArgs->offsets; 237 /* get the state machine state */ 238 { 239 uint32_t status=cnv->toUnicodeStatus; 240 inDirectMode=(UBool)((status>>24)&1); 241 base64Counter=(int8_t)(status>>16); 242 bits=(uint16_t)status; 243 } 244 bytes=cnv->toUBytes; 245 byteIndex=cnv->toULength; 246 247 /* sourceIndex=-1 if the current character began in the previous buffer */ 248 sourceIndex=byteIndex==0 ? 0 : -1; 249 nextSourceIndex=0; 250 251 if(inDirectMode) { 252directMode: 253 /* 254 * In Direct Mode, most US-ASCII characters are encoded directly, i.e., 255 * with their US-ASCII byte values. 256 * Backslash and Tilde and most control characters are not allowed in UTF-7. 257 * A plus sign starts Unicode (or "escape") Mode. 258 * 259 * In Direct Mode, only the sourceIndex is used. 260 */ 261 byteIndex=0; 262 length=(int32_t)(sourceLimit-source); 263 targetCapacity=(int32_t)(targetLimit-target); 264 if(length>targetCapacity) { 265 length=targetCapacity; 266 } 267 while(length>0) { 268 b=*source++; 269 if(!isLegalUTF7(b)) { 270 /* illegal */ 271 bytes[0]=b; 272 byteIndex=1; 273 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 274 break; 275 } else if(b!=PLUS) { 276 /* write directly encoded character */ 277 *target++=b; 278 if(offsets!=NULL) { 279 *offsets++=sourceIndex++; 280 } 281 } else /* PLUS */ { 282 /* switch to Unicode mode */ 283 nextSourceIndex=++sourceIndex; 284 inDirectMode=FALSE; 285 byteIndex=0; 286 bits=0; 287 base64Counter=-1; 288 goto unicodeMode; 289 } 290 --length; 291 } 292 if(source<sourceLimit && target>=targetLimit) { 293 /* target is full */ 294 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 295 } 296 } else { 297unicodeMode: 298 /* 299 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded. 300 * The base64 sequence ends with any character that is not in the base64 alphabet. 301 * A terminating minus sign is consumed. 302 * 303 * In Unicode Mode, the sourceIndex has the index to the start of the current 304 * base64 bytes, while nextSourceIndex is precisely parallel to source, 305 * keeping the index to the following byte. 306 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte. 307 */ 308 while(source<sourceLimit) { 309 if(target<targetLimit) { 310 bytes[byteIndex++]=b=*source++; 311 ++nextSourceIndex; 312 base64Value = -3; /* initialize as illegal */ 313 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) { 314 /* either 315 * base64Value==-1 for any legal character except base64 and minus sign, or 316 * base64Value==-3 for illegal characters: 317 * 1. In either case, leave Unicode mode. 318 * 2.1. If we ended with an incomplete UChar or none after the +, then 319 * generate an error for the preceding erroneous sequence and deal with 320 * the current (possibly illegal) character next time through. 321 * 2.2. Else the current char comes after a complete UChar, which was already 322 * pushed to the output buf, so: 323 * 2.2.1. If the current char is legal, just save it for processing next time. 324 * It may be for example, a plus which we need to deal with in direct mode. 325 * 2.2.2. Else if the current char is illegal, we might as well deal with it here. 326 */ 327 inDirectMode=TRUE; 328 if(base64Counter==-1) { 329 /* illegal: + immediately followed by something other than base64 or minus sign */ 330 /* include the plus sign in the reported sequence, but not the subsequent char */ 331 --source; 332 bytes[0]=PLUS; 333 byteIndex=1; 334 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 335 break; 336 } else if(bits!=0) { 337 /* bits are illegally left over, a UChar is incomplete */ 338 /* don't include current char (legal or illegal) in error seq */ 339 --source; 340 --byteIndex; 341 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 342 break; 343 } else { 344 /* previous UChar was complete */ 345 if(base64Value==-3) { 346 /* current character is illegal, deal with it here */ 347 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 348 break; 349 } else { 350 /* un-read the current character in case it is a plus sign */ 351 --source; 352 sourceIndex=nextSourceIndex-1; 353 goto directMode; 354 } 355 } 356 } else if(base64Value>=0) { 357 /* collect base64 bytes into UChars */ 358 switch(base64Counter) { 359 case -1: /* -1 is immediately after the + */ 360 case 0: 361 bits=base64Value; 362 base64Counter=1; 363 break; 364 case 1: 365 case 3: 366 case 4: 367 case 6: 368 bits=(uint16_t)((bits<<6)|base64Value); 369 ++base64Counter; 370 break; 371 case 2: 372 *target++=(UChar)((bits<<4)|(base64Value>>2)); 373 if(offsets!=NULL) { 374 *offsets++=sourceIndex; 375 sourceIndex=nextSourceIndex-1; 376 } 377 bytes[0]=b; /* keep this byte in case an error occurs */ 378 byteIndex=1; 379 bits=(uint16_t)(base64Value&3); 380 base64Counter=3; 381 break; 382 case 5: 383 *target++=(UChar)((bits<<2)|(base64Value>>4)); 384 if(offsets!=NULL) { 385 *offsets++=sourceIndex; 386 sourceIndex=nextSourceIndex-1; 387 } 388 bytes[0]=b; /* keep this byte in case an error occurs */ 389 byteIndex=1; 390 bits=(uint16_t)(base64Value&15); 391 base64Counter=6; 392 break; 393 case 7: 394 *target++=(UChar)((bits<<6)|base64Value); 395 if(offsets!=NULL) { 396 *offsets++=sourceIndex; 397 sourceIndex=nextSourceIndex; 398 } 399 byteIndex=0; 400 bits=0; 401 base64Counter=0; 402 break; 403 default: 404 /* will never occur */ 405 break; 406 } 407 } else /*base64Value==-2*/ { 408 /* minus sign terminates the base64 sequence */ 409 inDirectMode=TRUE; 410 if(base64Counter==-1) { 411 /* +- i.e. a minus immediately following a plus */ 412 *target++=PLUS; 413 if(offsets!=NULL) { 414 *offsets++=sourceIndex-1; 415 } 416 } else { 417 /* absorb the minus and leave the Unicode Mode */ 418 if(bits!=0) { 419 /* bits are illegally left over, a UChar is incomplete */ 420 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 421 break; 422 } 423 } 424 sourceIndex=nextSourceIndex; 425 goto directMode; 426 } 427 } else { 428 /* target is full */ 429 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 430 break; 431 } 432 } 433 } 434 435 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) { 436 /* 437 * if we are in Unicode mode, then the byteIndex might not be 0, 438 * but that is ok if bits==0 439 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error 440 * (not true for IMAP-mailbox-name where we must end in direct mode) 441 */ 442 byteIndex=0; 443 } 444 445 /* set the converter state back into UConverter */ 446 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; 447 cnv->toULength=byteIndex; 448 449 /* write back the updated pointers */ 450 pArgs->source=(const char *)source; 451 pArgs->target=target; 452 pArgs->offsets=offsets; 453 return; 454} 455 456static void 457_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 458 UErrorCode *pErrorCode) { 459 UConverter *cnv; 460 const UChar *source, *sourceLimit; 461 uint8_t *target, *targetLimit; 462 int32_t *offsets; 463 464 int32_t length, targetCapacity, sourceIndex; 465 UChar c; 466 467 /* UTF-7 state */ 468 const UBool *encodeDirectly; 469 uint8_t bits; 470 int8_t base64Counter; 471 UBool inDirectMode; 472 473 /* set up the local pointers */ 474 cnv=pArgs->converter; 475 476 /* set up the local pointers */ 477 source=pArgs->source; 478 sourceLimit=pArgs->sourceLimit; 479 target=(uint8_t *)pArgs->target; 480 targetLimit=(uint8_t *)pArgs->targetLimit; 481 offsets=pArgs->offsets; 482 483 /* get the state machine state */ 484 { 485 uint32_t status=cnv->fromUnicodeStatus; 486 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted; 487 inDirectMode=(UBool)((status>>24)&1); 488 base64Counter=(int8_t)(status>>16); 489 bits=(uint8_t)status; 490 U_ASSERT(bits<=sizeof(toBase64)/sizeof(toBase64[0])); 491 } 492 493 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */ 494 sourceIndex=0; 495 496 if(inDirectMode) { 497directMode: 498 length=(int32_t)(sourceLimit-source); 499 targetCapacity=(int32_t)(targetLimit-target); 500 if(length>targetCapacity) { 501 length=targetCapacity; 502 } 503 while(length>0) { 504 c=*source++; 505 /* currently always encode CR LF SP TAB directly */ 506 if(c<=127 && encodeDirectly[c]) { 507 /* encode directly */ 508 *target++=(uint8_t)c; 509 if(offsets!=NULL) { 510 *offsets++=sourceIndex++; 511 } 512 } else if(c==PLUS) { 513 /* output +- for + */ 514 *target++=PLUS; 515 if(target<targetLimit) { 516 *target++=MINUS; 517 if(offsets!=NULL) { 518 *offsets++=sourceIndex; 519 *offsets++=sourceIndex++; 520 } 521 /* realign length and targetCapacity */ 522 goto directMode; 523 } else { 524 if(offsets!=NULL) { 525 *offsets++=sourceIndex++; 526 } 527 cnv->charErrorBuffer[0]=MINUS; 528 cnv->charErrorBufferLength=1; 529 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 530 break; 531 } 532 } else { 533 /* un-read this character and switch to Unicode Mode */ 534 --source; 535 *target++=PLUS; 536 if(offsets!=NULL) { 537 *offsets++=sourceIndex; 538 } 539 inDirectMode=FALSE; 540 base64Counter=0; 541 goto unicodeMode; 542 } 543 --length; 544 } 545 if(source<sourceLimit && target>=targetLimit) { 546 /* target is full */ 547 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 548 } 549 } else { 550unicodeMode: 551 while(source<sourceLimit) { 552 if(target<targetLimit) { 553 c=*source++; 554 if(c<=127 && encodeDirectly[c]) { 555 /* encode directly */ 556 inDirectMode=TRUE; 557 558 /* trick: back out this character to make this easier */ 559 --source; 560 561 /* terminate the base64 sequence */ 562 if(base64Counter!=0) { 563 /* write remaining bits for the previous character */ 564 *target++=toBase64[bits]; 565 if(offsets!=NULL) { 566 *offsets++=sourceIndex-1; 567 } 568 } 569 if(fromBase64[c]!=-1) { 570 /* need to terminate with a minus */ 571 if(target<targetLimit) { 572 *target++=MINUS; 573 if(offsets!=NULL) { 574 *offsets++=sourceIndex-1; 575 } 576 } else { 577 cnv->charErrorBuffer[0]=MINUS; 578 cnv->charErrorBufferLength=1; 579 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 580 break; 581 } 582 } 583 goto directMode; 584 } else { 585 /* 586 * base64 this character: 587 * Output 2 or 3 base64 bytes for the remaining bits of the previous character 588 * and the bits of this character, each implicitly in UTF-16BE. 589 * 590 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one 591 * character to the next. The actual 2 or 4 bits are shifted to the left edge 592 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier. 593 */ 594 switch(base64Counter) { 595 case 0: 596 *target++=toBase64[c>>10]; 597 if(target<targetLimit) { 598 *target++=toBase64[(c>>4)&0x3f]; 599 if(offsets!=NULL) { 600 *offsets++=sourceIndex; 601 *offsets++=sourceIndex++; 602 } 603 } else { 604 if(offsets!=NULL) { 605 *offsets++=sourceIndex++; 606 } 607 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f]; 608 cnv->charErrorBufferLength=1; 609 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 610 } 611 bits=(uint8_t)((c&15)<<2); 612 base64Counter=1; 613 break; 614 case 1: 615 *target++=toBase64[bits|(c>>14)]; 616 if(target<targetLimit) { 617 *target++=toBase64[(c>>8)&0x3f]; 618 if(target<targetLimit) { 619 *target++=toBase64[(c>>2)&0x3f]; 620 if(offsets!=NULL) { 621 *offsets++=sourceIndex; 622 *offsets++=sourceIndex; 623 *offsets++=sourceIndex++; 624 } 625 } else { 626 if(offsets!=NULL) { 627 *offsets++=sourceIndex; 628 *offsets++=sourceIndex++; 629 } 630 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f]; 631 cnv->charErrorBufferLength=1; 632 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 633 } 634 } else { 635 if(offsets!=NULL) { 636 *offsets++=sourceIndex++; 637 } 638 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f]; 639 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f]; 640 cnv->charErrorBufferLength=2; 641 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 642 } 643 bits=(uint8_t)((c&3)<<4); 644 base64Counter=2; 645 break; 646 case 2: 647 *target++=toBase64[bits|(c>>12)]; 648 if(target<targetLimit) { 649 *target++=toBase64[(c>>6)&0x3f]; 650 if(target<targetLimit) { 651 *target++=toBase64[c&0x3f]; 652 if(offsets!=NULL) { 653 *offsets++=sourceIndex; 654 *offsets++=sourceIndex; 655 *offsets++=sourceIndex++; 656 } 657 } else { 658 if(offsets!=NULL) { 659 *offsets++=sourceIndex; 660 *offsets++=sourceIndex++; 661 } 662 cnv->charErrorBuffer[0]=toBase64[c&0x3f]; 663 cnv->charErrorBufferLength=1; 664 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 665 } 666 } else { 667 if(offsets!=NULL) { 668 *offsets++=sourceIndex++; 669 } 670 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f]; 671 cnv->charErrorBuffer[1]=toBase64[c&0x3f]; 672 cnv->charErrorBufferLength=2; 673 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 674 } 675 bits=0; 676 base64Counter=0; 677 break; 678 default: 679 /* will never occur */ 680 break; 681 } 682 } 683 } else { 684 /* target is full */ 685 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 686 break; 687 } 688 } 689 } 690 691 if(pArgs->flush && source>=sourceLimit) { 692 /* flush remaining bits to the target */ 693 if(!inDirectMode) { 694 if (base64Counter!=0) { 695 if(target<targetLimit) { 696 *target++=toBase64[bits]; 697 if(offsets!=NULL) { 698 *offsets++=sourceIndex-1; 699 } 700 } else { 701 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits]; 702 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 703 } 704 } 705 /* Add final MINUS to terminate unicodeMode */ 706 if(target<targetLimit) { 707 *target++=MINUS; 708 if(offsets!=NULL) { 709 *offsets++=sourceIndex-1; 710 } 711 } else { 712 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS; 713 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 714 } 715 } 716 /* reset the state for the next conversion */ 717 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 718 } else { 719 /* set the converter state back into UConverter */ 720 cnv->fromUnicodeStatus= 721 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/ 722 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits; 723 } 724 725 /* write back the updated pointers */ 726 pArgs->source=source; 727 pArgs->target=(char *)target; 728 pArgs->offsets=offsets; 729 return; 730} 731 732static const char * 733_UTF7GetName(const UConverter *cnv) { 734 switch(cnv->fromUnicodeStatus>>28) { 735 case 1: 736 return "UTF-7,version=1"; 737 default: 738 return "UTF-7"; 739 } 740} 741 742static const UConverterImpl _UTF7Impl={ 743 UCNV_UTF7, 744 745 NULL, 746 NULL, 747 748 _UTF7Open, 749 NULL, 750 _UTF7Reset, 751 752 _UTF7ToUnicodeWithOffsets, 753 _UTF7ToUnicodeWithOffsets, 754 _UTF7FromUnicodeWithOffsets, 755 _UTF7FromUnicodeWithOffsets, 756 NULL, 757 758 NULL, 759 _UTF7GetName, 760 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */ 761 NULL, 762 ucnv_getCompleteUnicodeSet 763}; 764 765static const UConverterStaticData _UTF7StaticData={ 766 sizeof(UConverterStaticData), 767 "UTF-7", 768 0, /* TODO CCSID for UTF-7 */ 769 UCNV_IBM, UCNV_UTF7, 770 1, 4, 771 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ 772 FALSE, FALSE, 773 0, 774 0, 775 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 776}; 777 778const UConverterSharedData _UTF7Data={ 779 sizeof(UConverterSharedData), ~((uint32_t)0), 780 NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl, 781 0 782}; 783 784/* IMAP mailbox name encoding ----------------------------------------------- */ 785 786/* 787 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1 788 * http://www.ietf.org/rfc/rfc2060.txt 789 * 790 * 5.1.3. Mailbox International Naming Convention 791 * 792 * By convention, international mailbox names are specified using a 793 * modified version of the UTF-7 encoding described in [UTF-7]. The 794 * purpose of these modifications is to correct the following problems 795 * with UTF-7: 796 * 797 * 1) UTF-7 uses the "+" character for shifting; this conflicts with 798 * the common use of "+" in mailbox names, in particular USENET 799 * newsgroup names. 800 * 801 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this 802 * conflicts with the use of "/" as a popular hierarchy delimiter. 803 * 804 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with 805 * the use of "\" as a popular hierarchy delimiter. 806 * 807 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with 808 * the use of "~" in some servers as a home directory indicator. 809 * 810 * 5) UTF-7 permits multiple alternate forms to represent the same 811 * string; in particular, printable US-ASCII chararacters can be 812 * represented in encoded form. 813 * 814 * In modified UTF-7, printable US-ASCII characters except for "&" 815 * represent themselves; that is, characters with octet values 0x20-0x25 816 * and 0x27-0x7e. The character "&" (0x26) is represented by the two- 817 * octet sequence "&-". 818 * 819 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all 820 * Unicode 16-bit octets) are represented in modified BASE64, with a 821 * further modification from [UTF-7] that "," is used instead of "/". 822 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII 823 * character which can represent itself. 824 * 825 * "&" is used to shift to modified BASE64 and "-" to shift back to US- 826 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that 827 * is, a name that ends with a Unicode 16-bit octet MUST end with a "- 828 * "). 829 * 830 * For example, here is a mailbox name which mixes English, Japanese, 831 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw- 832 */ 833 834/* 835 * Tests for US-ASCII characters belonging to character classes 836 * defined in UTF-7. 837 * 838 * Set D (directly encoded characters) consists of the following 839 * characters: the upper and lower case letters A through Z 840 * and a through z, the 10 digits 0-9, and the following nine special 841 * characters (note that "+" and "=" are omitted): 842 * '(),-./:? 843 * 844 * Set O (optional direct characters) consists of the following 845 * characters (note that "\" and "~" are omitted): 846 * !"#$%&*;<=>@[]^_`{|} 847 * 848 * According to the rules in RFC 2152, the byte values for the following 849 * US-ASCII characters are not used in UTF-7 and are therefore illegal: 850 * - all C0 control codes except for CR LF TAB 851 * - BACKSLASH 852 * - TILDE 853 * - DEL 854 * - all codes beyond US-ASCII, i.e. all >127 855 */ 856 857/* uses '&' not '+' to start a base64 sequence */ 858#define AMPERSAND 0x26 859#define COMMA 0x2c 860#define SLASH 0x2f 861 862/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */ 863#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e) 864 865/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */ 866#define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND) 867 868#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA) 869#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c]) 870 871/* 872 * converter status values: 873 * 874 * toUnicodeStatus: 875 * 24 inDirectMode (boolean) 876 * 23..16 base64Counter (-1..7) 877 * 15..0 bits (up to 14 bits incoming base64) 878 * 879 * fromUnicodeStatus: 880 * 24 inDirectMode (boolean) 881 * 23..16 base64Counter (0..2) 882 * 7..0 bits (6 bits outgoing base64) 883 * 884 * ignore bits 31..25 885 */ 886 887static void 888_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 889 UErrorCode *pErrorCode) { 890 UConverter *cnv; 891 const uint8_t *source, *sourceLimit; 892 UChar *target; 893 const UChar *targetLimit; 894 int32_t *offsets; 895 896 uint8_t *bytes; 897 uint8_t byteIndex; 898 899 int32_t length, targetCapacity; 900 901 /* UTF-7 state */ 902 uint16_t bits; 903 int8_t base64Counter; 904 UBool inDirectMode; 905 906 int8_t base64Value; 907 908 int32_t sourceIndex, nextSourceIndex; 909 910 UChar c; 911 uint8_t b; 912 913 /* set up the local pointers */ 914 cnv=pArgs->converter; 915 916 source=(const uint8_t *)pArgs->source; 917 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 918 target=pArgs->target; 919 targetLimit=pArgs->targetLimit; 920 offsets=pArgs->offsets; 921 /* get the state machine state */ 922 { 923 uint32_t status=cnv->toUnicodeStatus; 924 inDirectMode=(UBool)((status>>24)&1); 925 base64Counter=(int8_t)(status>>16); 926 bits=(uint16_t)status; 927 } 928 bytes=cnv->toUBytes; 929 byteIndex=cnv->toULength; 930 931 /* sourceIndex=-1 if the current character began in the previous buffer */ 932 sourceIndex=byteIndex==0 ? 0 : -1; 933 nextSourceIndex=0; 934 935 if(inDirectMode) { 936directMode: 937 /* 938 * In Direct Mode, US-ASCII characters are encoded directly, i.e., 939 * with their US-ASCII byte values. 940 * An ampersand starts Unicode (or "escape") Mode. 941 * 942 * In Direct Mode, only the sourceIndex is used. 943 */ 944 byteIndex=0; 945 length=(int32_t)(sourceLimit-source); 946 targetCapacity=(int32_t)(targetLimit-target); 947 if(length>targetCapacity) { 948 length=targetCapacity; 949 } 950 while(length>0) { 951 b=*source++; 952 if(!isLegalIMAP(b)) { 953 /* illegal */ 954 bytes[0]=b; 955 byteIndex=1; 956 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 957 break; 958 } else if(b!=AMPERSAND) { 959 /* write directly encoded character */ 960 *target++=b; 961 if(offsets!=NULL) { 962 *offsets++=sourceIndex++; 963 } 964 } else /* AMPERSAND */ { 965 /* switch to Unicode mode */ 966 nextSourceIndex=++sourceIndex; 967 inDirectMode=FALSE; 968 byteIndex=0; 969 bits=0; 970 base64Counter=-1; 971 goto unicodeMode; 972 } 973 --length; 974 } 975 if(source<sourceLimit && target>=targetLimit) { 976 /* target is full */ 977 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 978 } 979 } else { 980unicodeMode: 981 /* 982 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded. 983 * The base64 sequence ends with any character that is not in the base64 alphabet. 984 * A terminating minus sign is consumed. 985 * US-ASCII must not be base64-ed. 986 * 987 * In Unicode Mode, the sourceIndex has the index to the start of the current 988 * base64 bytes, while nextSourceIndex is precisely parallel to source, 989 * keeping the index to the following byte. 990 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte. 991 */ 992 while(source<sourceLimit) { 993 if(target<targetLimit) { 994 bytes[byteIndex++]=b=*source++; 995 ++nextSourceIndex; 996 if(b>0x7e) { 997 /* illegal - test other illegal US-ASCII values by base64Value==-3 */ 998 inDirectMode=TRUE; 999 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1000 break; 1001 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) { 1002 /* collect base64 bytes into UChars */ 1003 switch(base64Counter) { 1004 case -1: /* -1 is immediately after the & */ 1005 case 0: 1006 bits=base64Value; 1007 base64Counter=1; 1008 break; 1009 case 1: 1010 case 3: 1011 case 4: 1012 case 6: 1013 bits=(uint16_t)((bits<<6)|base64Value); 1014 ++base64Counter; 1015 break; 1016 case 2: 1017 c=(UChar)((bits<<4)|(base64Value>>2)); 1018 if(isLegalIMAP(c)) { 1019 /* illegal */ 1020 inDirectMode=TRUE; 1021 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1022 goto endloop; 1023 } 1024 *target++=c; 1025 if(offsets!=NULL) { 1026 *offsets++=sourceIndex; 1027 sourceIndex=nextSourceIndex-1; 1028 } 1029 bytes[0]=b; /* keep this byte in case an error occurs */ 1030 byteIndex=1; 1031 bits=(uint16_t)(base64Value&3); 1032 base64Counter=3; 1033 break; 1034 case 5: 1035 c=(UChar)((bits<<2)|(base64Value>>4)); 1036 if(isLegalIMAP(c)) { 1037 /* illegal */ 1038 inDirectMode=TRUE; 1039 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1040 goto endloop; 1041 } 1042 *target++=c; 1043 if(offsets!=NULL) { 1044 *offsets++=sourceIndex; 1045 sourceIndex=nextSourceIndex-1; 1046 } 1047 bytes[0]=b; /* keep this byte in case an error occurs */ 1048 byteIndex=1; 1049 bits=(uint16_t)(base64Value&15); 1050 base64Counter=6; 1051 break; 1052 case 7: 1053 c=(UChar)((bits<<6)|base64Value); 1054 if(isLegalIMAP(c)) { 1055 /* illegal */ 1056 inDirectMode=TRUE; 1057 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1058 goto endloop; 1059 } 1060 *target++=c; 1061 if(offsets!=NULL) { 1062 *offsets++=sourceIndex; 1063 sourceIndex=nextSourceIndex; 1064 } 1065 byteIndex=0; 1066 bits=0; 1067 base64Counter=0; 1068 break; 1069 default: 1070 /* will never occur */ 1071 break; 1072 } 1073 } else if(base64Value==-2) { 1074 /* minus sign terminates the base64 sequence */ 1075 inDirectMode=TRUE; 1076 if(base64Counter==-1) { 1077 /* &- i.e. a minus immediately following an ampersand */ 1078 *target++=AMPERSAND; 1079 if(offsets!=NULL) { 1080 *offsets++=sourceIndex-1; 1081 } 1082 } else { 1083 /* absorb the minus and leave the Unicode Mode */ 1084 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) { 1085 /* bits are illegally left over, a UChar is incomplete */ 1086 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */ 1087 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1088 break; 1089 } 1090 } 1091 sourceIndex=nextSourceIndex; 1092 goto directMode; 1093 } else { 1094 if(base64Counter==-1) { 1095 /* illegal: & immediately followed by something other than base64 or minus sign */ 1096 /* include the ampersand in the reported sequence */ 1097 --sourceIndex; 1098 bytes[0]=AMPERSAND; 1099 bytes[1]=b; 1100 byteIndex=2; 1101 } 1102 /* base64Value==-1 for characters that are illegal only in Unicode mode */ 1103 /* base64Value==-3 for illegal characters */ 1104 /* illegal */ 1105 inDirectMode=TRUE; 1106 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1107 break; 1108 } 1109 } else { 1110 /* target is full */ 1111 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1112 break; 1113 } 1114 } 1115 } 1116endloop: 1117 1118 /* 1119 * the end of the input stream and detection of truncated input 1120 * are handled by the framework, but here we must check if we are in Unicode 1121 * mode and byteIndex==0 because we must end in direct mode 1122 * 1123 * conditions: 1124 * successful 1125 * in Unicode mode and byteIndex==0 1126 * end of input and no truncated input 1127 */ 1128 if( U_SUCCESS(*pErrorCode) && 1129 !inDirectMode && byteIndex==0 && 1130 pArgs->flush && source>=sourceLimit 1131 ) { 1132 if(base64Counter==-1) { 1133 /* & at the very end of the input */ 1134 /* make the ampersand the reported sequence */ 1135 bytes[0]=AMPERSAND; 1136 byteIndex=1; 1137 } 1138 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */ 1139 1140 inDirectMode=TRUE; /* avoid looping */ 1141 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 1142 } 1143 1144 /* set the converter state back into UConverter */ 1145 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; 1146 cnv->toULength=byteIndex; 1147 1148 /* write back the updated pointers */ 1149 pArgs->source=(const char *)source; 1150 pArgs->target=target; 1151 pArgs->offsets=offsets; 1152 return; 1153} 1154 1155static void 1156_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1157 UErrorCode *pErrorCode) { 1158 UConverter *cnv; 1159 const UChar *source, *sourceLimit; 1160 uint8_t *target, *targetLimit; 1161 int32_t *offsets; 1162 1163 int32_t length, targetCapacity, sourceIndex; 1164 UChar c; 1165 uint8_t b; 1166 1167 /* UTF-7 state */ 1168 uint8_t bits; 1169 int8_t base64Counter; 1170 UBool inDirectMode; 1171 1172 /* set up the local pointers */ 1173 cnv=pArgs->converter; 1174 1175 /* set up the local pointers */ 1176 source=pArgs->source; 1177 sourceLimit=pArgs->sourceLimit; 1178 target=(uint8_t *)pArgs->target; 1179 targetLimit=(uint8_t *)pArgs->targetLimit; 1180 offsets=pArgs->offsets; 1181 1182 /* get the state machine state */ 1183 { 1184 uint32_t status=cnv->fromUnicodeStatus; 1185 inDirectMode=(UBool)((status>>24)&1); 1186 base64Counter=(int8_t)(status>>16); 1187 bits=(uint8_t)status; 1188 } 1189 1190 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */ 1191 sourceIndex=0; 1192 1193 if(inDirectMode) { 1194directMode: 1195 length=(int32_t)(sourceLimit-source); 1196 targetCapacity=(int32_t)(targetLimit-target); 1197 if(length>targetCapacity) { 1198 length=targetCapacity; 1199 } 1200 while(length>0) { 1201 c=*source++; 1202 /* encode 0x20..0x7e except '&' directly */ 1203 if(inSetDIMAP(c)) { 1204 /* encode directly */ 1205 *target++=(uint8_t)c; 1206 if(offsets!=NULL) { 1207 *offsets++=sourceIndex++; 1208 } 1209 } else if(c==AMPERSAND) { 1210 /* output &- for & */ 1211 *target++=AMPERSAND; 1212 if(target<targetLimit) { 1213 *target++=MINUS; 1214 if(offsets!=NULL) { 1215 *offsets++=sourceIndex; 1216 *offsets++=sourceIndex++; 1217 } 1218 /* realign length and targetCapacity */ 1219 goto directMode; 1220 } else { 1221 if(offsets!=NULL) { 1222 *offsets++=sourceIndex++; 1223 } 1224 cnv->charErrorBuffer[0]=MINUS; 1225 cnv->charErrorBufferLength=1; 1226 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1227 break; 1228 } 1229 } else { 1230 /* un-read this character and switch to Unicode Mode */ 1231 --source; 1232 *target++=AMPERSAND; 1233 if(offsets!=NULL) { 1234 *offsets++=sourceIndex; 1235 } 1236 inDirectMode=FALSE; 1237 base64Counter=0; 1238 goto unicodeMode; 1239 } 1240 --length; 1241 } 1242 if(source<sourceLimit && target>=targetLimit) { 1243 /* target is full */ 1244 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1245 } 1246 } else { 1247unicodeMode: 1248 while(source<sourceLimit) { 1249 if(target<targetLimit) { 1250 c=*source++; 1251 if(isLegalIMAP(c)) { 1252 /* encode directly */ 1253 inDirectMode=TRUE; 1254 1255 /* trick: back out this character to make this easier */ 1256 --source; 1257 1258 /* terminate the base64 sequence */ 1259 if(base64Counter!=0) { 1260 /* write remaining bits for the previous character */ 1261 *target++=TO_BASE64_IMAP(bits); 1262 if(offsets!=NULL) { 1263 *offsets++=sourceIndex-1; 1264 } 1265 } 1266 /* need to terminate with a minus */ 1267 if(target<targetLimit) { 1268 *target++=MINUS; 1269 if(offsets!=NULL) { 1270 *offsets++=sourceIndex-1; 1271 } 1272 } else { 1273 cnv->charErrorBuffer[0]=MINUS; 1274 cnv->charErrorBufferLength=1; 1275 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1276 break; 1277 } 1278 goto directMode; 1279 } else { 1280 /* 1281 * base64 this character: 1282 * Output 2 or 3 base64 bytes for the remaining bits of the previous character 1283 * and the bits of this character, each implicitly in UTF-16BE. 1284 * 1285 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one 1286 * character to the next. The actual 2 or 4 bits are shifted to the left edge 1287 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier. 1288 */ 1289 switch(base64Counter) { 1290 case 0: 1291 b=(uint8_t)(c>>10); 1292 *target++=TO_BASE64_IMAP(b); 1293 if(target<targetLimit) { 1294 b=(uint8_t)((c>>4)&0x3f); 1295 *target++=TO_BASE64_IMAP(b); 1296 if(offsets!=NULL) { 1297 *offsets++=sourceIndex; 1298 *offsets++=sourceIndex++; 1299 } 1300 } else { 1301 if(offsets!=NULL) { 1302 *offsets++=sourceIndex++; 1303 } 1304 b=(uint8_t)((c>>4)&0x3f); 1305 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1306 cnv->charErrorBufferLength=1; 1307 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1308 } 1309 bits=(uint8_t)((c&15)<<2); 1310 base64Counter=1; 1311 break; 1312 case 1: 1313 b=(uint8_t)(bits|(c>>14)); 1314 *target++=TO_BASE64_IMAP(b); 1315 if(target<targetLimit) { 1316 b=(uint8_t)((c>>8)&0x3f); 1317 *target++=TO_BASE64_IMAP(b); 1318 if(target<targetLimit) { 1319 b=(uint8_t)((c>>2)&0x3f); 1320 *target++=TO_BASE64_IMAP(b); 1321 if(offsets!=NULL) { 1322 *offsets++=sourceIndex; 1323 *offsets++=sourceIndex; 1324 *offsets++=sourceIndex++; 1325 } 1326 } else { 1327 if(offsets!=NULL) { 1328 *offsets++=sourceIndex; 1329 *offsets++=sourceIndex++; 1330 } 1331 b=(uint8_t)((c>>2)&0x3f); 1332 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1333 cnv->charErrorBufferLength=1; 1334 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1335 } 1336 } else { 1337 if(offsets!=NULL) { 1338 *offsets++=sourceIndex++; 1339 } 1340 b=(uint8_t)((c>>8)&0x3f); 1341 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1342 b=(uint8_t)((c>>2)&0x3f); 1343 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); 1344 cnv->charErrorBufferLength=2; 1345 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1346 } 1347 bits=(uint8_t)((c&3)<<4); 1348 base64Counter=2; 1349 break; 1350 case 2: 1351 b=(uint8_t)(bits|(c>>12)); 1352 *target++=TO_BASE64_IMAP(b); 1353 if(target<targetLimit) { 1354 b=(uint8_t)((c>>6)&0x3f); 1355 *target++=TO_BASE64_IMAP(b); 1356 if(target<targetLimit) { 1357 b=(uint8_t)(c&0x3f); 1358 *target++=TO_BASE64_IMAP(b); 1359 if(offsets!=NULL) { 1360 *offsets++=sourceIndex; 1361 *offsets++=sourceIndex; 1362 *offsets++=sourceIndex++; 1363 } 1364 } else { 1365 if(offsets!=NULL) { 1366 *offsets++=sourceIndex; 1367 *offsets++=sourceIndex++; 1368 } 1369 b=(uint8_t)(c&0x3f); 1370 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1371 cnv->charErrorBufferLength=1; 1372 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1373 } 1374 } else { 1375 if(offsets!=NULL) { 1376 *offsets++=sourceIndex++; 1377 } 1378 b=(uint8_t)((c>>6)&0x3f); 1379 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1380 b=(uint8_t)(c&0x3f); 1381 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); 1382 cnv->charErrorBufferLength=2; 1383 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1384 } 1385 bits=0; 1386 base64Counter=0; 1387 break; 1388 default: 1389 /* will never occur */ 1390 break; 1391 } 1392 } 1393 } else { 1394 /* target is full */ 1395 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1396 break; 1397 } 1398 } 1399 } 1400 1401 if(pArgs->flush && source>=sourceLimit) { 1402 /* flush remaining bits to the target */ 1403 if(!inDirectMode) { 1404 if(base64Counter!=0) { 1405 if(target<targetLimit) { 1406 *target++=TO_BASE64_IMAP(bits); 1407 if(offsets!=NULL) { 1408 *offsets++=sourceIndex-1; 1409 } 1410 } else { 1411 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits); 1412 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1413 } 1414 } 1415 /* need to terminate with a minus */ 1416 if(target<targetLimit) { 1417 *target++=MINUS; 1418 if(offsets!=NULL) { 1419 *offsets++=sourceIndex-1; 1420 } 1421 } else { 1422 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS; 1423 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1424 } 1425 } 1426 /* reset the state for the next conversion */ 1427 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 1428 } else { 1429 /* set the converter state back into UConverter */ 1430 cnv->fromUnicodeStatus= 1431 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/ 1432 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits; 1433 } 1434 1435 /* write back the updated pointers */ 1436 pArgs->source=source; 1437 pArgs->target=(char *)target; 1438 pArgs->offsets=offsets; 1439 return; 1440} 1441 1442static const UConverterImpl _IMAPImpl={ 1443 UCNV_IMAP_MAILBOX, 1444 1445 NULL, 1446 NULL, 1447 1448 _UTF7Open, 1449 NULL, 1450 _UTF7Reset, 1451 1452 _IMAPToUnicodeWithOffsets, 1453 _IMAPToUnicodeWithOffsets, 1454 _IMAPFromUnicodeWithOffsets, 1455 _IMAPFromUnicodeWithOffsets, 1456 NULL, 1457 1458 NULL, 1459 NULL, 1460 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */ 1461 NULL, 1462 ucnv_getCompleteUnicodeSet 1463}; 1464 1465static const UConverterStaticData _IMAPStaticData={ 1466 sizeof(UConverterStaticData), 1467 "IMAP-mailbox-name", 1468 0, /* TODO CCSID for IMAP-mailbox-name */ 1469 UCNV_IBM, UCNV_IMAP_MAILBOX, 1470 1, 4, 1471 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ 1472 FALSE, FALSE, 1473 0, 1474 0, 1475 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1476}; 1477 1478const UConverterSharedData _IMAPData={ 1479 sizeof(UConverterSharedData), ~((uint32_t)0), 1480 NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl, 1481 0 1482}; 1483 1484#endif 1485