1/*
2**********************************************************************
3*   Copyright (C) 2002-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv_u7.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2002jul01
12*   created by: Markus W. Scherer
13*
14*   UTF-7 converter implementation. Used to be in ucnv_utf.c.
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_CONVERSION
20
21#include "unicode/ucnv.h"
22#include "ucnv_bld.h"
23#include "ucnv_cnv.h"
24
25/* UTF-7 -------------------------------------------------------------------- */
26
/*
 * UTF-7 is a stateful encoding of Unicode.
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
 * It was intended for use in Internet email systems, using in its bytewise
 * encoding only a subset of 7-bit US-ASCII.
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
 * occasionally used.
 *
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters
 * in set O as defined in the RFC (see below) may be encoded directly but are
 * not allowed in, e.g., email headers.
 * By default, the ICU UTF-7 converter encodes set O directly.
 * By choosing the option "version=1", set O will be escaped instead.
 * For example:
 *     utf7Converter=ucnv_open("UTF-7,version=1", &errorCode);
 *
 * For details about email headers see RFC 2047.
 */
46
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
/*
 * Character-class tests on US-ASCII code values.
 *
 * All classes are spelled with numeric code values rather than character
 * literals.
 *
 * The (uint8_t)((c)-lo)<count idiom is a one-comparison range check for
 * lo<=c<lo+count: values below lo wrap around to large unsigned numbers.
 * Because of the uint8_t cast, c must be a byte value (0..255); larger
 * values would alias into the ranges.
 * Every macro may evaluate its argument more than once, so do not pass an
 * expression with side effects.
 */

/* set D: A-Z a-z 0-9 and '(),-./:? */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )

/* set O: !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )

/* CR (13), LF (10), TAB (9); the second form also accepts SPACE (32) */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code values of the characters with special meaning in UTF-7 */
#define PLUS  43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/*
 * legal byte values: all US-ASCII graphic characters from space to before tilde
 * (32..125) except backslash, and CR LF TAB
 */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
95
96/* encode directly sets D and O and CR LF SP TAB */
97static const UBool encodeDirectlyMaximum[128]={
98 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
99    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
100    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101
102    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
103    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104
105    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
107
108    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
110};
111
112/* encode directly set D and CR LF SP TAB but not set O */
113static const UBool encodeDirectlyRestricted[128]={
114 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
115    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
116    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117
118    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
119    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
120
121    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
123
124    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
126};
127
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit:
 * 0..25 -> A-Z, 26..51 -> a-z, 52..61 -> 0-9, 62 -> '+', 63 -> '/'.
 * The digits are given as numeric code values, not character literals.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
141
/*
 * Classifies a US-ASCII byte (index 0..127) for the UTF-7 decoder:
 *   0..63  the byte is a base64 digit with this value
 *   -1     legal UTF-7 byte that is not a base64 digit; ends a base64 run
 *   -2     the minus sign, which terminates a base64 run and is absorbed
 *   -3     byte that is illegal in UTF-7 (most C0 controls, '\', '~', DEL)
 * The table has only 128 entries; callers must range-check the byte before
 * indexing (the decoder tests b>=126 first).
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
161
/*
 * converter status values:
 *
 * toUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (-1..7)
 * 15..0  bits (up to 14 bits incoming base64)
 *
 * fromUnicodeStatus:
 * 31..28 version (0: set O direct  1: set O escaped)
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (0..2)
 *  7..0  bits (6 bits outgoing base64)
 *
 */
177
178static void
179_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
180    if(choice<=UCNV_RESET_TO_UNICODE) {
181        /* reset toUnicode */
182        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
183        cnv->toULength=0;
184    }
185    if(choice!=UCNV_RESET_TO_UNICODE) {
186        /* reset fromUnicode */
187        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
188    }
189}
190
191static void
192_UTF7Open(UConverter *cnv,
193          UConverterLoadArgs *pArgs,
194          UErrorCode *pErrorCode) {
195    if(UCNV_GET_VERSION(cnv)<=1) {
196        /* TODO(markus): Should just use cnv->options rather than copying the version number. */
197        cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
198        _UTF7Reset(cnv, UCNV_RESET_BOTH);
199    } else {
200        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
201    }
202}
203
204static void
205_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
206                          UErrorCode *pErrorCode) {
207    UConverter *cnv;
208    const uint8_t *source, *sourceLimit;
209    UChar *target;
210    const UChar *targetLimit;
211    int32_t *offsets;
212
213    uint8_t *bytes;
214    uint8_t byteIndex;
215
216    int32_t length, targetCapacity;
217
218    /* UTF-7 state */
219    uint16_t bits;
220    int8_t base64Counter;
221    UBool inDirectMode;
222
223    int8_t base64Value;
224
225    int32_t sourceIndex, nextSourceIndex;
226
227    uint8_t b;
228    /* set up the local pointers */
229    cnv=pArgs->converter;
230
231    source=(const uint8_t *)pArgs->source;
232    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
233    target=pArgs->target;
234    targetLimit=pArgs->targetLimit;
235    offsets=pArgs->offsets;
236    /* get the state machine state */
237    {
238        uint32_t status=cnv->toUnicodeStatus;
239        inDirectMode=(UBool)((status>>24)&1);
240        base64Counter=(int8_t)(status>>16);
241        bits=(uint16_t)status;
242    }
243    bytes=cnv->toUBytes;
244    byteIndex=cnv->toULength;
245
246    /* sourceIndex=-1 if the current character began in the previous buffer */
247    sourceIndex=byteIndex==0 ? 0 : -1;
248    nextSourceIndex=0;
249
250    if(inDirectMode) {
251directMode:
252        /*
253         * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
254         * with their US-ASCII byte values.
255         * Backslash and Tilde and most control characters are not allowed in UTF-7.
256         * A plus sign starts Unicode (or "escape") Mode.
257         *
258         * In Direct Mode, only the sourceIndex is used.
259         */
260        byteIndex=0;
261        length=(int32_t)(sourceLimit-source);
262        targetCapacity=(int32_t)(targetLimit-target);
263        if(length>targetCapacity) {
264            length=targetCapacity;
265        }
266        while(length>0) {
267            b=*source++;
268            if(!isLegalUTF7(b)) {
269                /* illegal */
270                bytes[0]=b;
271                byteIndex=1;
272                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
273                break;
274            } else if(b!=PLUS) {
275                /* write directly encoded character */
276                *target++=b;
277                if(offsets!=NULL) {
278                    *offsets++=sourceIndex++;
279                }
280            } else /* PLUS */ {
281                /* switch to Unicode mode */
282                nextSourceIndex=++sourceIndex;
283                inDirectMode=FALSE;
284                byteIndex=0;
285                bits=0;
286                base64Counter=-1;
287                goto unicodeMode;
288            }
289            --length;
290        }
291        if(source<sourceLimit && target>=targetLimit) {
292            /* target is full */
293            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
294        }
295    } else {
296unicodeMode:
297        /*
298         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
299         * The base64 sequence ends with any character that is not in the base64 alphabet.
300         * A terminating minus sign is consumed.
301         *
302         * In Unicode Mode, the sourceIndex has the index to the start of the current
303         * base64 bytes, while nextSourceIndex is precisely parallel to source,
304         * keeping the index to the following byte.
305         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
306         */
307        while(source<sourceLimit) {
308            if(target<targetLimit) {
309                bytes[byteIndex++]=b=*source++;
310                ++nextSourceIndex;
311                base64Value = -3; /* initialize as illegal */
312                if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
313                    /* either
314                     * base64Value==-1 for any legal character except base64 and minus sign, or
315                     * base64Value==-3 for illegal characters:
316                     * 1. In either case, leave Unicode mode.
317                     * 2.1. If we ended with an incomplete UChar or none after the +, then
318                     *      generate an error for the preceding erroneous sequence and deal with
319                     *      the current (possibly illegal) character next time through.
320                     * 2.2. Else the current char comes after a complete UChar, which was already
321                     *      pushed to the output buf, so:
322                     * 2.2.1. If the current char is legal, just save it for processing next time.
323                     *        It may be for example, a plus which we need to deal with in direct mode.
324                     * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
325                     */
326                    inDirectMode=TRUE;
327                    if(base64Counter==-1) {
328                        /* illegal: + immediately followed by something other than base64 or minus sign */
329                        /* include the plus sign in the reported sequence, but not the subsequent char */
330                        --source;
331                        bytes[0]=PLUS;
332                        byteIndex=1;
333                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
334                        break;
335                    } else if(bits!=0) {
336                        /* bits are illegally left over, a UChar is incomplete */
337                        /* don't include current char (legal or illegal) in error seq */
338                        --source;
339                        --byteIndex;
340                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
341                        break;
342                    } else {
343                        /* previous UChar was complete */
344                        if(base64Value==-3) {
345                            /* current character is illegal, deal with it here */
346                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347                            break;
348                        } else {
349                            /* un-read the current character in case it is a plus sign */
350                            --source;
351                            sourceIndex=nextSourceIndex-1;
352                            goto directMode;
353                        }
354                    }
355                } else if(base64Value>=0) {
356                    /* collect base64 bytes into UChars */
357                    switch(base64Counter) {
358                    case -1: /* -1 is immediately after the + */
359                    case 0:
360                        bits=base64Value;
361                        base64Counter=1;
362                        break;
363                    case 1:
364                    case 3:
365                    case 4:
366                    case 6:
367                        bits=(uint16_t)((bits<<6)|base64Value);
368                        ++base64Counter;
369                        break;
370                    case 2:
371                        *target++=(UChar)((bits<<4)|(base64Value>>2));
372                        if(offsets!=NULL) {
373                            *offsets++=sourceIndex;
374                            sourceIndex=nextSourceIndex-1;
375                        }
376                        bytes[0]=b; /* keep this byte in case an error occurs */
377                        byteIndex=1;
378                        bits=(uint16_t)(base64Value&3);
379                        base64Counter=3;
380                        break;
381                    case 5:
382                        *target++=(UChar)((bits<<2)|(base64Value>>4));
383                        if(offsets!=NULL) {
384                            *offsets++=sourceIndex;
385                            sourceIndex=nextSourceIndex-1;
386                        }
387                        bytes[0]=b; /* keep this byte in case an error occurs */
388                        byteIndex=1;
389                        bits=(uint16_t)(base64Value&15);
390                        base64Counter=6;
391                        break;
392                    case 7:
393                        *target++=(UChar)((bits<<6)|base64Value);
394                        if(offsets!=NULL) {
395                            *offsets++=sourceIndex;
396                            sourceIndex=nextSourceIndex;
397                        }
398                        byteIndex=0;
399                        bits=0;
400                        base64Counter=0;
401                        break;
402                    default:
403                        /* will never occur */
404                        break;
405                    }
406                } else /*base64Value==-2*/ {
407                    /* minus sign terminates the base64 sequence */
408                    inDirectMode=TRUE;
409                    if(base64Counter==-1) {
410                        /* +- i.e. a minus immediately following a plus */
411                        *target++=PLUS;
412                        if(offsets!=NULL) {
413                            *offsets++=sourceIndex-1;
414                        }
415                    } else {
416                        /* absorb the minus and leave the Unicode Mode */
417                        if(bits!=0) {
418                            /* bits are illegally left over, a UChar is incomplete */
419                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
420                            break;
421                        }
422                    }
423                    sourceIndex=nextSourceIndex;
424                    goto directMode;
425                }
426            } else {
427                /* target is full */
428                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
429                break;
430            }
431        }
432    }
433
434    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
435        /*
436         * if we are in Unicode mode, then the byteIndex might not be 0,
437         * but that is ok if bits==0
438         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
439         * (not true for IMAP-mailbox-name where we must end in direct mode)
440         */
441        byteIndex=0;
442    }
443
444    /* set the converter state back into UConverter */
445    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
446    cnv->toULength=byteIndex;
447
448    /* write back the updated pointers */
449    pArgs->source=(const char *)source;
450    pArgs->target=target;
451    pArgs->offsets=offsets;
452    return;
453}
454
455static void
456_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
457                            UErrorCode *pErrorCode) {
458    UConverter *cnv;
459    const UChar *source, *sourceLimit;
460    uint8_t *target, *targetLimit;
461    int32_t *offsets;
462
463    int32_t length, targetCapacity, sourceIndex;
464    UChar c;
465
466    /* UTF-7 state */
467    const UBool *encodeDirectly;
468    uint8_t bits;
469    int8_t base64Counter;
470    UBool inDirectMode;
471
472    /* set up the local pointers */
473    cnv=pArgs->converter;
474
475    /* set up the local pointers */
476    source=pArgs->source;
477    sourceLimit=pArgs->sourceLimit;
478    target=(uint8_t *)pArgs->target;
479    targetLimit=(uint8_t *)pArgs->targetLimit;
480    offsets=pArgs->offsets;
481
482    /* get the state machine state */
483    {
484        uint32_t status=cnv->fromUnicodeStatus;
485        encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
486        inDirectMode=(UBool)((status>>24)&1);
487        base64Counter=(int8_t)(status>>16);
488        bits=(uint8_t)status;
489    }
490
491    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
492    sourceIndex=0;
493
494    if(inDirectMode) {
495directMode:
496        length=(int32_t)(sourceLimit-source);
497        targetCapacity=(int32_t)(targetLimit-target);
498        if(length>targetCapacity) {
499            length=targetCapacity;
500        }
501        while(length>0) {
502            c=*source++;
503            /* currently always encode CR LF SP TAB directly */
504            if(c<=127 && encodeDirectly[c]) {
505                /* encode directly */
506                *target++=(uint8_t)c;
507                if(offsets!=NULL) {
508                    *offsets++=sourceIndex++;
509                }
510            } else if(c==PLUS) {
511                /* output +- for + */
512                *target++=PLUS;
513                if(target<targetLimit) {
514                    *target++=MINUS;
515                    if(offsets!=NULL) {
516                        *offsets++=sourceIndex;
517                        *offsets++=sourceIndex++;
518                    }
519                    /* realign length and targetCapacity */
520                    goto directMode;
521                } else {
522                    if(offsets!=NULL) {
523                        *offsets++=sourceIndex++;
524                    }
525                    cnv->charErrorBuffer[0]=MINUS;
526                    cnv->charErrorBufferLength=1;
527                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
528                    break;
529                }
530            } else {
531                /* un-read this character and switch to Unicode Mode */
532                --source;
533                *target++=PLUS;
534                if(offsets!=NULL) {
535                    *offsets++=sourceIndex;
536                }
537                inDirectMode=FALSE;
538                base64Counter=0;
539                goto unicodeMode;
540            }
541            --length;
542        }
543        if(source<sourceLimit && target>=targetLimit) {
544            /* target is full */
545            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
546        }
547    } else {
548unicodeMode:
549        while(source<sourceLimit) {
550            if(target<targetLimit) {
551                c=*source++;
552                if(c<=127 && encodeDirectly[c]) {
553                    /* encode directly */
554                    inDirectMode=TRUE;
555
556                    /* trick: back out this character to make this easier */
557                    --source;
558
559                    /* terminate the base64 sequence */
560                    if(base64Counter!=0) {
561                        /* write remaining bits for the previous character */
562                        *target++=toBase64[bits];
563                        if(offsets!=NULL) {
564                            *offsets++=sourceIndex-1;
565                        }
566                    }
567                    if(fromBase64[c]!=-1) {
568                        /* need to terminate with a minus */
569                        if(target<targetLimit) {
570                            *target++=MINUS;
571                            if(offsets!=NULL) {
572                                *offsets++=sourceIndex-1;
573                            }
574                        } else {
575                            cnv->charErrorBuffer[0]=MINUS;
576                            cnv->charErrorBufferLength=1;
577                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
578                            break;
579                        }
580                    }
581                    goto directMode;
582                } else {
583                    /*
584                     * base64 this character:
585                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
586                     * and the bits of this character, each implicitly in UTF-16BE.
587                     *
588                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
589                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
590                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
591                     */
592                    switch(base64Counter) {
593                    case 0:
594                        *target++=toBase64[c>>10];
595                        if(target<targetLimit) {
596                            *target++=toBase64[(c>>4)&0x3f];
597                            if(offsets!=NULL) {
598                                *offsets++=sourceIndex;
599                                *offsets++=sourceIndex++;
600                            }
601                        } else {
602                            if(offsets!=NULL) {
603                                *offsets++=sourceIndex++;
604                            }
605                            cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
606                            cnv->charErrorBufferLength=1;
607                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
608                        }
609                        bits=(uint8_t)((c&15)<<2);
610                        base64Counter=1;
611                        break;
612                    case 1:
613                        *target++=toBase64[bits|(c>>14)];
614                        if(target<targetLimit) {
615                            *target++=toBase64[(c>>8)&0x3f];
616                            if(target<targetLimit) {
617                                *target++=toBase64[(c>>2)&0x3f];
618                                if(offsets!=NULL) {
619                                    *offsets++=sourceIndex;
620                                    *offsets++=sourceIndex;
621                                    *offsets++=sourceIndex++;
622                                }
623                            } else {
624                                if(offsets!=NULL) {
625                                    *offsets++=sourceIndex;
626                                    *offsets++=sourceIndex++;
627                                }
628                                cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
629                                cnv->charErrorBufferLength=1;
630                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631                            }
632                        } else {
633                            if(offsets!=NULL) {
634                                *offsets++=sourceIndex++;
635                            }
636                            cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
637                            cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
638                            cnv->charErrorBufferLength=2;
639                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
640                        }
641                        bits=(uint8_t)((c&3)<<4);
642                        base64Counter=2;
643                        break;
644                    case 2:
645                        *target++=toBase64[bits|(c>>12)];
646                        if(target<targetLimit) {
647                            *target++=toBase64[(c>>6)&0x3f];
648                            if(target<targetLimit) {
649                                *target++=toBase64[c&0x3f];
650                                if(offsets!=NULL) {
651                                    *offsets++=sourceIndex;
652                                    *offsets++=sourceIndex;
653                                    *offsets++=sourceIndex++;
654                                }
655                            } else {
656                                if(offsets!=NULL) {
657                                    *offsets++=sourceIndex;
658                                    *offsets++=sourceIndex++;
659                                }
660                                cnv->charErrorBuffer[0]=toBase64[c&0x3f];
661                                cnv->charErrorBufferLength=1;
662                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
663                            }
664                        } else {
665                            if(offsets!=NULL) {
666                                *offsets++=sourceIndex++;
667                            }
668                            cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
669                            cnv->charErrorBuffer[1]=toBase64[c&0x3f];
670                            cnv->charErrorBufferLength=2;
671                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
672                        }
673                        bits=0;
674                        base64Counter=0;
675                        break;
676                    default:
677                        /* will never occur */
678                        break;
679                    }
680                }
681            } else {
682                /* target is full */
683                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
684                break;
685            }
686        }
687    }
688
689    if(pArgs->flush && source>=sourceLimit) {
690        /* flush remaining bits to the target */
691        if(!inDirectMode) {
692            if (base64Counter!=0) {
693                if(target<targetLimit) {
694                    *target++=toBase64[bits];
695                    if(offsets!=NULL) {
696                        *offsets++=sourceIndex-1;
697                    }
698                } else {
699                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
700                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
701                }
702            }
703            /* Add final MINUS to terminate unicodeMode */
704            if(target<targetLimit) {
705                *target++=MINUS;
706                if(offsets!=NULL) {
707                    *offsets++=sourceIndex-1;
708                }
709            } else {
710                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
711                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
712            }
713        }
714        /* reset the state for the next conversion */
715        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
716    } else {
717        /* set the converter state back into UConverter */
718        cnv->fromUnicodeStatus=
719            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
720            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
721    }
722
723    /* write back the updated pointers */
724    pArgs->source=source;
725    pArgs->target=(char *)target;
726    pArgs->offsets=offsets;
727    return;
728}
729
730static const char *
731_UTF7GetName(const UConverter *cnv) {
732    switch(cnv->fromUnicodeStatus>>28) {
733    case 1:
734        return "UTF-7,version=1";
735    default:
736        return "UTF-7";
737    }
738}
739
740static const UConverterImpl _UTF7Impl={
741    UCNV_UTF7,
742
743    NULL,
744    NULL,
745
746    _UTF7Open,
747    NULL,
748    _UTF7Reset,
749
750    _UTF7ToUnicodeWithOffsets,
751    _UTF7ToUnicodeWithOffsets,
752    _UTF7FromUnicodeWithOffsets,
753    _UTF7FromUnicodeWithOffsets,
754    NULL,
755
756    NULL,
757    _UTF7GetName,
758    NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
759    NULL,
760    ucnv_getCompleteUnicodeSet
761};
762
763static const UConverterStaticData _UTF7StaticData={
764    sizeof(UConverterStaticData),
765    "UTF-7",
766    0, /* TODO CCSID for UTF-7 */
767    UCNV_IBM, UCNV_UTF7,
768    1, 4,
769    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
770    FALSE, FALSE,
771    0,
772    0,
773    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
774};
775
776const UConverterSharedData _UTF7Data={
777    sizeof(UConverterSharedData), ~((uint32_t)0),
778    NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
779    0
780};
781
782/* IMAP mailbox name encoding ----------------------------------------------- */
783
784/*
785 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
786 * http://www.ietf.org/rfc/rfc2060.txt
787 *
788 * 5.1.3.  Mailbox International Naming Convention
789 *
790 * By convention, international mailbox names are specified using a
791 * modified version of the UTF-7 encoding described in [UTF-7].  The
792 * purpose of these modifications is to correct the following problems
793 * with UTF-7:
794 *
795 *    1) UTF-7 uses the "+" character for shifting; this conflicts with
796 *       the common use of "+" in mailbox names, in particular USENET
797 *       newsgroup names.
798 *
799 *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
800 *       conflicts with the use of "/" as a popular hierarchy delimiter.
801 *
802 *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
803 *       the use of "\" as a popular hierarchy delimiter.
804 *
805 *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
806 *       the use of "~" in some servers as a home directory indicator.
807 *
 *    5) UTF-7 permits multiple alternate forms to represent the same
 *       string; in particular, printable US-ASCII characters can be
 *       represented in encoded form.
811 *
812 * In modified UTF-7, printable US-ASCII characters except for "&"
813 * represent themselves; that is, characters with octet values 0x20-0x25
814 * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
815 * octet sequence "&-".
816 *
817 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
818 * Unicode 16-bit octets) are represented in modified BASE64, with a
819 * further modification from [UTF-7] that "," is used instead of "/".
820 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
821 * character which can represent itself.
822 *
 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 * is, a name that ends with a Unicode 16-bit octet MUST end with a
 * "-").
827 *
828 * For example, here is a mailbox name which mixes English, Japanese,
829 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
830 */
831
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Note: the Set D, Set O and illegal-byte lists below describe standard
 * UTF-7 (RFC 2152) and are repeated here for reference only.
 * The IMAP mailbox name encoding implemented in this section differs:
 * every byte 0x20..0x7e is legal (including BACKSLASH and TILDE), all of
 * them except "&" are encoded directly, and all C0 control codes
 * (including CR LF TAB), DEL and all codes >127 are illegal as bytes.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
854
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
/* ',' takes the place of '/' for the value 63 in the modified base64 alphabet */
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * The argument is parenthesized so that an expression argument
 * (for example a conditional expression) is compared as a whole.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/*
 * Map a 6-bit value to its modified-base64 byte and back.
 * The value 63 maps to ','; when decoding, ',' yields 63 and '/' yields -1.
 * Everything else goes through the toBase64[]/fromBase64[] tables.
 * These macros (and isLegalIMAP) evaluate their argument more than once,
 * so do not pass expressions with side effects.
 */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
868
869/*
870 * converter status values:
871 *
872 * toUnicodeStatus:
873 *     24 inDirectMode (boolean)
874 * 23..16 base64Counter (-1..7)
875 * 15..0  bits (up to 14 bits incoming base64)
876 *
877 * fromUnicodeStatus:
878 *     24 inDirectMode (boolean)
879 * 23..16 base64Counter (0..2)
880 *  7..0  bits (6 bits outgoing base64)
881 *
882 * ignore bits 31..25
883 */
884
885static void
886_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
887                          UErrorCode *pErrorCode) {
888    UConverter *cnv;
889    const uint8_t *source, *sourceLimit;
890    UChar *target;
891    const UChar *targetLimit;
892    int32_t *offsets;
893
894    uint8_t *bytes;
895    uint8_t byteIndex;
896
897    int32_t length, targetCapacity;
898
899    /* UTF-7 state */
900    uint16_t bits;
901    int8_t base64Counter;
902    UBool inDirectMode;
903
904    int8_t base64Value;
905
906    int32_t sourceIndex, nextSourceIndex;
907
908    UChar c;
909    uint8_t b;
910
911    /* set up the local pointers */
912    cnv=pArgs->converter;
913
914    source=(const uint8_t *)pArgs->source;
915    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
916    target=pArgs->target;
917    targetLimit=pArgs->targetLimit;
918    offsets=pArgs->offsets;
919    /* get the state machine state */
920    {
921        uint32_t status=cnv->toUnicodeStatus;
922        inDirectMode=(UBool)((status>>24)&1);
923        base64Counter=(int8_t)(status>>16);
924        bits=(uint16_t)status;
925    }
926    bytes=cnv->toUBytes;
927    byteIndex=cnv->toULength;
928
929    /* sourceIndex=-1 if the current character began in the previous buffer */
930    sourceIndex=byteIndex==0 ? 0 : -1;
931    nextSourceIndex=0;
932
933    if(inDirectMode) {
934directMode:
935        /*
936         * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
937         * with their US-ASCII byte values.
938         * An ampersand starts Unicode (or "escape") Mode.
939         *
940         * In Direct Mode, only the sourceIndex is used.
941         */
942        byteIndex=0;
943        length=(int32_t)(sourceLimit-source);
944        targetCapacity=(int32_t)(targetLimit-target);
945        if(length>targetCapacity) {
946            length=targetCapacity;
947        }
948        while(length>0) {
949            b=*source++;
950            if(!isLegalIMAP(b)) {
951                /* illegal */
952                bytes[0]=b;
953                byteIndex=1;
954                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
955                break;
956            } else if(b!=AMPERSAND) {
957                /* write directly encoded character */
958                *target++=b;
959                if(offsets!=NULL) {
960                    *offsets++=sourceIndex++;
961                }
962            } else /* AMPERSAND */ {
963                /* switch to Unicode mode */
964                nextSourceIndex=++sourceIndex;
965                inDirectMode=FALSE;
966                byteIndex=0;
967                bits=0;
968                base64Counter=-1;
969                goto unicodeMode;
970            }
971            --length;
972        }
973        if(source<sourceLimit && target>=targetLimit) {
974            /* target is full */
975            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
976        }
977    } else {
978unicodeMode:
979        /*
980         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
981         * The base64 sequence ends with any character that is not in the base64 alphabet.
982         * A terminating minus sign is consumed.
983         * US-ASCII must not be base64-ed.
984         *
985         * In Unicode Mode, the sourceIndex has the index to the start of the current
986         * base64 bytes, while nextSourceIndex is precisely parallel to source,
987         * keeping the index to the following byte.
988         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
989         */
990        while(source<sourceLimit) {
991            if(target<targetLimit) {
992                bytes[byteIndex++]=b=*source++;
993                ++nextSourceIndex;
994                if(b>0x7e) {
995                    /* illegal - test other illegal US-ASCII values by base64Value==-3 */
996                    inDirectMode=TRUE;
997                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
998                    break;
999                } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1000                    /* collect base64 bytes into UChars */
1001                    switch(base64Counter) {
1002                    case -1: /* -1 is immediately after the & */
1003                    case 0:
1004                        bits=base64Value;
1005                        base64Counter=1;
1006                        break;
1007                    case 1:
1008                    case 3:
1009                    case 4:
1010                    case 6:
1011                        bits=(uint16_t)((bits<<6)|base64Value);
1012                        ++base64Counter;
1013                        break;
1014                    case 2:
1015                        c=(UChar)((bits<<4)|(base64Value>>2));
1016                        if(isLegalIMAP(c)) {
1017                            /* illegal */
1018                            inDirectMode=TRUE;
1019                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1020                            goto endloop;
1021                        }
1022                        *target++=c;
1023                        if(offsets!=NULL) {
1024                            *offsets++=sourceIndex;
1025                            sourceIndex=nextSourceIndex-1;
1026                        }
1027                        bytes[0]=b; /* keep this byte in case an error occurs */
1028                        byteIndex=1;
1029                        bits=(uint16_t)(base64Value&3);
1030                        base64Counter=3;
1031                        break;
1032                    case 5:
1033                        c=(UChar)((bits<<2)|(base64Value>>4));
1034                        if(isLegalIMAP(c)) {
1035                            /* illegal */
1036                            inDirectMode=TRUE;
1037                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1038                            goto endloop;
1039                        }
1040                        *target++=c;
1041                        if(offsets!=NULL) {
1042                            *offsets++=sourceIndex;
1043                            sourceIndex=nextSourceIndex-1;
1044                        }
1045                        bytes[0]=b; /* keep this byte in case an error occurs */
1046                        byteIndex=1;
1047                        bits=(uint16_t)(base64Value&15);
1048                        base64Counter=6;
1049                        break;
1050                    case 7:
1051                        c=(UChar)((bits<<6)|base64Value);
1052                        if(isLegalIMAP(c)) {
1053                            /* illegal */
1054                            inDirectMode=TRUE;
1055                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1056                            goto endloop;
1057                        }
1058                        *target++=c;
1059                        if(offsets!=NULL) {
1060                            *offsets++=sourceIndex;
1061                            sourceIndex=nextSourceIndex;
1062                        }
1063                        byteIndex=0;
1064                        bits=0;
1065                        base64Counter=0;
1066                        break;
1067                    default:
1068                        /* will never occur */
1069                        break;
1070                    }
1071                } else if(base64Value==-2) {
1072                    /* minus sign terminates the base64 sequence */
1073                    inDirectMode=TRUE;
1074                    if(base64Counter==-1) {
1075                        /* &- i.e. a minus immediately following an ampersand */
1076                        *target++=AMPERSAND;
1077                        if(offsets!=NULL) {
1078                            *offsets++=sourceIndex-1;
1079                        }
1080                    } else {
1081                        /* absorb the minus and leave the Unicode Mode */
1082                        if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1083                            /* bits are illegally left over, a UChar is incomplete */
1084                            /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1085                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1086                            break;
1087                        }
1088                    }
1089                    sourceIndex=nextSourceIndex;
1090                    goto directMode;
1091                } else {
1092                    if(base64Counter==-1) {
1093                        /* illegal: & immediately followed by something other than base64 or minus sign */
1094                        /* include the ampersand in the reported sequence */
1095                        --sourceIndex;
1096                        bytes[0]=AMPERSAND;
1097                        bytes[1]=b;
1098                        byteIndex=2;
1099                    }
1100                    /* base64Value==-1 for characters that are illegal only in Unicode mode */
1101                    /* base64Value==-3 for illegal characters */
1102                    /* illegal */
1103                    inDirectMode=TRUE;
1104                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1105                    break;
1106                }
1107            } else {
1108                /* target is full */
1109                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1110                break;
1111            }
1112        }
1113    }
1114endloop:
1115
1116    /*
1117     * the end of the input stream and detection of truncated input
1118     * are handled by the framework, but here we must check if we are in Unicode
1119     * mode and byteIndex==0 because we must end in direct mode
1120     *
1121     * conditions:
1122     *   successful
1123     *   in Unicode mode and byteIndex==0
1124     *   end of input and no truncated input
1125     */
1126    if( U_SUCCESS(*pErrorCode) &&
1127        !inDirectMode && byteIndex==0 &&
1128        pArgs->flush && source>=sourceLimit
1129    ) {
1130        if(base64Counter==-1) {
1131            /* & at the very end of the input */
1132            /* make the ampersand the reported sequence */
1133            bytes[0]=AMPERSAND;
1134            byteIndex=1;
1135        }
1136        /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1137
1138        inDirectMode=TRUE; /* avoid looping */
1139        *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1140    }
1141
1142    /* set the converter state back into UConverter */
1143    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1144    cnv->toULength=byteIndex;
1145
1146    /* write back the updated pointers */
1147    pArgs->source=(const char *)source;
1148    pArgs->target=target;
1149    pArgs->offsets=offsets;
1150    return;
1151}
1152
1153static void
1154_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1155                            UErrorCode *pErrorCode) {
1156    UConverter *cnv;
1157    const UChar *source, *sourceLimit;
1158    uint8_t *target, *targetLimit;
1159    int32_t *offsets;
1160
1161    int32_t length, targetCapacity, sourceIndex;
1162    UChar c;
1163    uint8_t b;
1164
1165    /* UTF-7 state */
1166    uint8_t bits;
1167    int8_t base64Counter;
1168    UBool inDirectMode;
1169
1170    /* set up the local pointers */
1171    cnv=pArgs->converter;
1172
1173    /* set up the local pointers */
1174    source=pArgs->source;
1175    sourceLimit=pArgs->sourceLimit;
1176    target=(uint8_t *)pArgs->target;
1177    targetLimit=(uint8_t *)pArgs->targetLimit;
1178    offsets=pArgs->offsets;
1179
1180    /* get the state machine state */
1181    {
1182        uint32_t status=cnv->fromUnicodeStatus;
1183        inDirectMode=(UBool)((status>>24)&1);
1184        base64Counter=(int8_t)(status>>16);
1185        bits=(uint8_t)status;
1186    }
1187
1188    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1189    sourceIndex=0;
1190
1191    if(inDirectMode) {
1192directMode:
1193        length=(int32_t)(sourceLimit-source);
1194        targetCapacity=(int32_t)(targetLimit-target);
1195        if(length>targetCapacity) {
1196            length=targetCapacity;
1197        }
1198        while(length>0) {
1199            c=*source++;
1200            /* encode 0x20..0x7e except '&' directly */
1201            if(inSetDIMAP(c)) {
1202                /* encode directly */
1203                *target++=(uint8_t)c;
1204                if(offsets!=NULL) {
1205                    *offsets++=sourceIndex++;
1206                }
1207            } else if(c==AMPERSAND) {
1208                /* output &- for & */
1209                *target++=AMPERSAND;
1210                if(target<targetLimit) {
1211                    *target++=MINUS;
1212                    if(offsets!=NULL) {
1213                        *offsets++=sourceIndex;
1214                        *offsets++=sourceIndex++;
1215                    }
1216                    /* realign length and targetCapacity */
1217                    goto directMode;
1218                } else {
1219                    if(offsets!=NULL) {
1220                        *offsets++=sourceIndex++;
1221                    }
1222                    cnv->charErrorBuffer[0]=MINUS;
1223                    cnv->charErrorBufferLength=1;
1224                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1225                    break;
1226                }
1227            } else {
1228                /* un-read this character and switch to Unicode Mode */
1229                --source;
1230                *target++=AMPERSAND;
1231                if(offsets!=NULL) {
1232                    *offsets++=sourceIndex;
1233                }
1234                inDirectMode=FALSE;
1235                base64Counter=0;
1236                goto unicodeMode;
1237            }
1238            --length;
1239        }
1240        if(source<sourceLimit && target>=targetLimit) {
1241            /* target is full */
1242            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1243        }
1244    } else {
1245unicodeMode:
1246        while(source<sourceLimit) {
1247            if(target<targetLimit) {
1248                c=*source++;
1249                if(isLegalIMAP(c)) {
1250                    /* encode directly */
1251                    inDirectMode=TRUE;
1252
1253                    /* trick: back out this character to make this easier */
1254                    --source;
1255
1256                    /* terminate the base64 sequence */
1257                    if(base64Counter!=0) {
1258                        /* write remaining bits for the previous character */
1259                        *target++=TO_BASE64_IMAP(bits);
1260                        if(offsets!=NULL) {
1261                            *offsets++=sourceIndex-1;
1262                        }
1263                    }
1264                    /* need to terminate with a minus */
1265                    if(target<targetLimit) {
1266                        *target++=MINUS;
1267                        if(offsets!=NULL) {
1268                            *offsets++=sourceIndex-1;
1269                        }
1270                    } else {
1271                        cnv->charErrorBuffer[0]=MINUS;
1272                        cnv->charErrorBufferLength=1;
1273                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1274                        break;
1275                    }
1276                    goto directMode;
1277                } else {
1278                    /*
1279                     * base64 this character:
1280                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1281                     * and the bits of this character, each implicitly in UTF-16BE.
1282                     *
1283                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1284                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
1285                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1286                     */
1287                    switch(base64Counter) {
1288                    case 0:
1289                        b=(uint8_t)(c>>10);
1290                        *target++=TO_BASE64_IMAP(b);
1291                        if(target<targetLimit) {
1292                            b=(uint8_t)((c>>4)&0x3f);
1293                            *target++=TO_BASE64_IMAP(b);
1294                            if(offsets!=NULL) {
1295                                *offsets++=sourceIndex;
1296                                *offsets++=sourceIndex++;
1297                            }
1298                        } else {
1299                            if(offsets!=NULL) {
1300                                *offsets++=sourceIndex++;
1301                            }
1302                            b=(uint8_t)((c>>4)&0x3f);
1303                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1304                            cnv->charErrorBufferLength=1;
1305                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1306                        }
1307                        bits=(uint8_t)((c&15)<<2);
1308                        base64Counter=1;
1309                        break;
1310                    case 1:
1311                        b=(uint8_t)(bits|(c>>14));
1312                        *target++=TO_BASE64_IMAP(b);
1313                        if(target<targetLimit) {
1314                            b=(uint8_t)((c>>8)&0x3f);
1315                            *target++=TO_BASE64_IMAP(b);
1316                            if(target<targetLimit) {
1317                                b=(uint8_t)((c>>2)&0x3f);
1318                                *target++=TO_BASE64_IMAP(b);
1319                                if(offsets!=NULL) {
1320                                    *offsets++=sourceIndex;
1321                                    *offsets++=sourceIndex;
1322                                    *offsets++=sourceIndex++;
1323                                }
1324                            } else {
1325                                if(offsets!=NULL) {
1326                                    *offsets++=sourceIndex;
1327                                    *offsets++=sourceIndex++;
1328                                }
1329                                b=(uint8_t)((c>>2)&0x3f);
1330                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1331                                cnv->charErrorBufferLength=1;
1332                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1333                            }
1334                        } else {
1335                            if(offsets!=NULL) {
1336                                *offsets++=sourceIndex++;
1337                            }
1338                            b=(uint8_t)((c>>8)&0x3f);
1339                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340                            b=(uint8_t)((c>>2)&0x3f);
1341                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1342                            cnv->charErrorBufferLength=2;
1343                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1344                        }
1345                        bits=(uint8_t)((c&3)<<4);
1346                        base64Counter=2;
1347                        break;
1348                    case 2:
1349                        b=(uint8_t)(bits|(c>>12));
1350                        *target++=TO_BASE64_IMAP(b);
1351                        if(target<targetLimit) {
1352                            b=(uint8_t)((c>>6)&0x3f);
1353                            *target++=TO_BASE64_IMAP(b);
1354                            if(target<targetLimit) {
1355                                b=(uint8_t)(c&0x3f);
1356                                *target++=TO_BASE64_IMAP(b);
1357                                if(offsets!=NULL) {
1358                                    *offsets++=sourceIndex;
1359                                    *offsets++=sourceIndex;
1360                                    *offsets++=sourceIndex++;
1361                                }
1362                            } else {
1363                                if(offsets!=NULL) {
1364                                    *offsets++=sourceIndex;
1365                                    *offsets++=sourceIndex++;
1366                                }
1367                                b=(uint8_t)(c&0x3f);
1368                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1369                                cnv->charErrorBufferLength=1;
1370                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1371                            }
1372                        } else {
1373                            if(offsets!=NULL) {
1374                                *offsets++=sourceIndex++;
1375                            }
1376                            b=(uint8_t)((c>>6)&0x3f);
1377                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378                            b=(uint8_t)(c&0x3f);
1379                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1380                            cnv->charErrorBufferLength=2;
1381                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1382                        }
1383                        bits=0;
1384                        base64Counter=0;
1385                        break;
1386                    default:
1387                        /* will never occur */
1388                        break;
1389                    }
1390                }
1391            } else {
1392                /* target is full */
1393                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1394                break;
1395            }
1396        }
1397    }
1398
1399    if(pArgs->flush && source>=sourceLimit) {
1400        /* flush remaining bits to the target */
1401        if(!inDirectMode) {
1402            if(base64Counter!=0) {
1403                if(target<targetLimit) {
1404                    *target++=TO_BASE64_IMAP(bits);
1405                    if(offsets!=NULL) {
1406                        *offsets++=sourceIndex-1;
1407                    }
1408                } else {
1409                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1410                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1411                }
1412            }
1413            /* need to terminate with a minus */
1414            if(target<targetLimit) {
1415                *target++=MINUS;
1416                if(offsets!=NULL) {
1417                    *offsets++=sourceIndex-1;
1418                }
1419            } else {
1420                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1421                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1422            }
1423        }
1424        /* reset the state for the next conversion */
1425        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1426    } else {
1427        /* set the converter state back into UConverter */
1428        cnv->fromUnicodeStatus=
1429            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1430            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1431    }
1432
1433    /* write back the updated pointers */
1434    pArgs->source=source;
1435    pArgs->target=(char *)target;
1436    pArgs->offsets=offsets;
1437    return;
1438}
1439
1440static const UConverterImpl _IMAPImpl={
1441    UCNV_IMAP_MAILBOX,
1442
1443    NULL,
1444    NULL,
1445
1446    _UTF7Open,
1447    NULL,
1448    _UTF7Reset,
1449
1450    _IMAPToUnicodeWithOffsets,
1451    _IMAPToUnicodeWithOffsets,
1452    _IMAPFromUnicodeWithOffsets,
1453    _IMAPFromUnicodeWithOffsets,
1454    NULL,
1455
1456    NULL,
1457    NULL,
1458    NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1459    NULL,
1460    ucnv_getCompleteUnicodeSet
1461};
1462
1463static const UConverterStaticData _IMAPStaticData={
1464    sizeof(UConverterStaticData),
1465    "IMAP-mailbox-name",
1466    0, /* TODO CCSID for IMAP-mailbox-name */
1467    UCNV_IBM, UCNV_IMAP_MAILBOX,
1468    1, 4,
1469    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1470    FALSE, FALSE,
1471    0,
1472    0,
1473    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1474};
1475
1476const UConverterSharedData _IMAPData={
1477    sizeof(UConverterSharedData), ~((uint32_t)0),
1478    NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1479    0
1480};
1481
1482#endif
1483