1/*
2**********************************************************************
3*   Copyright (C) 2002-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv_u8.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2002jul01
12*   created by: Markus W. Scherer
13*
14*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
15*
16*   Also, CESU-8 implementation, see UTR 26.
17*   The CESU-8 converter uses all the same functions as the
18*   UTF-8 converter, with a branch for converting supplementary code points.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_CONVERSION
24
25#include "unicode/ucnv.h"
26#include "unicode/utf.h"
27#include "unicode/utf8.h"
28#include "unicode/utf16.h"
29#include "ucnv_bld.h"
30#include "ucnv_cnv.h"
31#include "cmemory.h"
32
33/* Prototypes --------------------------------------------------------------- */
34
35/* Keep these here to make finicky compilers happy */
36
37U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
38                                           UErrorCode *err);
39U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
40                                                        UErrorCode *err);
41
42
43/* UTF-8 -------------------------------------------------------------------- */
44
45/* UTF-8 Conversion DATA
46 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
47 */
48/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Largest value that still fits into a single 16-bit code unit. */
#define MAXIMUM_UCS2            0xFFFF
/* Largest code point the UTF-8 validity checks accept (U+10FFFF). */
#define MAXIMUM_UTF             0x10FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/*
 * Surrogate-pair arithmetic: a supplementary code point has HALF_BASE
 * subtracted, then is split into two HALF_SHIFT-bit halves (HALF_MASK selects
 * the lower half) that are added to the surrogate start values below.
 */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF

/* First and last code unit of the high and of the low surrogate range. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* Precomputed HALF_BASE - SURROGATE_LOW_START (0x10000 - 0xDC00 = 9216). */
#define SURROGATE_LOW_BASE      0x2400
62
/*
 * Indexed by the total length of a UTF-8 sequence. The decoders accumulate a
 * sequence as ((lead << 6) + trail) << 6 + trail ... in 32-bit unsigned
 * arithmetic; subtracting the entry for the sequence length removes the
 * marker bits of the lead byte and of every trail byte in one step.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0x00000000u,    /* length 0 */
    0x00000000u,    /* length 1 */
    0x00003080u,    /* length 2: (0xC0 << 6) + 0x80 */
    0x000E2080u,    /* length 3 */
    0x03C82080u,    /* length 4 */
    0xFA082080u,    /* length 5 */
    0x82082080u     /* length 6 (lead-byte bits wrap modulo 2^32) */
};
67
68/* END OF UTF-8 Conversion DATA */
69
/*
 * Total sequence length announced by a first byte, indexed by that byte.
 * 0 marks bytes that cannot start a sequence (0x80..0xBF and 0xFE/0xFF).
 * Lengths 5 and 6 are still listed here; the decoders reject them later
 * through utf8_minChar32[].
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
80
81/*
82 * Starting with Unicode 3.0.1:
83 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
84 * byte sequences with more than 4 bytes are illegal in UTF-8,
85 * which is tested with impossible values for them
86 */
/*
 * Smallest code point a sequence of the given total length may encode.
 * The entries for lengths 5 and 6 are larger than any code point the decoders
 * accept, so such sequences can never pass the minimum-value check.
 */
static const uint32_t utf8_minChar32[7] = {
    0x00000000u,    /* length 0 */
    0x00000000u,    /* length 1 */
    0x00000080u,    /* length 2 */
    0x00000800u,    /* length 3 */
    0x00010000u,    /* length 4 */
    0xFFFFFFFFu,    /* length 5: impossible value */
    0xFFFFFFFFu     /* length 6: impossible value */
};
89
90static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
91                                  UErrorCode * err)
92{
93    UConverter *cnv = args->converter;
94    const unsigned char *mySource = (unsigned char *) args->source;
95    UChar *myTarget = args->target;
96    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
97    const UChar *targetLimit = args->targetLimit;
98    unsigned char *toUBytes = cnv->toUBytes;
99    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
100    uint32_t ch, ch2 = 0;
101    int32_t i, inBytes;
102
103    /* Restore size of current sequence */
104    if (cnv->toUnicodeStatus && myTarget < targetLimit)
105    {
106        inBytes = cnv->mode;            /* restore # of bytes to consume */
107        i = cnv->toULength;             /* restore # of bytes consumed */
108        cnv->toULength = 0;
109
110        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
111        cnv->toUnicodeStatus = 0;
112        goto morebytes;
113    }
114
115
116    while (mySource < sourceLimit && myTarget < targetLimit)
117    {
118        ch = *(mySource++);
119        if (ch < 0x80)        /* Simple case */
120        {
121            *(myTarget++) = (UChar) ch;
122        }
123        else
124        {
125            /* store the first char */
126            toUBytes[0] = (char)ch;
127            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
128            i = 1;
129
130morebytes:
131            while (i < inBytes)
132            {
133                if (mySource < sourceLimit)
134                {
135                    toUBytes[i] = (char) (ch2 = *mySource);
136                    if (!U8_IS_TRAIL(ch2))
137                    {
138                        break; /* i < inBytes */
139                    }
140                    ch = (ch << 6) + ch2;
141                    ++mySource;
142                    i++;
143                }
144                else
145                {
146                    /* stores a partially calculated target*/
147                    cnv->toUnicodeStatus = ch;
148                    cnv->mode = inBytes;
149                    cnv->toULength = (int8_t) i;
150                    goto donefornow;
151                }
152            }
153
154            /* Remove the accumulated high bits */
155            ch -= offsetsFromUTF8[inBytes];
156
157            /*
158             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
159             * - use only trail bytes after a lead byte (checked above)
160             * - use the right number of trail bytes for a given lead byte
161             * - encode a code point <= U+10ffff
162             * - use the fewest possible number of bytes for their code points
163             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
164             *
165             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
166             * There are no irregular sequences any more.
167             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
168             */
169            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
170                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
171            {
172                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
173                if (ch <= MAXIMUM_UCS2)
174                {
175                    /* fits in 16 bits */
176                    *(myTarget++) = (UChar) ch;
177                }
178                else
179                {
180                    /* write out the surrogates */
181                    ch -= HALF_BASE;
182                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
183                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
184                    if (myTarget < targetLimit)
185                    {
186                        *(myTarget++) = (UChar)ch;
187                    }
188                    else
189                    {
190                        /* Put in overflow buffer (not handled here) */
191                        cnv->UCharErrorBuffer[0] = (UChar) ch;
192                        cnv->UCharErrorBufferLength = 1;
193                        *err = U_BUFFER_OVERFLOW_ERROR;
194                        break;
195                    }
196                }
197            }
198            else
199            {
200                cnv->toULength = (int8_t)i;
201                *err = U_ILLEGAL_CHAR_FOUND;
202                break;
203            }
204        }
205    }
206
207donefornow:
208    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
209    {
210        /* End of target buffer */
211        *err = U_BUFFER_OVERFLOW_ERROR;
212    }
213
214    args->target = myTarget;
215    args->source = (const char *) mySource;
216}
217
218static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
219                                                UErrorCode * err)
220{
221    UConverter *cnv = args->converter;
222    const unsigned char *mySource = (unsigned char *) args->source;
223    UChar *myTarget = args->target;
224    int32_t *myOffsets = args->offsets;
225    int32_t offsetNum = 0;
226    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
227    const UChar *targetLimit = args->targetLimit;
228    unsigned char *toUBytes = cnv->toUBytes;
229    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
230    uint32_t ch, ch2 = 0;
231    int32_t i, inBytes;
232
233    /* Restore size of current sequence */
234    if (cnv->toUnicodeStatus && myTarget < targetLimit)
235    {
236        inBytes = cnv->mode;            /* restore # of bytes to consume */
237        i = cnv->toULength;             /* restore # of bytes consumed */
238        cnv->toULength = 0;
239
240        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
241        cnv->toUnicodeStatus = 0;
242        goto morebytes;
243    }
244
245    while (mySource < sourceLimit && myTarget < targetLimit)
246    {
247        ch = *(mySource++);
248        if (ch < 0x80)        /* Simple case */
249        {
250            *(myTarget++) = (UChar) ch;
251            *(myOffsets++) = offsetNum++;
252        }
253        else
254        {
255            toUBytes[0] = (char)ch;
256            inBytes = bytesFromUTF8[ch];
257            i = 1;
258
259morebytes:
260            while (i < inBytes)
261            {
262                if (mySource < sourceLimit)
263                {
264                    toUBytes[i] = (char) (ch2 = *mySource);
265                    if (!U8_IS_TRAIL(ch2))
266                    {
267                        break; /* i < inBytes */
268                    }
269                    ch = (ch << 6) + ch2;
270                    ++mySource;
271                    i++;
272                }
273                else
274                {
275                    cnv->toUnicodeStatus = ch;
276                    cnv->mode = inBytes;
277                    cnv->toULength = (int8_t)i;
278                    goto donefornow;
279                }
280            }
281
282            /* Remove the accumulated high bits */
283            ch -= offsetsFromUTF8[inBytes];
284
285            /*
286             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
287             * - use only trail bytes after a lead byte (checked above)
288             * - use the right number of trail bytes for a given lead byte
289             * - encode a code point <= U+10ffff
290             * - use the fewest possible number of bytes for their code points
291             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
292             *
293             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
294             * There are no irregular sequences any more.
295             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
296             */
297            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
298                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
299            {
300                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
301                if (ch <= MAXIMUM_UCS2)
302                {
303                    /* fits in 16 bits */
304                    *(myTarget++) = (UChar) ch;
305                    *(myOffsets++) = offsetNum;
306                }
307                else
308                {
309                    /* write out the surrogates */
310                    ch -= HALF_BASE;
311                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
312                    *(myOffsets++) = offsetNum;
313                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
314                    if (myTarget < targetLimit)
315                    {
316                        *(myTarget++) = (UChar)ch;
317                        *(myOffsets++) = offsetNum;
318                    }
319                    else
320                    {
321                        cnv->UCharErrorBuffer[0] = (UChar) ch;
322                        cnv->UCharErrorBufferLength = 1;
323                        *err = U_BUFFER_OVERFLOW_ERROR;
324                    }
325                }
326                offsetNum += i;
327            }
328            else
329            {
330                cnv->toULength = (int8_t)i;
331                *err = U_ILLEGAL_CHAR_FOUND;
332                break;
333            }
334        }
335    }
336
337donefornow:
338    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
339    {   /* End of target buffer */
340        *err = U_BUFFER_OVERFLOW_ERROR;
341    }
342
343    args->target = myTarget;
344    args->source = (const char *) mySource;
345    args->offsets = myOffsets;
346}
347
348U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
349                                    UErrorCode * err)
350{
351    UConverter *cnv = args->converter;
352    const UChar *mySource = args->source;
353    const UChar *sourceLimit = args->sourceLimit;
354    uint8_t *myTarget = (uint8_t *) args->target;
355    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
356    uint8_t *tempPtr;
357    UChar32 ch;
358    uint8_t tempBuf[4];
359    int32_t indexToWrite;
360    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
361
362    if (cnv->fromUChar32 && myTarget < targetLimit)
363    {
364        ch = cnv->fromUChar32;
365        cnv->fromUChar32 = 0;
366        goto lowsurrogate;
367    }
368
369    while (mySource < sourceLimit && myTarget < targetLimit)
370    {
371        ch = *(mySource++);
372
373        if (ch < 0x80)        /* Single byte */
374        {
375            *(myTarget++) = (uint8_t) ch;
376        }
377        else if (ch < 0x800)  /* Double byte */
378        {
379            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
380            if (myTarget < targetLimit)
381            {
382                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
383            }
384            else
385            {
386                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
387                cnv->charErrorBufferLength = 1;
388                *err = U_BUFFER_OVERFLOW_ERROR;
389            }
390        }
391        else {
392            /* Check for surrogates */
393            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
394lowsurrogate:
395                if (mySource < sourceLimit) {
396                    /* test both code units */
397                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
398                        /* convert and consume this supplementary code point */
399                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
400                        ++mySource;
401                        /* exit this condition tree */
402                    }
403                    else {
404                        /* this is an unpaired trail or lead code unit */
405                        /* callback(illegal) */
406                        cnv->fromUChar32 = ch;
407                        *err = U_ILLEGAL_CHAR_FOUND;
408                        break;
409                    }
410                }
411                else {
412                    /* no more input */
413                    cnv->fromUChar32 = ch;
414                    break;
415                }
416            }
417
418            /* Do we write the buffer directly for speed,
419            or do we have to be careful about target buffer space? */
420            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
421
422            if (ch <= MAXIMUM_UCS2) {
423                indexToWrite = 2;
424                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
425            }
426            else {
427                indexToWrite = 3;
428                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
429                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
430            }
431            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
432            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
433
434            if (tempPtr == myTarget) {
435                /* There was enough space to write the codepoint directly. */
436                myTarget += (indexToWrite + 1);
437            }
438            else {
439                /* We might run out of room soon. Write it slowly. */
440                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
441                    if (myTarget < targetLimit) {
442                        *(myTarget++) = *tempPtr;
443                    }
444                    else {
445                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
446                        *err = U_BUFFER_OVERFLOW_ERROR;
447                    }
448                }
449            }
450        }
451    }
452
453    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
454    {
455        *err = U_BUFFER_OVERFLOW_ERROR;
456    }
457
458    args->target = (char *) myTarget;
459    args->source = mySource;
460}
461
462U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
463                                                  UErrorCode * err)
464{
465    UConverter *cnv = args->converter;
466    const UChar *mySource = args->source;
467    int32_t *myOffsets = args->offsets;
468    const UChar *sourceLimit = args->sourceLimit;
469    uint8_t *myTarget = (uint8_t *) args->target;
470    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
471    uint8_t *tempPtr;
472    UChar32 ch;
473    int32_t offsetNum, nextSourceIndex;
474    int32_t indexToWrite;
475    uint8_t tempBuf[4];
476    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
477
478    if (cnv->fromUChar32 && myTarget < targetLimit)
479    {
480        ch = cnv->fromUChar32;
481        cnv->fromUChar32 = 0;
482        offsetNum = -1;
483        nextSourceIndex = 0;
484        goto lowsurrogate;
485    } else {
486        offsetNum = 0;
487    }
488
489    while (mySource < sourceLimit && myTarget < targetLimit)
490    {
491        ch = *(mySource++);
492
493        if (ch < 0x80)        /* Single byte */
494        {
495            *(myOffsets++) = offsetNum++;
496            *(myTarget++) = (char) ch;
497        }
498        else if (ch < 0x800)  /* Double byte */
499        {
500            *(myOffsets++) = offsetNum;
501            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
502            if (myTarget < targetLimit)
503            {
504                *(myOffsets++) = offsetNum++;
505                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
506            }
507            else
508            {
509                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
510                cnv->charErrorBufferLength = 1;
511                *err = U_BUFFER_OVERFLOW_ERROR;
512            }
513        }
514        else
515        /* Check for surrogates */
516        {
517            nextSourceIndex = offsetNum + 1;
518
519            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
520lowsurrogate:
521                if (mySource < sourceLimit) {
522                    /* test both code units */
523                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
524                        /* convert and consume this supplementary code point */
525                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
526                        ++mySource;
527                        ++nextSourceIndex;
528                        /* exit this condition tree */
529                    }
530                    else {
531                        /* this is an unpaired trail or lead code unit */
532                        /* callback(illegal) */
533                        cnv->fromUChar32 = ch;
534                        *err = U_ILLEGAL_CHAR_FOUND;
535                        break;
536                    }
537                }
538                else {
539                    /* no more input */
540                    cnv->fromUChar32 = ch;
541                    break;
542                }
543            }
544
545            /* Do we write the buffer directly for speed,
546            or do we have to be careful about target buffer space? */
547            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
548
549            if (ch <= MAXIMUM_UCS2) {
550                indexToWrite = 2;
551                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
552            }
553            else {
554                indexToWrite = 3;
555                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
556                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
557            }
558            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
559            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
560
561            if (tempPtr == myTarget) {
562                /* There was enough space to write the codepoint directly. */
563                myTarget += (indexToWrite + 1);
564                myOffsets[0] = offsetNum;
565                myOffsets[1] = offsetNum;
566                myOffsets[2] = offsetNum;
567                if (indexToWrite >= 3) {
568                    myOffsets[3] = offsetNum;
569                }
570                myOffsets += (indexToWrite + 1);
571            }
572            else {
573                /* We might run out of room soon. Write it slowly. */
574                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
575                    if (myTarget < targetLimit)
576                    {
577                        *(myOffsets++) = offsetNum;
578                        *(myTarget++) = *tempPtr;
579                    }
580                    else
581                    {
582                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
583                        *err = U_BUFFER_OVERFLOW_ERROR;
584                    }
585                }
586            }
587            offsetNum = nextSourceIndex;
588        }
589    }
590
591    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
592    {
593        *err = U_BUFFER_OVERFLOW_ERROR;
594    }
595
596    args->target = (char *) myTarget;
597    args->source = mySource;
598    args->offsets = myOffsets;
599}
600
601static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
602                                               UErrorCode *err) {
603    UConverter *cnv;
604    const uint8_t *sourceInitial;
605    const uint8_t *source;
606    uint16_t extraBytesToWrite;
607    uint8_t myByte;
608    UChar32 ch;
609    int8_t i, isLegalSequence;
610
611    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
612
613    cnv = args->converter;
614    sourceInitial = source = (const uint8_t *)args->source;
615    if (source >= (const uint8_t *)args->sourceLimit)
616    {
617        /* no input */
618        *err = U_INDEX_OUTOFBOUNDS_ERROR;
619        return 0xffff;
620    }
621
622    myByte = (uint8_t)*(source++);
623    if (myByte < 0x80)
624    {
625        args->source = (const char *)source;
626        return (UChar32)myByte;
627    }
628
629    extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
630    if (extraBytesToWrite == 0) {
631        cnv->toUBytes[0] = myByte;
632        cnv->toULength = 1;
633        *err = U_ILLEGAL_CHAR_FOUND;
634        args->source = (const char *)source;
635        return 0xffff;
636    }
637
638    /*The byte sequence is longer than the buffer area passed*/
639    if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
640    {
641        /* check if all of the remaining bytes are trail bytes */
642        cnv->toUBytes[0] = myByte;
643        i = 1;
644        *err = U_TRUNCATED_CHAR_FOUND;
645        while(source < (const uint8_t *)args->sourceLimit) {
646            if(U8_IS_TRAIL(myByte = *source)) {
647                cnv->toUBytes[i++] = myByte;
648                ++source;
649            } else {
650                /* error even before we run out of input */
651                *err = U_ILLEGAL_CHAR_FOUND;
652                break;
653            }
654        }
655        cnv->toULength = i;
656        args->source = (const char *)source;
657        return 0xffff;
658    }
659
660    isLegalSequence = 1;
661    ch = myByte << 6;
662    switch(extraBytesToWrite)
663    {
664      /* note: code falls through cases! (sic)*/
665    case 6:
666        ch += (myByte = *source);
667        ch <<= 6;
668        if (!U8_IS_TRAIL(myByte))
669        {
670            isLegalSequence = 0;
671            break;
672        }
673        ++source;
674    case 5: /*fall through*/
675        ch += (myByte = *source);
676        ch <<= 6;
677        if (!U8_IS_TRAIL(myByte))
678        {
679            isLegalSequence = 0;
680            break;
681        }
682        ++source;
683    case 4: /*fall through*/
684        ch += (myByte = *source);
685        ch <<= 6;
686        if (!U8_IS_TRAIL(myByte))
687        {
688            isLegalSequence = 0;
689            break;
690        }
691        ++source;
692    case 3: /*fall through*/
693        ch += (myByte = *source);
694        ch <<= 6;
695        if (!U8_IS_TRAIL(myByte))
696        {
697            isLegalSequence = 0;
698            break;
699        }
700        ++source;
701    case 2: /*fall through*/
702        ch += (myByte = *source);
703        if (!U8_IS_TRAIL(myByte))
704        {
705            isLegalSequence = 0;
706            break;
707        }
708        ++source;
709    };
710    ch -= offsetsFromUTF8[extraBytesToWrite];
711    args->source = (const char *)source;
712
713    /*
714     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
715     * - use only trail bytes after a lead byte (checked above)
716     * - use the right number of trail bytes for a given lead byte
717     * - encode a code point <= U+10ffff
718     * - use the fewest possible number of bytes for their code points
719     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
720     *
721     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
722     * There are no irregular sequences any more.
723     */
724    if (isLegalSequence &&
725        (uint32_t)ch <= MAXIMUM_UTF &&
726        (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
727        !U_IS_SURROGATE(ch)
728    ) {
729        return ch; /* return the code point */
730    }
731
732    for(i = 0; sourceInitial < source; ++i) {
733        cnv->toUBytes[i] = *sourceInitial++;
734    }
735    cnv->toULength = i;
736    *err = U_ILLEGAL_CHAR_FOUND;
737    return 0xffff;
738}
739
740/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
741
742/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
743static const UChar32
744utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
745
746/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
747static const UChar32
748utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
749
750/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
751static void
752ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
753                  UConverterToUnicodeArgs *pToUArgs,
754                  UErrorCode *pErrorCode) {
755    UConverter *utf8;
756    const uint8_t *source, *sourceLimit;
757    uint8_t *target;
758    int32_t targetCapacity;
759    int32_t count;
760
761    int8_t oldToULength, toULength, toULimit;
762
763    UChar32 c;
764    uint8_t b, t1, t2;
765
766    /* set up the local pointers */
767    utf8=pToUArgs->converter;
768    source=(uint8_t *)pToUArgs->source;
769    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
770    target=(uint8_t *)pFromUArgs->target;
771    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
772
773    /* get the converter state from the UTF-8 UConverter */
774    c=(UChar32)utf8->toUnicodeStatus;
775    if(c!=0) {
776        toULength=oldToULength=utf8->toULength;
777        toULimit=(int8_t)utf8->mode;
778    } else {
779        toULength=oldToULength=toULimit=0;
780    }
781
782    count=(int32_t)(sourceLimit-source)+oldToULength;
783    if(count<toULimit) {
784        /*
785         * Not enough input to complete the partial character.
786         * Jump to moreBytes below - it will not output to target.
787         */
788    } else if(targetCapacity<toULimit) {
789        /*
790         * Not enough target capacity to output the partial character.
791         * Let the standard converter handle this.
792         */
793        *pErrorCode=U_USING_DEFAULT_WARNING;
794        return;
795    } else {
796        /*
797         * Use a single counter for source and target, counting the minimum of
798         * the source length and the target capacity.
799         * As a result, the source length is checked only once per multi-byte
800         * character instead of twice.
801         *
802         * Make sure that the last byte sequence is complete, or else
803         * stop just before it.
804         * (The longest legal byte sequence has 3 trail bytes.)
805         * Count oldToULength (number of source bytes from a previous buffer)
806         * into the source length but reduce the source index by toULimit
807         * while going back over trail bytes in order to not go back into
808         * the bytes that will be read for finishing a partial
809         * sequence from the previous buffer.
810         * Let the standard converter handle edge cases.
811         */
812        int32_t i;
813
814        if(count>targetCapacity) {
815            count=targetCapacity;
816        }
817
818        i=0;
819        while(i<3 && i<(count-toULimit)) {
820            b=source[count-oldToULength-i-1];
821            if(U8_IS_TRAIL(b)) {
822                ++i;
823            } else {
824                if(i<U8_COUNT_TRAIL_BYTES(b)) {
825                    /* stop converting before the lead byte if there are not enough trail bytes for it */
826                    count-=i+1;
827                }
828                break;
829            }
830        }
831    }
832
833    if(c!=0) {
834        utf8->toUnicodeStatus=0;
835        utf8->toULength=0;
836        goto moreBytes;
837        /* See note in ucnv_SBCSFromUTF8() about this goto. */
838    }
839
840    /* conversion loop */
841    while(count>0) {
842        b=*source++;
843        if((int8_t)b>=0) {
844            /* convert ASCII */
845            *target++=b;
846            --count;
847            continue;
848        } else {
849            if(b>0xe0) {
850                if( /* handle U+1000..U+D7FF inline */
851                    (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
852                                               (b==0xed && (t1 <= 0x9f))) &&
853                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
854                ) {
855                    source+=2;
856                    *target++=b;
857                    *target++=t1;
858                    *target++=t2;
859                    count-=3;
860                    continue;
861                }
862            } else if(b<0xe0) {
863                if( /* handle U+0080..U+07FF inline */
864                    b>=0xc2 &&
865                    (t1=*source) >= 0x80 && t1 <= 0xbf
866                ) {
867                    ++source;
868                    *target++=b;
869                    *target++=t1;
870                    count-=2;
871                    continue;
872                }
873            } else if(b==0xe0) {
874                if( /* handle U+0800..U+0FFF inline */
875                    (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
876                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
877                ) {
878                    source+=2;
879                    *target++=b;
880                    *target++=t1;
881                    *target++=t2;
882                    count-=3;
883                    continue;
884                }
885            }
886
887            /* handle "complicated" and error cases, and continuing partial characters */
888            oldToULength=0;
889            toULength=1;
890            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
891            c=b;
892moreBytes:
893            while(toULength<toULimit) {
894                if(source<sourceLimit) {
895                    b=*source;
896                    if(U8_IS_TRAIL(b)) {
897                        ++source;
898                        ++toULength;
899                        c=(c<<6)+b;
900                    } else {
901                        break; /* sequence too short, stop with toULength<toULimit */
902                    }
903                } else {
904                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
905                    source-=(toULength-oldToULength);
906                    while(oldToULength<toULength) {
907                        utf8->toUBytes[oldToULength++]=*source++;
908                    }
909                    utf8->toUnicodeStatus=c;
910                    utf8->toULength=toULength;
911                    utf8->mode=toULimit;
912                    pToUArgs->source=(char *)source;
913                    pFromUArgs->target=(char *)target;
914                    return;
915                }
916            }
917
918            if( toULength==toULimit &&      /* consumed all trail bytes */
919                (toULength==3 || toULength==2) &&             /* BMP */
920                (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
921                (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
922            ) {
923                /* legal byte sequence for BMP code point */
924            } else if(
925                toULength==toULimit && toULength==4 &&
926                (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
927            ) {
928                /* legal byte sequence for supplementary code point */
929            } else {
930                /* error handling: illegal UTF-8 byte sequence */
931                source-=(toULength-oldToULength);
932                while(oldToULength<toULength) {
933                    utf8->toUBytes[oldToULength++]=*source++;
934                }
935                utf8->toULength=toULength;
936                pToUArgs->source=(char *)source;
937                pFromUArgs->target=(char *)target;
938                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
939                return;
940            }
941
942            /* copy the legal byte sequence to the target */
943            {
944                int8_t i;
945
946                for(i=0; i<oldToULength; ++i) {
947                    *target++=utf8->toUBytes[i];
948                }
949                source-=(toULength-oldToULength);
950                for(; i<toULength; ++i) {
951                    *target++=*source++;
952                }
953                count-=toULength;
954            }
955        }
956    }
957
958    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
959        if(target==(const uint8_t *)pFromUArgs->targetLimit) {
960            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
961        } else {
962            b=*source;
963            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
964            if(toULimit>(sourceLimit-source)) {
965                /* collect a truncated byte sequence */
966                toULength=0;
967                c=b;
968                for(;;) {
969                    utf8->toUBytes[toULength++]=b;
970                    if(++source==sourceLimit) {
971                        /* partial byte sequence at end of source */
972                        utf8->toUnicodeStatus=c;
973                        utf8->toULength=toULength;
974                        utf8->mode=toULimit;
975                        break;
976                    } else if(!U8_IS_TRAIL(b=*source)) {
977                        /* lead byte in trail byte position */
978                        utf8->toULength=toULength;
979                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
980                        break;
981                    }
982                    c=(c<<6)+b;
983                }
984            } else {
985                /* partial-sequence target overflow: fall back to the pivoting implementation */
986                *pErrorCode=U_USING_DEFAULT_WARNING;
987            }
988        }
989    }
990
991    /* write back the updated pointers */
992    pToUArgs->source=(char *)source;
993    pFromUArgs->target=(char *)target;
994}
995
996/* UTF-8 converter data ----------------------------------------------------- */
997
/*
 * Implementation table for the UTF-8 converter.
 *
 * The initializers are positional: their order has to match the member
 * order of UConverterImpl, which is declared outside this file
 * (presumably in ucnv_bld.h/ucnv_cnv.h -- confirm there before adding,
 * removing or moving an entry).
 * A NULL entry means that this converter supplies no function of its own
 * for that slot.
 */
static const UConverterImpl _UTF8Impl={
    UCNV_UTF8, /* converter type constant */

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /*
     * Conversion entry points. Judging by the names: to-Unicode and
     * from-Unicode, each in a plain and an offsets-tracking variant,
     * followed by the UTF-8 specific getNextUChar function.
     */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_getNextUChar_UTF8,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    /*
     * The same UTF-8-to-UTF-8 function is installed in both trailing slots
     * (presumably the "to UTF-8" and "from UTF-8" shortcuts -- confirm
     * against the UConverterImpl declaration).
     */
    ucnv_UTF8FromUTF8,
    ucnv_UTF8FromUTF8
};
1023
/*
 * Static (constant) description of the UTF-8 converter.
 * CCSID 1208 refers to UTF-8 for any version of Unicode.
 *
 * The initializers are positional and must follow the member order of
 * UConverterStaticData (declared outside this file).
 */
static const UConverterStaticData _UTF8StaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "UTF-8", /* converter name */
    1208, UCNV_IBM, UCNV_UTF8, /* CCSID 1208, then two enum constants (presumably platform and converter type -- confirm) */
    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    /*
     * EF BF BD is the UTF-8 encoding of U+FFFD; the following 3 matches the
     * number of non-zero bytes (presumably the substitution sequence and its
     * length -- confirm). The meaning of the two FALSE flags is not visible here.
     */
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1035
1036
/*
 * Shared-data object for the UTF-8 converter.
 * It has external linkage (not static), so other translation units can
 * refer to it. It ties together the static description (_UTF8StaticData)
 * and the implementation table (_UTF8Impl) defined above.
 *
 * The initializers are positional and must follow the member order of
 * UConverterSharedData (declared outside this file). The second value is
 * an all-ones uint32_t (presumably a reference counter preset so that this
 * constant object is never released -- confirm against the declaration).
 */
const UConverterSharedData _UTF8Data={
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
    0
};
1042
1043/* CESU-8 converter data ---------------------------------------------------- */
1044
/*
 * Implementation table for the CESU-8 converter.
 *
 * CESU-8 reuses the UTF-8 conversion functions (see the note at the top of
 * this file). Compared with _UTF8Impl, this table
 *  - has NULL where _UTF8Impl installs ucnv_getNextUChar_UTF8,
 *  - uses ucnv_getCompleteUnicodeSet instead of ucnv_getNonSurrogateUnicodeSet,
 *  - lists two fewer initializers, so the remaining trailing members of
 *    UConverterImpl are zero-initialized by the compiler (i.e. NULL).
 *
 * The initializers are positional and must follow the member order of
 * UConverterImpl (declared outside this file).
 */
static const UConverterImpl _CESU8Impl={
    UCNV_CESU8, /* converter type constant */

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* same to-/from-Unicode functions as the UTF-8 converter */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    NULL, /* no CESU-8 specific function in the slot where UTF-8 has ucnv_getNextUChar_UTF8 */

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet
};
1067
/*
 * Static (constant) description of the CESU-8 converter.
 *
 * The initializers are positional and must follow the member order of
 * UConverterStaticData (declared outside this file). The layout mirrors
 * _UTF8StaticData; only the name, the CCSID and the two enum constants differ.
 */
static const UConverterStaticData _CESU8StaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "CESU-8", /* converter name */
    9400, /* CCSID for CESU-8 */
    UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* same 1, 3 values as in _UTF8StaticData (there: max 3 bytes per UChar) */
    /*
     * EF BF BD is the UTF-8 encoding of U+FFFD; the following 3 matches the
     * number of non-zero bytes (presumably the substitution sequence and its
     * length -- confirm). The meaning of the two FALSE flags is not visible here.
     */
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1078
1079
/*
 * Shared-data object for the CESU-8 converter.
 * It has external linkage (not static), so other translation units can
 * refer to it. It ties together the static description (_CESU8StaticData)
 * and the implementation table (_CESU8Impl) defined above.
 *
 * The initializers are positional and must follow the member order of
 * UConverterSharedData (declared outside this file). The second value is
 * an all-ones uint32_t (presumably a reference counter preset so that this
 * constant object is never released -- confirm against the declaration).
 */
const UConverterSharedData _CESU8Data={
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
    0
};
1085
1086#endif
1087