1/*
2**********************************************************************
3*   Copyright (C) 2002-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv_u8.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2002jul01
12*   created by: Markus W. Scherer
13*
14*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
15*
16*   Also, CESU-8 implementation, see UTR 26.
17*   The CESU-8 converter uses all the same functions as the
18*   UTF-8 converter, with a branch for converting supplementary code points.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_CONVERSION
24
25#include "unicode/ucnv.h"
26#include "unicode/utf.h"
27#include "unicode/utf8.h"
28#include "unicode/utf16.h"
29#include "ucnv_bld.h"
30#include "ucnv_cnv.h"
31#include "cmemory.h"
32
33/* Prototypes --------------------------------------------------------------- */
34
35/* Keep these here to make finicky compilers happy */
36
37U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
38                                           UErrorCode *err);
39U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
40                                                        UErrorCode *err);
41
42
43/* UTF-8 -------------------------------------------------------------------- */
44
45/* UTF-8 Conversion DATA
46 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
47 */
48/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Upper limits of the value ranges used by the converters below. */
#define MAXIMUM_UCS2            0xFFFF
#define MAXIMUM_UTF             0x10FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Constants for splitting a supplementary code point into a surrogate pair. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF

/* Boundaries of the lead (high) and trail (low) surrogate ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* Equals HALF_BASE - SURROGATE_LOW_START (0x10000 - 0xDC00 = 0x2400). */
#define SURROGATE_LOW_BASE      9216
62
/*
 * Indexed by total sequence length (0..6). Each entry is the sum of the
 * lead-byte and trail-byte marker bits that pile up when a sequence is
 * accumulated as ((lead << 6) + trail) << 6 + trail ..., computed modulo 2^32;
 * subtracting it leaves the encoded value.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0x00000000U,    /* 0: no sequence */
    0x00000000U,    /* 1 byte */
    0x00003080U,    /* 2 bytes */
    0x000E2080U,    /* 3 bytes */
    0x03C82080U,    /* 4 bytes */
    0xFA082080U,    /* 5 bytes */
    0x82082080U     /* 6 bytes */
};
67
68/* END OF UTF-8 Conversion DATA */
69
/*
 * Total sequence length announced by each possible first byte.
 * 0 marks bytes that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF).
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single bytes */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: trail bytes, never a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte leads */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte leads */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: none */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
80
/*
 * Since Unicode 3.0.1 a UTF-8 sequence of N bytes must encode a code point
 * that is at least utf8_minChar32[N] (no overlong forms).
 * Sequences longer than 4 bytes are illegal; their entries hold a value that
 * no decoded code point can reach, so the range test always rejects them.
 */
static const uint32_t utf8_minChar32[7] = {
    0,              /* N=0 */
    0,              /* N=1 */
    0x80,           /* N=2 */
    0x800,          /* N=3 */
    0x10000,        /* N=4 */
    0xffffffff,     /* N=5: always rejected */
    0xffffffff      /* N=6: always rejected */
};
89
90static UBool hasCESU8Data(const UConverter *cnv)
91{
92#if UCONFIG_NO_NON_HTML5_CONVERSION
93    return FALSE;
94#else
95    return (UBool)(cnv->sharedData == &_CESU8Data);
96#endif
97}
98
99static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
100                                  UErrorCode * err)
101{
102    UConverter *cnv = args->converter;
103    const unsigned char *mySource = (unsigned char *) args->source;
104    UChar *myTarget = args->target;
105    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
106    const UChar *targetLimit = args->targetLimit;
107    unsigned char *toUBytes = cnv->toUBytes;
108    UBool isCESU8 = hasCESU8Data(cnv);
109    uint32_t ch, ch2 = 0;
110    int32_t i, inBytes;
111
112    /* Restore size of current sequence */
113    if (cnv->toUnicodeStatus && myTarget < targetLimit)
114    {
115        inBytes = cnv->mode;            /* restore # of bytes to consume */
116        i = cnv->toULength;             /* restore # of bytes consumed */
117        cnv->toULength = 0;
118
119        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
120        cnv->toUnicodeStatus = 0;
121        goto morebytes;
122    }
123
124
125    while (mySource < sourceLimit && myTarget < targetLimit)
126    {
127        ch = *(mySource++);
128        if (ch < 0x80)        /* Simple case */
129        {
130            *(myTarget++) = (UChar) ch;
131        }
132        else
133        {
134            /* store the first char */
135            toUBytes[0] = (char)ch;
136            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
137            i = 1;
138
139morebytes:
140            while (i < inBytes)
141            {
142                if (mySource < sourceLimit)
143                {
144                    toUBytes[i] = (char) (ch2 = *mySource);
145                    if (!U8_IS_TRAIL(ch2))
146                    {
147                        break; /* i < inBytes */
148                    }
149                    ch = (ch << 6) + ch2;
150                    ++mySource;
151                    i++;
152                }
153                else
154                {
155                    /* stores a partially calculated target*/
156                    cnv->toUnicodeStatus = ch;
157                    cnv->mode = inBytes;
158                    cnv->toULength = (int8_t) i;
159                    goto donefornow;
160                }
161            }
162
163            /* Remove the accumulated high bits */
164            ch -= offsetsFromUTF8[inBytes];
165
166            /*
167             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
168             * - use only trail bytes after a lead byte (checked above)
169             * - use the right number of trail bytes for a given lead byte
170             * - encode a code point <= U+10ffff
171             * - use the fewest possible number of bytes for their code points
172             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
173             *
174             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
175             * There are no irregular sequences any more.
176             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
177             */
178            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
179                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
180            {
181                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
182                if (ch <= MAXIMUM_UCS2)
183                {
184                    /* fits in 16 bits */
185                    *(myTarget++) = (UChar) ch;
186                }
187                else
188                {
189                    /* write out the surrogates */
190                    ch -= HALF_BASE;
191                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
192                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
193                    if (myTarget < targetLimit)
194                    {
195                        *(myTarget++) = (UChar)ch;
196                    }
197                    else
198                    {
199                        /* Put in overflow buffer (not handled here) */
200                        cnv->UCharErrorBuffer[0] = (UChar) ch;
201                        cnv->UCharErrorBufferLength = 1;
202                        *err = U_BUFFER_OVERFLOW_ERROR;
203                        break;
204                    }
205                }
206            }
207            else
208            {
209                cnv->toULength = (int8_t)i;
210                *err = U_ILLEGAL_CHAR_FOUND;
211                break;
212            }
213        }
214    }
215
216donefornow:
217    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
218    {
219        /* End of target buffer */
220        *err = U_BUFFER_OVERFLOW_ERROR;
221    }
222
223    args->target = myTarget;
224    args->source = (const char *) mySource;
225}
226
227static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
228                                                UErrorCode * err)
229{
230    UConverter *cnv = args->converter;
231    const unsigned char *mySource = (unsigned char *) args->source;
232    UChar *myTarget = args->target;
233    int32_t *myOffsets = args->offsets;
234    int32_t offsetNum = 0;
235    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
236    const UChar *targetLimit = args->targetLimit;
237    unsigned char *toUBytes = cnv->toUBytes;
238    UBool isCESU8 = hasCESU8Data(cnv);
239    uint32_t ch, ch2 = 0;
240    int32_t i, inBytes;
241
242    /* Restore size of current sequence */
243    if (cnv->toUnicodeStatus && myTarget < targetLimit)
244    {
245        inBytes = cnv->mode;            /* restore # of bytes to consume */
246        i = cnv->toULength;             /* restore # of bytes consumed */
247        cnv->toULength = 0;
248
249        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
250        cnv->toUnicodeStatus = 0;
251        goto morebytes;
252    }
253
254    while (mySource < sourceLimit && myTarget < targetLimit)
255    {
256        ch = *(mySource++);
257        if (ch < 0x80)        /* Simple case */
258        {
259            *(myTarget++) = (UChar) ch;
260            *(myOffsets++) = offsetNum++;
261        }
262        else
263        {
264            toUBytes[0] = (char)ch;
265            inBytes = bytesFromUTF8[ch];
266            i = 1;
267
268morebytes:
269            while (i < inBytes)
270            {
271                if (mySource < sourceLimit)
272                {
273                    toUBytes[i] = (char) (ch2 = *mySource);
274                    if (!U8_IS_TRAIL(ch2))
275                    {
276                        break; /* i < inBytes */
277                    }
278                    ch = (ch << 6) + ch2;
279                    ++mySource;
280                    i++;
281                }
282                else
283                {
284                    cnv->toUnicodeStatus = ch;
285                    cnv->mode = inBytes;
286                    cnv->toULength = (int8_t)i;
287                    goto donefornow;
288                }
289            }
290
291            /* Remove the accumulated high bits */
292            ch -= offsetsFromUTF8[inBytes];
293
294            /*
295             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
296             * - use only trail bytes after a lead byte (checked above)
297             * - use the right number of trail bytes for a given lead byte
298             * - encode a code point <= U+10ffff
299             * - use the fewest possible number of bytes for their code points
300             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
301             *
302             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
303             * There are no irregular sequences any more.
304             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
305             */
306            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
307                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
308            {
309                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
310                if (ch <= MAXIMUM_UCS2)
311                {
312                    /* fits in 16 bits */
313                    *(myTarget++) = (UChar) ch;
314                    *(myOffsets++) = offsetNum;
315                }
316                else
317                {
318                    /* write out the surrogates */
319                    ch -= HALF_BASE;
320                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
321                    *(myOffsets++) = offsetNum;
322                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
323                    if (myTarget < targetLimit)
324                    {
325                        *(myTarget++) = (UChar)ch;
326                        *(myOffsets++) = offsetNum;
327                    }
328                    else
329                    {
330                        cnv->UCharErrorBuffer[0] = (UChar) ch;
331                        cnv->UCharErrorBufferLength = 1;
332                        *err = U_BUFFER_OVERFLOW_ERROR;
333                    }
334                }
335                offsetNum += i;
336            }
337            else
338            {
339                cnv->toULength = (int8_t)i;
340                *err = U_ILLEGAL_CHAR_FOUND;
341                break;
342            }
343        }
344    }
345
346donefornow:
347    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
348    {   /* End of target buffer */
349        *err = U_BUFFER_OVERFLOW_ERROR;
350    }
351
352    args->target = myTarget;
353    args->source = (const char *) mySource;
354    args->offsets = myOffsets;
355}
356
357U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
358                                    UErrorCode * err)
359{
360    UConverter *cnv = args->converter;
361    const UChar *mySource = args->source;
362    const UChar *sourceLimit = args->sourceLimit;
363    uint8_t *myTarget = (uint8_t *) args->target;
364    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
365    uint8_t *tempPtr;
366    UChar32 ch;
367    uint8_t tempBuf[4];
368    int32_t indexToWrite;
369    UBool isNotCESU8 = !hasCESU8Data(cnv);
370
371    if (cnv->fromUChar32 && myTarget < targetLimit)
372    {
373        ch = cnv->fromUChar32;
374        cnv->fromUChar32 = 0;
375        goto lowsurrogate;
376    }
377
378    while (mySource < sourceLimit && myTarget < targetLimit)
379    {
380        ch = *(mySource++);
381
382        if (ch < 0x80)        /* Single byte */
383        {
384            *(myTarget++) = (uint8_t) ch;
385        }
386        else if (ch < 0x800)  /* Double byte */
387        {
388            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
389            if (myTarget < targetLimit)
390            {
391                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
392            }
393            else
394            {
395                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
396                cnv->charErrorBufferLength = 1;
397                *err = U_BUFFER_OVERFLOW_ERROR;
398            }
399        }
400        else {
401            /* Check for surrogates */
402            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
403lowsurrogate:
404                if (mySource < sourceLimit) {
405                    /* test both code units */
406                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
407                        /* convert and consume this supplementary code point */
408                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
409                        ++mySource;
410                        /* exit this condition tree */
411                    }
412                    else {
413                        /* this is an unpaired trail or lead code unit */
414                        /* callback(illegal) */
415                        cnv->fromUChar32 = ch;
416                        *err = U_ILLEGAL_CHAR_FOUND;
417                        break;
418                    }
419                }
420                else {
421                    /* no more input */
422                    cnv->fromUChar32 = ch;
423                    break;
424                }
425            }
426
427            /* Do we write the buffer directly for speed,
428            or do we have to be careful about target buffer space? */
429            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
430
431            if (ch <= MAXIMUM_UCS2) {
432                indexToWrite = 2;
433                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
434            }
435            else {
436                indexToWrite = 3;
437                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
438                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
439            }
440            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
441            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
442
443            if (tempPtr == myTarget) {
444                /* There was enough space to write the codepoint directly. */
445                myTarget += (indexToWrite + 1);
446            }
447            else {
448                /* We might run out of room soon. Write it slowly. */
449                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
450                    if (myTarget < targetLimit) {
451                        *(myTarget++) = *tempPtr;
452                    }
453                    else {
454                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
455                        *err = U_BUFFER_OVERFLOW_ERROR;
456                    }
457                }
458            }
459        }
460    }
461
462    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
463    {
464        *err = U_BUFFER_OVERFLOW_ERROR;
465    }
466
467    args->target = (char *) myTarget;
468    args->source = mySource;
469}
470
471U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
472                                                  UErrorCode * err)
473{
474    UConverter *cnv = args->converter;
475    const UChar *mySource = args->source;
476    int32_t *myOffsets = args->offsets;
477    const UChar *sourceLimit = args->sourceLimit;
478    uint8_t *myTarget = (uint8_t *) args->target;
479    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
480    uint8_t *tempPtr;
481    UChar32 ch;
482    int32_t offsetNum, nextSourceIndex;
483    int32_t indexToWrite;
484    uint8_t tempBuf[4];
485    UBool isNotCESU8 = !hasCESU8Data(cnv);
486
487    if (cnv->fromUChar32 && myTarget < targetLimit)
488    {
489        ch = cnv->fromUChar32;
490        cnv->fromUChar32 = 0;
491        offsetNum = -1;
492        nextSourceIndex = 0;
493        goto lowsurrogate;
494    } else {
495        offsetNum = 0;
496    }
497
498    while (mySource < sourceLimit && myTarget < targetLimit)
499    {
500        ch = *(mySource++);
501
502        if (ch < 0x80)        /* Single byte */
503        {
504            *(myOffsets++) = offsetNum++;
505            *(myTarget++) = (char) ch;
506        }
507        else if (ch < 0x800)  /* Double byte */
508        {
509            *(myOffsets++) = offsetNum;
510            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
511            if (myTarget < targetLimit)
512            {
513                *(myOffsets++) = offsetNum++;
514                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
515            }
516            else
517            {
518                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
519                cnv->charErrorBufferLength = 1;
520                *err = U_BUFFER_OVERFLOW_ERROR;
521            }
522        }
523        else
524        /* Check for surrogates */
525        {
526            nextSourceIndex = offsetNum + 1;
527
528            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
529lowsurrogate:
530                if (mySource < sourceLimit) {
531                    /* test both code units */
532                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
533                        /* convert and consume this supplementary code point */
534                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
535                        ++mySource;
536                        ++nextSourceIndex;
537                        /* exit this condition tree */
538                    }
539                    else {
540                        /* this is an unpaired trail or lead code unit */
541                        /* callback(illegal) */
542                        cnv->fromUChar32 = ch;
543                        *err = U_ILLEGAL_CHAR_FOUND;
544                        break;
545                    }
546                }
547                else {
548                    /* no more input */
549                    cnv->fromUChar32 = ch;
550                    break;
551                }
552            }
553
554            /* Do we write the buffer directly for speed,
555            or do we have to be careful about target buffer space? */
556            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
557
558            if (ch <= MAXIMUM_UCS2) {
559                indexToWrite = 2;
560                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
561            }
562            else {
563                indexToWrite = 3;
564                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
565                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
566            }
567            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
568            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
569
570            if (tempPtr == myTarget) {
571                /* There was enough space to write the codepoint directly. */
572                myTarget += (indexToWrite + 1);
573                myOffsets[0] = offsetNum;
574                myOffsets[1] = offsetNum;
575                myOffsets[2] = offsetNum;
576                if (indexToWrite >= 3) {
577                    myOffsets[3] = offsetNum;
578                }
579                myOffsets += (indexToWrite + 1);
580            }
581            else {
582                /* We might run out of room soon. Write it slowly. */
583                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
584                    if (myTarget < targetLimit)
585                    {
586                        *(myOffsets++) = offsetNum;
587                        *(myTarget++) = *tempPtr;
588                    }
589                    else
590                    {
591                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
592                        *err = U_BUFFER_OVERFLOW_ERROR;
593                    }
594                }
595            }
596            offsetNum = nextSourceIndex;
597        }
598    }
599
600    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
601    {
602        *err = U_BUFFER_OVERFLOW_ERROR;
603    }
604
605    args->target = (char *) myTarget;
606    args->source = mySource;
607    args->offsets = myOffsets;
608}
609
610static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
611                                               UErrorCode *err) {
612    UConverter *cnv;
613    const uint8_t *sourceInitial;
614    const uint8_t *source;
615    uint16_t extraBytesToWrite;
616    uint8_t myByte;
617    UChar32 ch;
618    int8_t i, isLegalSequence;
619
620    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
621
622    cnv = args->converter;
623    sourceInitial = source = (const uint8_t *)args->source;
624    if (source >= (const uint8_t *)args->sourceLimit)
625    {
626        /* no input */
627        *err = U_INDEX_OUTOFBOUNDS_ERROR;
628        return 0xffff;
629    }
630
631    myByte = (uint8_t)*(source++);
632    if (myByte < 0x80)
633    {
634        args->source = (const char *)source;
635        return (UChar32)myByte;
636    }
637
638    extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
639    if (extraBytesToWrite == 0) {
640        cnv->toUBytes[0] = myByte;
641        cnv->toULength = 1;
642        *err = U_ILLEGAL_CHAR_FOUND;
643        args->source = (const char *)source;
644        return 0xffff;
645    }
646
647    /*The byte sequence is longer than the buffer area passed*/
648    if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
649    {
650        /* check if all of the remaining bytes are trail bytes */
651        cnv->toUBytes[0] = myByte;
652        i = 1;
653        *err = U_TRUNCATED_CHAR_FOUND;
654        while(source < (const uint8_t *)args->sourceLimit) {
655            if(U8_IS_TRAIL(myByte = *source)) {
656                cnv->toUBytes[i++] = myByte;
657                ++source;
658            } else {
659                /* error even before we run out of input */
660                *err = U_ILLEGAL_CHAR_FOUND;
661                break;
662            }
663        }
664        cnv->toULength = i;
665        args->source = (const char *)source;
666        return 0xffff;
667    }
668
669    isLegalSequence = 1;
670    ch = myByte << 6;
671    switch(extraBytesToWrite)
672    {
673      /* note: code falls through cases! (sic)*/
674    case 6:
675        ch += (myByte = *source);
676        ch <<= 6;
677        if (!U8_IS_TRAIL(myByte))
678        {
679            isLegalSequence = 0;
680            break;
681        }
682        ++source;
683    case 5: /*fall through*/
684        ch += (myByte = *source);
685        ch <<= 6;
686        if (!U8_IS_TRAIL(myByte))
687        {
688            isLegalSequence = 0;
689            break;
690        }
691        ++source;
692    case 4: /*fall through*/
693        ch += (myByte = *source);
694        ch <<= 6;
695        if (!U8_IS_TRAIL(myByte))
696        {
697            isLegalSequence = 0;
698            break;
699        }
700        ++source;
701    case 3: /*fall through*/
702        ch += (myByte = *source);
703        ch <<= 6;
704        if (!U8_IS_TRAIL(myByte))
705        {
706            isLegalSequence = 0;
707            break;
708        }
709        ++source;
710    case 2: /*fall through*/
711        ch += (myByte = *source);
712        if (!U8_IS_TRAIL(myByte))
713        {
714            isLegalSequence = 0;
715            break;
716        }
717        ++source;
718    };
719    ch -= offsetsFromUTF8[extraBytesToWrite];
720    args->source = (const char *)source;
721
722    /*
723     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
724     * - use only trail bytes after a lead byte (checked above)
725     * - use the right number of trail bytes for a given lead byte
726     * - encode a code point <= U+10ffff
727     * - use the fewest possible number of bytes for their code points
728     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
729     *
730     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
731     * There are no irregular sequences any more.
732     */
733    if (isLegalSequence &&
734        (uint32_t)ch <= MAXIMUM_UTF &&
735        (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
736        !U_IS_SURROGATE(ch)
737    ) {
738        return ch; /* return the code point */
739    }
740
741    for(i = 0; sourceInitial < source; ++i) {
742        cnv->toUBytes[i] = *sourceInitial++;
743    }
744    cnv->toULength = i;
745    *err = U_ILLEGAL_CHAR_FOUND;
746    return 0xffff;
747}
748
749/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
750
751/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
752static const UChar32
753utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
754
755/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
756static const UChar32
757utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
758
759/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
760static void
761ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
762                  UConverterToUnicodeArgs *pToUArgs,
763                  UErrorCode *pErrorCode) {
764    UConverter *utf8;
765    const uint8_t *source, *sourceLimit;
766    uint8_t *target;
767    int32_t targetCapacity;
768    int32_t count;
769
770    int8_t oldToULength, toULength, toULimit;
771
772    UChar32 c;
773    uint8_t b, t1, t2;
774
775    /* set up the local pointers */
776    utf8=pToUArgs->converter;
777    source=(uint8_t *)pToUArgs->source;
778    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
779    target=(uint8_t *)pFromUArgs->target;
780    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
781
782    /* get the converter state from the UTF-8 UConverter */
783    c=(UChar32)utf8->toUnicodeStatus;
784    if(c!=0) {
785        toULength=oldToULength=utf8->toULength;
786        toULimit=(int8_t)utf8->mode;
787    } else {
788        toULength=oldToULength=toULimit=0;
789    }
790
791    count=(int32_t)(sourceLimit-source)+oldToULength;
792    if(count<toULimit) {
793        /*
794         * Not enough input to complete the partial character.
795         * Jump to moreBytes below - it will not output to target.
796         */
797    } else if(targetCapacity<toULimit) {
798        /*
799         * Not enough target capacity to output the partial character.
800         * Let the standard converter handle this.
801         */
802        *pErrorCode=U_USING_DEFAULT_WARNING;
803        return;
804    } else {
805        /*
806         * Use a single counter for source and target, counting the minimum of
807         * the source length and the target capacity.
808         * As a result, the source length is checked only once per multi-byte
809         * character instead of twice.
810         *
811         * Make sure that the last byte sequence is complete, or else
812         * stop just before it.
813         * (The longest legal byte sequence has 3 trail bytes.)
814         * Count oldToULength (number of source bytes from a previous buffer)
815         * into the source length but reduce the source index by toULimit
816         * while going back over trail bytes in order to not go back into
817         * the bytes that will be read for finishing a partial
818         * sequence from the previous buffer.
819         * Let the standard converter handle edge cases.
820         */
821        int32_t i;
822
823        if(count>targetCapacity) {
824            count=targetCapacity;
825        }
826
827        i=0;
828        while(i<3 && i<(count-toULimit)) {
829            b=source[count-oldToULength-i-1];
830            if(U8_IS_TRAIL(b)) {
831                ++i;
832            } else {
833                if(i<U8_COUNT_TRAIL_BYTES(b)) {
834                    /* stop converting before the lead byte if there are not enough trail bytes for it */
835                    count-=i+1;
836                }
837                break;
838            }
839        }
840    }
841
842    if(c!=0) {
843        utf8->toUnicodeStatus=0;
844        utf8->toULength=0;
845        goto moreBytes;
846        /* See note in ucnv_SBCSFromUTF8() about this goto. */
847    }
848
849    /* conversion loop */
850    while(count>0) {
851        b=*source++;
852        if((int8_t)b>=0) {
853            /* convert ASCII */
854            *target++=b;
855            --count;
856            continue;
857        } else {
858            if(b>0xe0) {
859                if( /* handle U+1000..U+D7FF inline */
860                    (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
861                                               (b==0xed && (t1 <= 0x9f))) &&
862                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
863                ) {
864                    source+=2;
865                    *target++=b;
866                    *target++=t1;
867                    *target++=t2;
868                    count-=3;
869                    continue;
870                }
871            } else if(b<0xe0) {
872                if( /* handle U+0080..U+07FF inline */
873                    b>=0xc2 &&
874                    (t1=*source) >= 0x80 && t1 <= 0xbf
875                ) {
876                    ++source;
877                    *target++=b;
878                    *target++=t1;
879                    count-=2;
880                    continue;
881                }
882            } else if(b==0xe0) {
883                if( /* handle U+0800..U+0FFF inline */
884                    (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
885                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
886                ) {
887                    source+=2;
888                    *target++=b;
889                    *target++=t1;
890                    *target++=t2;
891                    count-=3;
892                    continue;
893                }
894            }
895
896            /* handle "complicated" and error cases, and continuing partial characters */
897            oldToULength=0;
898            toULength=1;
899            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
900            c=b;
901moreBytes:
902            while(toULength<toULimit) {
903                if(source<sourceLimit) {
904                    b=*source;
905                    if(U8_IS_TRAIL(b)) {
906                        ++source;
907                        ++toULength;
908                        c=(c<<6)+b;
909                    } else {
910                        break; /* sequence too short, stop with toULength<toULimit */
911                    }
912                } else {
913                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
914                    source-=(toULength-oldToULength);
915                    while(oldToULength<toULength) {
916                        utf8->toUBytes[oldToULength++]=*source++;
917                    }
918                    utf8->toUnicodeStatus=c;
919                    utf8->toULength=toULength;
920                    utf8->mode=toULimit;
921                    pToUArgs->source=(char *)source;
922                    pFromUArgs->target=(char *)target;
923                    return;
924                }
925            }
926
927            if( toULength==toULimit &&      /* consumed all trail bytes */
928                (toULength==3 || toULength==2) &&             /* BMP */
929                (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
930                (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
931            ) {
932                /* legal byte sequence for BMP code point */
933            } else if(
934                toULength==toULimit && toULength==4 &&
935                (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
936            ) {
937                /* legal byte sequence for supplementary code point */
938            } else {
939                /* error handling: illegal UTF-8 byte sequence */
940                source-=(toULength-oldToULength);
941                while(oldToULength<toULength) {
942                    utf8->toUBytes[oldToULength++]=*source++;
943                }
944                utf8->toULength=toULength;
945                pToUArgs->source=(char *)source;
946                pFromUArgs->target=(char *)target;
947                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
948                return;
949            }
950
951            /* copy the legal byte sequence to the target */
952            {
953                int8_t i;
954
955                for(i=0; i<oldToULength; ++i) {
956                    *target++=utf8->toUBytes[i];
957                }
958                source-=(toULength-oldToULength);
959                for(; i<toULength; ++i) {
960                    *target++=*source++;
961                }
962                count-=toULength;
963            }
964        }
965    }
966
967    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
968        if(target==(const uint8_t *)pFromUArgs->targetLimit) {
969            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
970        } else {
971            b=*source;
972            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
973            if(toULimit>(sourceLimit-source)) {
974                /* collect a truncated byte sequence */
975                toULength=0;
976                c=b;
977                for(;;) {
978                    utf8->toUBytes[toULength++]=b;
979                    if(++source==sourceLimit) {
980                        /* partial byte sequence at end of source */
981                        utf8->toUnicodeStatus=c;
982                        utf8->toULength=toULength;
983                        utf8->mode=toULimit;
984                        break;
985                    } else if(!U8_IS_TRAIL(b=*source)) {
986                        /* lead byte in trail byte position */
987                        utf8->toULength=toULength;
988                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
989                        break;
990                    }
991                    c=(c<<6)+b;
992                }
993            } else {
994                /* partial-sequence target overflow: fall back to the pivoting implementation */
995                *pErrorCode=U_USING_DEFAULT_WARNING;
996            }
997        }
998    }
999
1000    /* write back the updated pointers */
1001    pToUArgs->source=(char *)source;
1002    pFromUArgs->target=(char *)target;
1003}
1004
1005/* UTF-8 converter data ----------------------------------------------------- */
1006
1007static const UConverterImpl _UTF8Impl={
1008    UCNV_UTF8,
1009
1010    NULL,
1011    NULL,
1012
1013    NULL,
1014    NULL,
1015    NULL,
1016
1017    ucnv_toUnicode_UTF8,
1018    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1019    ucnv_fromUnicode_UTF8,
1020    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1021    ucnv_getNextUChar_UTF8,
1022
1023    NULL,
1024    NULL,
1025    NULL,
1026    NULL,
1027    ucnv_getNonSurrogateUnicodeSet,
1028
1029    ucnv_UTF8FromUTF8,
1030    ucnv_UTF8FromUTF8
1031};
1032
1033/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1034static const UConverterStaticData _UTF8StaticData={
1035    sizeof(UConverterStaticData),
1036    "UTF-8",
1037    1208, UCNV_IBM, UCNV_UTF8,
1038    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1039    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1040    0,
1041    0,
1042    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1043};
1044
1045
1046const UConverterSharedData _UTF8Data={
1047    sizeof(UConverterSharedData), ~((uint32_t) 0),
1048    NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
1049    0
1050};
1051
1052/* CESU-8 converter data ---------------------------------------------------- */
1053
1054static const UConverterImpl _CESU8Impl={
1055    UCNV_CESU8,
1056
1057    NULL,
1058    NULL,
1059
1060    NULL,
1061    NULL,
1062    NULL,
1063
1064    ucnv_toUnicode_UTF8,
1065    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1066    ucnv_fromUnicode_UTF8,
1067    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1068    NULL,
1069
1070    NULL,
1071    NULL,
1072    NULL,
1073    NULL,
1074    ucnv_getCompleteUnicodeSet
1075};
1076
1077static const UConverterStaticData _CESU8StaticData={
1078    sizeof(UConverterStaticData),
1079    "CESU-8",
1080    9400, /* CCSID for CESU-8 */
1081    UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1082    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1083    0,
1084    0,
1085    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1086};
1087
1088
1089const UConverterSharedData _CESU8Data={
1090    sizeof(UConverterSharedData), ~((uint32_t) 0),
1091    NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
1092    0
1093};
1094
1095#endif
1096