1/*
2**********************************************************************
3*   Copyright (C) 2002-2007, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv_u8.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2002jul01
12*   created by: Markus W. Scherer
13*
14*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
15*
16*   Also, CESU-8 implementation, see UTR 26.
17*   The CESU-8 converter uses all the same functions as the
18*   UTF-8 converter, with a branch for converting supplementary code points.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_CONVERSION
24
25#include "unicode/ucnv.h"
26#include "ucnv_bld.h"
27#include "ucnv_cnv.h"
28#include "cmemory.h"
29
30/* Prototypes --------------------------------------------------------------- */
31
32/* Keep these here to make finicky compilers happy */
33
34U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
35                                           UErrorCode *err);
36U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
37                                                        UErrorCode *err);
38
39
40/* UTF-8 -------------------------------------------------------------------- */
41
42/* UTF-8 Conversion DATA
43 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
44 */
45/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Code point limits */
#define MAXIMUM_UCS2            0xFFFF      /* largest value that fits in one 16-bit code unit */
#define MAXIMUM_UTF             0x10FFFF    /* largest code point the converters accept */
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Constants for splitting a supplementary code point into a surrogate pair */
#define HALF_SHIFT              10          /* number of payload bits per surrogate */
#define HALF_BASE               0x10000     /* subtracted from the code point before splitting */
#define HALF_MASK               0x3FF       /* selects the low HALF_SHIFT bits */

/* UTF-16 surrogate code unit ranges */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* HALF_BASE - SURROGATE_LOW_START */
#define SURROGATE_LOW_BASE      0x2400
59
/*
 * Indexed by sequence length (0..6). The decoders accumulate a sequence as
 * ch = (ch << 6) + nextByte, starting from the lead byte, without masking off
 * the marker bits; subtracting the entry for the sequence length removes the
 * accumulated marker bits (computed modulo 2^32).
 */
static const uint32_t offsetsFromUTF8[7] = {
  (uint32_t) 0x00000000,  /* 0: not a valid lead byte */
  (uint32_t) 0x00000000,  /* 1: single byte, nothing to remove */
  (uint32_t) 0x00003080,  /* 2: (0xC0 << 6) + 0x80 */
  (uint32_t) 0x000E2080,  /* 3: (0xE0 << 12) + (0x80 << 6) + 0x80 */
  (uint32_t) 0x03C82080,  /* 4: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
  (uint32_t) 0xFA082080,  /* 5: same pattern with lead 0xF8 */
  (uint32_t) 0x82082080   /* 6: same pattern with lead 0xFC, truncated to 32 bits */
};
64
65/* END OF UTF-8 Conversion DATA */
66
/*
 * Total sequence length announced by each possible first byte.
 * 0 marks a byte that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF).
 */
static const int8_t bytesFromUTF8[256] = {
  /* 0x00..0x7F: one byte */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80..0xBF: not a lead byte */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0..0xDF: two bytes */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0..0xEF: three bytes */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: invalid */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
77
/*
 * Smallest code point that an N-byte sequence is allowed to encode
 * (Unicode 3.0.1 and later), indexed by N.
 * Sequences longer than 4 bytes are illegal in UTF-8; their entries hold a
 * value no decoded code point can reach, so the range check always fails.
 */
static const uint32_t utf8_minChar32[7] = {
    0,            /* 0 bytes */
    0,            /* 1 byte  */
    0x80,         /* 2 bytes */
    0x800,        /* 3 bytes */
    0x10000,      /* 4 bytes */
    0xffffffff,   /* 5 bytes: always rejected */
    0xffffffff    /* 6 bytes: always rejected */
};
86
87static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
88                                  UErrorCode * err)
89{
90    UConverter *cnv = args->converter;
91    const unsigned char *mySource = (unsigned char *) args->source;
92    UChar *myTarget = args->target;
93    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
94    const UChar *targetLimit = args->targetLimit;
95    unsigned char *toUBytes = cnv->toUBytes;
96    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
97    uint32_t ch, ch2 = 0;
98    int32_t i, inBytes;
99
100    /* Restore size of current sequence */
101    if (cnv->toUnicodeStatus && myTarget < targetLimit)
102    {
103        inBytes = cnv->mode;            /* restore # of bytes to consume */
104        i = cnv->toULength;             /* restore # of bytes consumed */
105        cnv->toULength = 0;
106
107        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
108        cnv->toUnicodeStatus = 0;
109        goto morebytes;
110    }
111
112
113    while (mySource < sourceLimit && myTarget < targetLimit)
114    {
115        ch = *(mySource++);
116        if (ch < 0x80)        /* Simple case */
117        {
118            *(myTarget++) = (UChar) ch;
119        }
120        else
121        {
122            /* store the first char */
123            toUBytes[0] = (char)ch;
124            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
125            i = 1;
126
127morebytes:
128            while (i < inBytes)
129            {
130                if (mySource < sourceLimit)
131                {
132                    toUBytes[i] = (char) (ch2 = *mySource);
133                    if (!UTF8_IS_TRAIL(ch2))
134                    {
135                        break; /* i < inBytes */
136                    }
137                    ch = (ch << 6) + ch2;
138                    ++mySource;
139                    i++;
140                }
141                else
142                {
143                    /* stores a partially calculated target*/
144                    cnv->toUnicodeStatus = ch;
145                    cnv->mode = inBytes;
146                    cnv->toULength = (int8_t) i;
147                    goto donefornow;
148                }
149            }
150
151            /* Remove the accumulated high bits */
152            ch -= offsetsFromUTF8[inBytes];
153
154            /*
155             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
156             * - use only trail bytes after a lead byte (checked above)
157             * - use the right number of trail bytes for a given lead byte
158             * - encode a code point <= U+10ffff
159             * - use the fewest possible number of bytes for their code points
160             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
161             *
162             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
163             * There are no irregular sequences any more.
164             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
165             */
166            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
167                (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
168            {
169                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
170                if (ch <= MAXIMUM_UCS2)
171                {
172                    /* fits in 16 bits */
173                    *(myTarget++) = (UChar) ch;
174                }
175                else
176                {
177                    /* write out the surrogates */
178                    ch -= HALF_BASE;
179                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
180                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
181                    if (myTarget < targetLimit)
182                    {
183                        *(myTarget++) = (UChar)ch;
184                    }
185                    else
186                    {
187                        /* Put in overflow buffer (not handled here) */
188                        cnv->UCharErrorBuffer[0] = (UChar) ch;
189                        cnv->UCharErrorBufferLength = 1;
190                        *err = U_BUFFER_OVERFLOW_ERROR;
191                        break;
192                    }
193                }
194            }
195            else
196            {
197                cnv->toULength = (int8_t)i;
198                *err = U_ILLEGAL_CHAR_FOUND;
199                break;
200            }
201        }
202    }
203
204donefornow:
205    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
206    {
207        /* End of target buffer */
208        *err = U_BUFFER_OVERFLOW_ERROR;
209    }
210
211    args->target = myTarget;
212    args->source = (const char *) mySource;
213}
214
215static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
216                                                UErrorCode * err)
217{
218    UConverter *cnv = args->converter;
219    const unsigned char *mySource = (unsigned char *) args->source;
220    UChar *myTarget = args->target;
221    int32_t *myOffsets = args->offsets;
222    int32_t offsetNum = 0;
223    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
224    const UChar *targetLimit = args->targetLimit;
225    unsigned char *toUBytes = cnv->toUBytes;
226    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
227    uint32_t ch, ch2 = 0;
228    int32_t i, inBytes;
229
230    /* Restore size of current sequence */
231    if (cnv->toUnicodeStatus && myTarget < targetLimit)
232    {
233        inBytes = cnv->mode;            /* restore # of bytes to consume */
234        i = cnv->toULength;             /* restore # of bytes consumed */
235        cnv->toULength = 0;
236
237        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
238        cnv->toUnicodeStatus = 0;
239        goto morebytes;
240    }
241
242    while (mySource < sourceLimit && myTarget < targetLimit)
243    {
244        ch = *(mySource++);
245        if (ch < 0x80)        /* Simple case */
246        {
247            *(myTarget++) = (UChar) ch;
248            *(myOffsets++) = offsetNum++;
249        }
250        else
251        {
252            toUBytes[0] = (char)ch;
253            inBytes = bytesFromUTF8[ch];
254            i = 1;
255
256morebytes:
257            while (i < inBytes)
258            {
259                if (mySource < sourceLimit)
260                {
261                    toUBytes[i] = (char) (ch2 = *mySource);
262                    if (!UTF8_IS_TRAIL(ch2))
263                    {
264                        break; /* i < inBytes */
265                    }
266                    ch = (ch << 6) + ch2;
267                    ++mySource;
268                    i++;
269                }
270                else
271                {
272                    cnv->toUnicodeStatus = ch;
273                    cnv->mode = inBytes;
274                    cnv->toULength = (int8_t)i;
275                    goto donefornow;
276                }
277            }
278
279            /* Remove the accumulated high bits */
280            ch -= offsetsFromUTF8[inBytes];
281
282            /*
283             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
284             * - use only trail bytes after a lead byte (checked above)
285             * - use the right number of trail bytes for a given lead byte
286             * - encode a code point <= U+10ffff
287             * - use the fewest possible number of bytes for their code points
288             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
289             *
290             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
291             * There are no irregular sequences any more.
292             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
293             */
294            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
295                (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
296            {
297                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
298                if (ch <= MAXIMUM_UCS2)
299                {
300                    /* fits in 16 bits */
301                    *(myTarget++) = (UChar) ch;
302                    *(myOffsets++) = offsetNum;
303                }
304                else
305                {
306                    /* write out the surrogates */
307                    ch -= HALF_BASE;
308                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
309                    *(myOffsets++) = offsetNum;
310                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
311                    if (myTarget < targetLimit)
312                    {
313                        *(myTarget++) = (UChar)ch;
314                        *(myOffsets++) = offsetNum;
315                    }
316                    else
317                    {
318                        cnv->UCharErrorBuffer[0] = (UChar) ch;
319                        cnv->UCharErrorBufferLength = 1;
320                        *err = U_BUFFER_OVERFLOW_ERROR;
321                    }
322                }
323                offsetNum += i;
324            }
325            else
326            {
327                cnv->toULength = (int8_t)i;
328                *err = U_ILLEGAL_CHAR_FOUND;
329                break;
330            }
331        }
332    }
333
334donefornow:
335    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
336    {   /* End of target buffer */
337        *err = U_BUFFER_OVERFLOW_ERROR;
338    }
339
340    args->target = myTarget;
341    args->source = (const char *) mySource;
342    args->offsets = myOffsets;
343}
344
345U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
346                                    UErrorCode * err)
347{
348    UConverter *cnv = args->converter;
349    const UChar *mySource = args->source;
350    const UChar *sourceLimit = args->sourceLimit;
351    uint8_t *myTarget = (uint8_t *) args->target;
352    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
353    uint8_t *tempPtr;
354    UChar32 ch;
355    uint8_t tempBuf[4];
356    int32_t indexToWrite;
357    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
358
359    if (cnv->fromUChar32 && myTarget < targetLimit)
360    {
361        ch = cnv->fromUChar32;
362        cnv->fromUChar32 = 0;
363        goto lowsurrogate;
364    }
365
366    while (mySource < sourceLimit && myTarget < targetLimit)
367    {
368        ch = *(mySource++);
369
370        if (ch < 0x80)        /* Single byte */
371        {
372            *(myTarget++) = (uint8_t) ch;
373        }
374        else if (ch < 0x800)  /* Double byte */
375        {
376            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
377            if (myTarget < targetLimit)
378            {
379                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
380            }
381            else
382            {
383                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
384                cnv->charErrorBufferLength = 1;
385                *err = U_BUFFER_OVERFLOW_ERROR;
386            }
387        }
388        else {
389            /* Check for surrogates */
390            if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
391lowsurrogate:
392                if (mySource < sourceLimit) {
393                    /* test both code units */
394                    if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
395                        /* convert and consume this supplementary code point */
396                        ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
397                        ++mySource;
398                        /* exit this condition tree */
399                    }
400                    else {
401                        /* this is an unpaired trail or lead code unit */
402                        /* callback(illegal) */
403                        cnv->fromUChar32 = ch;
404                        *err = U_ILLEGAL_CHAR_FOUND;
405                        break;
406                    }
407                }
408                else {
409                    /* no more input */
410                    cnv->fromUChar32 = ch;
411                    break;
412                }
413            }
414
415            /* Do we write the buffer directly for speed,
416            or do we have to be careful about target buffer space? */
417            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
418
419            if (ch <= MAXIMUM_UCS2) {
420                indexToWrite = 2;
421                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
422            }
423            else {
424                indexToWrite = 3;
425                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
426                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
427            }
428            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
429            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
430
431            if (tempPtr == myTarget) {
432                /* There was enough space to write the codepoint directly. */
433                myTarget += (indexToWrite + 1);
434            }
435            else {
436                /* We might run out of room soon. Write it slowly. */
437                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
438                    if (myTarget < targetLimit) {
439                        *(myTarget++) = *tempPtr;
440                    }
441                    else {
442                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
443                        *err = U_BUFFER_OVERFLOW_ERROR;
444                    }
445                }
446            }
447        }
448    }
449
450    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
451    {
452        *err = U_BUFFER_OVERFLOW_ERROR;
453    }
454
455    args->target = (char *) myTarget;
456    args->source = mySource;
457}
458
459U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
460                                                  UErrorCode * err)
461{
462    UConverter *cnv = args->converter;
463    const UChar *mySource = args->source;
464    int32_t *myOffsets = args->offsets;
465    const UChar *sourceLimit = args->sourceLimit;
466    uint8_t *myTarget = (uint8_t *) args->target;
467    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
468    uint8_t *tempPtr;
469    UChar32 ch;
470    int32_t offsetNum, nextSourceIndex;
471    int32_t indexToWrite;
472    uint8_t tempBuf[4];
473    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
474
475    if (cnv->fromUChar32 && myTarget < targetLimit)
476    {
477        ch = cnv->fromUChar32;
478        cnv->fromUChar32 = 0;
479        offsetNum = -1;
480        nextSourceIndex = 0;
481        goto lowsurrogate;
482    } else {
483        offsetNum = 0;
484    }
485
486    while (mySource < sourceLimit && myTarget < targetLimit)
487    {
488        ch = *(mySource++);
489
490        if (ch < 0x80)        /* Single byte */
491        {
492            *(myOffsets++) = offsetNum++;
493            *(myTarget++) = (char) ch;
494        }
495        else if (ch < 0x800)  /* Double byte */
496        {
497            *(myOffsets++) = offsetNum;
498            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
499            if (myTarget < targetLimit)
500            {
501                *(myOffsets++) = offsetNum++;
502                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
503            }
504            else
505            {
506                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
507                cnv->charErrorBufferLength = 1;
508                *err = U_BUFFER_OVERFLOW_ERROR;
509            }
510        }
511        else
512        /* Check for surrogates */
513        {
514            nextSourceIndex = offsetNum + 1;
515
516            if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
517lowsurrogate:
518                if (mySource < sourceLimit) {
519                    /* test both code units */
520                    if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
521                        /* convert and consume this supplementary code point */
522                        ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
523                        ++mySource;
524                        ++nextSourceIndex;
525                        /* exit this condition tree */
526                    }
527                    else {
528                        /* this is an unpaired trail or lead code unit */
529                        /* callback(illegal) */
530                        cnv->fromUChar32 = ch;
531                        *err = U_ILLEGAL_CHAR_FOUND;
532                        break;
533                    }
534                }
535                else {
536                    /* no more input */
537                    cnv->fromUChar32 = ch;
538                    break;
539                }
540            }
541
542            /* Do we write the buffer directly for speed,
543            or do we have to be careful about target buffer space? */
544            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
545
546            if (ch <= MAXIMUM_UCS2) {
547                indexToWrite = 2;
548                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
549            }
550            else {
551                indexToWrite = 3;
552                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
553                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
554            }
555            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
556            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
557
558            if (tempPtr == myTarget) {
559                /* There was enough space to write the codepoint directly. */
560                myTarget += (indexToWrite + 1);
561                myOffsets[0] = offsetNum;
562                myOffsets[1] = offsetNum;
563                myOffsets[2] = offsetNum;
564                if (indexToWrite >= 3) {
565                    myOffsets[3] = offsetNum;
566                }
567                myOffsets += (indexToWrite + 1);
568            }
569            else {
570                /* We might run out of room soon. Write it slowly. */
571                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
572                    if (myTarget < targetLimit)
573                    {
574                        *(myOffsets++) = offsetNum;
575                        *(myTarget++) = *tempPtr;
576                    }
577                    else
578                    {
579                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
580                        *err = U_BUFFER_OVERFLOW_ERROR;
581                    }
582                }
583            }
584            offsetNum = nextSourceIndex;
585        }
586    }
587
588    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
589    {
590        *err = U_BUFFER_OVERFLOW_ERROR;
591    }
592
593    args->target = (char *) myTarget;
594    args->source = mySource;
595    args->offsets = myOffsets;
596}
597
598static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
599                                               UErrorCode *err) {
600    UConverter *cnv;
601    const uint8_t *sourceInitial;
602    const uint8_t *source;
603    uint16_t extraBytesToWrite;
604    uint8_t myByte;
605    UChar32 ch;
606    int8_t i, isLegalSequence;
607
608    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
609
610    cnv = args->converter;
611    sourceInitial = source = (const uint8_t *)args->source;
612    if (source >= (const uint8_t *)args->sourceLimit)
613    {
614        /* no input */
615        *err = U_INDEX_OUTOFBOUNDS_ERROR;
616        return 0xffff;
617    }
618
619    myByte = (uint8_t)*(source++);
620    if (myByte < 0x80)
621    {
622        args->source = (const char *)source;
623        return (UChar32)myByte;
624    }
625
626    extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
627    if (extraBytesToWrite == 0) {
628        cnv->toUBytes[0] = myByte;
629        cnv->toULength = 1;
630        *err = U_ILLEGAL_CHAR_FOUND;
631        args->source = (const char *)source;
632        return 0xffff;
633    }
634
635    /*The byte sequence is longer than the buffer area passed*/
636    if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
637    {
638        /* check if all of the remaining bytes are trail bytes */
639        cnv->toUBytes[0] = myByte;
640        i = 1;
641        *err = U_TRUNCATED_CHAR_FOUND;
642        while(source < (const uint8_t *)args->sourceLimit) {
643            if(U8_IS_TRAIL(myByte = *source)) {
644                cnv->toUBytes[i++] = myByte;
645                ++source;
646            } else {
647                /* error even before we run out of input */
648                *err = U_ILLEGAL_CHAR_FOUND;
649                break;
650            }
651        }
652        cnv->toULength = i;
653        args->source = (const char *)source;
654        return 0xffff;
655    }
656
657    isLegalSequence = 1;
658    ch = myByte << 6;
659    switch(extraBytesToWrite)
660    {
661      /* note: code falls through cases! (sic)*/
662    case 6:
663        ch += (myByte = *source);
664        ch <<= 6;
665        if (!UTF8_IS_TRAIL(myByte))
666        {
667            isLegalSequence = 0;
668            break;
669        }
670        ++source;
671    case 5:
672        ch += (myByte = *source);
673        ch <<= 6;
674        if (!UTF8_IS_TRAIL(myByte))
675        {
676            isLegalSequence = 0;
677            break;
678        }
679        ++source;
680    case 4:
681        ch += (myByte = *source);
682        ch <<= 6;
683        if (!UTF8_IS_TRAIL(myByte))
684        {
685            isLegalSequence = 0;
686            break;
687        }
688        ++source;
689    case 3:
690        ch += (myByte = *source);
691        ch <<= 6;
692        if (!UTF8_IS_TRAIL(myByte))
693        {
694            isLegalSequence = 0;
695            break;
696        }
697        ++source;
698    case 2:
699        ch += (myByte = *source);
700        if (!UTF8_IS_TRAIL(myByte))
701        {
702            isLegalSequence = 0;
703            break;
704        }
705        ++source;
706    };
707    ch -= offsetsFromUTF8[extraBytesToWrite];
708    args->source = (const char *)source;
709
710    /*
711     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
712     * - use only trail bytes after a lead byte (checked above)
713     * - use the right number of trail bytes for a given lead byte
714     * - encode a code point <= U+10ffff
715     * - use the fewest possible number of bytes for their code points
716     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
717     *
718     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
719     * There are no irregular sequences any more.
720     */
721    if (isLegalSequence &&
722        (uint32_t)ch <= MAXIMUM_UTF &&
723        (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
724        !U_IS_SURROGATE(ch)
725    ) {
726        return ch; /* return the code point */
727    }
728
729    for(i = 0; sourceInitial < source; ++i) {
730        cnv->toUBytes[i] = *sourceInitial++;
731    }
732    cnv->toULength = i;
733    *err = U_ILLEGAL_CHAR_FOUND;
734    return 0xffff;
735}
736
737/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
738
739/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
740static const UChar32
741utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
742
743/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
744static const UChar32
745utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
746
747/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
748static void
749ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
750                  UConverterToUnicodeArgs *pToUArgs,
751                  UErrorCode *pErrorCode) {
752    UConverter *utf8, *cnv;
753    const uint8_t *source, *sourceLimit;
754    uint8_t *target;
755    int32_t targetCapacity;
756    int32_t count;
757
758    int8_t oldToULength, toULength, toULimit;
759
760    UChar32 c;
761    uint8_t b, t1, t2;
762
763    /* set up the local pointers */
764    utf8=pToUArgs->converter;
765    cnv=pFromUArgs->converter;
766    source=(uint8_t *)pToUArgs->source;
767    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
768    target=(uint8_t *)pFromUArgs->target;
769    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
770
771    /* get the converter state from the UTF-8 UConverter */
772    c=(UChar32)utf8->toUnicodeStatus;
773    if(c!=0) {
774        toULength=oldToULength=utf8->toULength;
775        toULimit=(int8_t)utf8->mode;
776    } else {
777        toULength=oldToULength=toULimit=0;
778    }
779
780    count=(int32_t)(sourceLimit-source)+oldToULength;
781    if(count<toULimit) {
782        /*
783         * Not enough input to complete the partial character.
784         * Jump to moreBytes below - it will not output to target.
785         */
786    } else if(targetCapacity<toULimit) {
787        /*
788         * Not enough target capacity to output the partial character.
789         * Let the standard converter handle this.
790         */
791        *pErrorCode=U_USING_DEFAULT_WARNING;
792        return;
793    } else {
794        /*
795         * Use a single counter for source and target, counting the minimum of
796         * the source length and the target capacity.
797         * As a result, the source length is checked only once per multi-byte
798         * character instead of twice.
799         *
800         * Make sure that the last byte sequence is complete, or else
801         * stop just before it.
802         * (The longest legal byte sequence has 3 trail bytes.)
803         * Count oldToULength (number of source bytes from a previous buffer)
804         * into the source length but reduce the source index by toULimit
805         * while going back over trail bytes in order to not go back into
806         * the bytes that will be read for finishing a partial
807         * sequence from the previous buffer.
808         * Let the standard converter handle edge cases.
809         */
810        int32_t i;
811
812        if(count>targetCapacity) {
813            count=targetCapacity;
814        }
815
816        i=0;
817        while(i<3 && i<(count-toULimit)) {
818            b=source[count-oldToULength-i-1];
819            if(U8_IS_TRAIL(b)) {
820                ++i;
821            } else {
822                if(i<utf8_countTrailBytes[b]) {
823                    /* stop converting before the lead byte if there are not enough trail bytes for it */
824                    count-=i+1;
825                }
826                break;
827            }
828        }
829    }
830
831    if(c!=0) {
832        utf8->toUnicodeStatus=0;
833        utf8->toULength=0;
834        goto moreBytes;
835        /* See note in ucnv_SBCSFromUTF8() about this goto. */
836    }
837
838    /* conversion loop */
839    while(count>0) {
840        b=*source++;
841        if((int8_t)b>=0) {
842            /* convert ASCII */
843            *target++=b;
844            --count;
845            continue;
846        } else {
847            if(b>0xe0) {
848                if( /* handle U+1000..U+D7FF inline */
849                    (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
850                                               (b==0xed && (t1 <= 0x9f))) &&
851                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
852                ) {
853                    source+=2;
854                    *target++=b;
855                    *target++=t1;
856                    *target++=t2;
857                    count-=3;
858                    continue;
859                }
860            } else if(b<0xe0) {
861                if( /* handle U+0080..U+07FF inline */
862                    b>=0xc2 &&
863                    (t1=*source) >= 0x80 && t1 <= 0xbf
864                ) {
865                    ++source;
866                    *target++=b;
867                    *target++=t1;
868                    count-=2;
869                    continue;
870                }
871            } else if(b==0xe0) {
872                if( /* handle U+0800..U+0FFF inline */
873                    (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
874                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
875                ) {
876                    source+=2;
877                    *target++=b;
878                    *target++=t1;
879                    *target++=t2;
880                    count-=3;
881                    continue;
882                }
883            }
884
885            /* handle "complicated" and error cases, and continuing partial characters */
886            oldToULength=0;
887            toULength=1;
888            toULimit=utf8_countTrailBytes[b]+1;
889            c=b;
890moreBytes:
891            while(toULength<toULimit) {
892                if(source<sourceLimit) {
893                    b=*source;
894                    if(U8_IS_TRAIL(b)) {
895                        ++source;
896                        ++toULength;
897                        c=(c<<6)+b;
898                    } else {
899                        break; /* sequence too short, stop with toULength<toULimit */
900                    }
901                } else {
902                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
903                    source-=(toULength-oldToULength);
904                    while(oldToULength<toULength) {
905                        utf8->toUBytes[oldToULength++]=*source++;
906                    }
907                    utf8->toUnicodeStatus=c;
908                    utf8->toULength=toULength;
909                    utf8->mode=toULimit;
910                    pToUArgs->source=(char *)source;
911                    pFromUArgs->target=(char *)target;
912                    return;
913                }
914            }
915
916            if( toULength==toULimit &&      /* consumed all trail bytes */
917                (toULength==3 || toULength==2) &&             /* BMP */
918                (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
919                (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
920            ) {
921                /* legal byte sequence for BMP code point */
922            } else if(
923                toULength==toULimit && toULength==4 &&
924                (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
925            ) {
926                /* legal byte sequence for supplementary code point */
927            } else {
928                /* error handling: illegal UTF-8 byte sequence */
929                source-=(toULength-oldToULength);
930                while(oldToULength<toULength) {
931                    utf8->toUBytes[oldToULength++]=*source++;
932                }
933                utf8->toULength=toULength;
934                pToUArgs->source=(char *)source;
935                pFromUArgs->target=(char *)target;
936                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
937                return;
938            }
939
940            /* copy the legal byte sequence to the target */
941            {
942                int8_t i;
943
944                for(i=0; i<oldToULength; ++i) {
945                    *target++=utf8->toUBytes[i];
946                }
947                source-=(toULength-oldToULength);
948                for(; i<toULength; ++i) {
949                    *target++=*source++;
950                }
951                count-=toULength;
952            }
953        }
954    }
955
956    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
957        if(target==(const uint8_t *)pFromUArgs->targetLimit) {
958            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
959        } else {
960            b=*source;
961            toULimit=utf8_countTrailBytes[b]+1;
962            if(toULimit>(sourceLimit-source)) {
963                /* collect a truncated byte sequence */
964                toULength=0;
965                c=b;
966                for(;;) {
967                    utf8->toUBytes[toULength++]=b;
968                    if(++source==sourceLimit) {
969                        /* partial byte sequence at end of source */
970                        utf8->toUnicodeStatus=c;
971                        utf8->toULength=toULength;
972                        utf8->mode=toULimit;
973                        break;
974                    } else if(!U8_IS_TRAIL(b=*source)) {
975                        /* lead byte in trail byte position */
976                        utf8->toULength=toULength;
977                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
978                        break;
979                    }
980                    c=(c<<6)+b;
981                }
982            } else {
983                /* partial-sequence target overflow: fall back to the pivoting implementation */
984                *pErrorCode=U_USING_DEFAULT_WARNING;
985            }
986        }
987    }
988
989    /* write back the updated pointers */
990    pToUArgs->source=(char *)source;
991    pFromUArgs->target=(char *)target;
992}
993
994/* UTF-8 converter data ----------------------------------------------------- */
995
/*
 * Implementation table for the UTF-8 converter.
 * The initializer is positional; slots that this converter does not
 * provide are NULL.
 * NOTE(review): the meaning of each slot is fixed by the UConverterImpl
 * declaration, which is not visible in this file -- confirm the slot
 * names there before reordering or inserting entries.
 */
static const UConverterImpl _UTF8Impl={
    /* converter type tag */
    UCNV_UTF8,

    /* not provided */
    NULL,
    NULL,

    /* not provided */
    NULL,
    NULL,
    NULL,

    /* to-/from-Unicode conversion functions (plain and OFFSETS_LOGIC variants) and the getNextUChar function */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_getNextUChar_UTF8,

    /* four slots not provided, followed by the Unicode-set function */
    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    /* the same UTF-8-to-UTF-8 function is installed in both trailing slots */
    ucnv_UTF8FromUTF8,
    ucnv_UTF8FromUTF8
};
1021
/* CCSID 1208 refers to UTF-8 for any version of Unicode */
/*
 * Static (constant) description of the UTF-8 converter.
 * The initializer is positional.
 * NOTE(review): field names come from the UConverterStaticData
 * declaration, which is not visible in this file -- the per-line notes
 * below describe the values only; confirm field meanings there.
 */
static const UConverterStaticData _UTF8StaticData={
    /* size of this structure */
    sizeof(UConverterStaticData),
    /* converter name */
    "UTF-8",
    /* CCSID 1208, followed by the UCNV_IBM and UCNV_UTF8 enum constants */
    1208, UCNV_IBM, UCNV_UTF8,
    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    /* EF BF BD is the UTF-8 byte sequence for U+FFFD; the following 3 matches its length (presumably the substitution sequence -- confirm) */
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1033
1034
/*
 * Shared data object for the UTF-8 converter.
 * Not declared static, so it has external linkage. It ties together
 * _UTF8StaticData and _UTF8Impl by address.
 * NOTE(review): the second initializer is an all-ones uint32_t; its
 * meaning is defined by the UConverterSharedData declaration, which is
 * not visible in this file -- confirm there.
 */
const UConverterSharedData _UTF8Data={
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
    0
};
1040
1041/* CESU-8 converter data ---------------------------------------------------- */
1042
/*
 * Implementation table for the CESU-8 converter.
 * It installs the same four to-/from-Unicode functions as _UTF8Impl.
 * Differences from _UTF8Impl that are visible here:
 *  - the slot following the four conversion functions is NULL
 *    (the UTF-8 table has ucnv_getNextUChar_UTF8 there);
 *  - the Unicode-set function is ucnv_getCompleteUnicodeSet;
 *  - the initializer list ends after the Unicode-set function, so any
 *    remaining UConverterImpl members are implicitly zero-initialized
 *    (the UTF-8 table has ucnv_UTF8FromUTF8 in two further slots).
 * NOTE(review): slot names are fixed by the UConverterImpl declaration,
 * which is not visible in this file -- confirm there.
 */
static const UConverterImpl _CESU8Impl={
    /* converter type tag */
    UCNV_CESU8,

    /* not provided */
    NULL,
    NULL,

    /* not provided */
    NULL,
    NULL,
    NULL,

    /* shared with the UTF-8 converter; the fifth slot of this group is not provided */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    NULL,

    /* four slots not provided, followed by the Unicode-set function */
    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet
};
1065
/*
 * Static (constant) description of the CESU-8 converter.
 * The initializer is positional and parallels _UTF8StaticData; the
 * values that differ are the name, the CCSID, and the two enum
 * constants (UCNV_UNKNOWN, UCNV_CESU8).
 * NOTE(review): field names come from the UConverterStaticData
 * declaration, which is not visible in this file -- confirm there.
 */
static const UConverterStaticData _CESU8StaticData={
    /* size of this structure */
    sizeof(UConverterStaticData),
    /* converter name */
    "CESU-8",
    9400, /* CCSID for CESU-8 */
    /* enum constants, then the same 1 and 3 byte counts as in _UTF8StaticData */
    UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
    /* EF BF BD is the UTF-8 byte sequence for U+FFFD; the following 3 matches its length (presumably the substitution sequence -- confirm) */
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1076
1077
/*
 * Shared data object for the CESU-8 converter.
 * Not declared static, so it has external linkage. It ties together
 * _CESU8StaticData and _CESU8Impl by address.
 * NOTE(review): the second initializer is an all-ones uint32_t; its
 * meaning is defined by the UConverterSharedData declaration, which is
 * not visible in this file -- confirm there.
 */
const UConverterSharedData _CESU8Data={
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
    0
};
1083
1084#endif
1085