1/*
2**********************************************************************
3*   Copyright (C) 2002-2009, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv_u32.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2002jul01
12*   created by: Markus W. Scherer
13*
14*   UTF-32 converter implementation. Used to be in ucnv_utf.c.
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_CONVERSION
20
21#include "unicode/ucnv.h"
22#include "ucnv_bld.h"
23#include "ucnv_cnv.h"
24#include "cmemory.h"
25
/* Code points up to this value are written as a single UTF-16 code unit. */
#define MAXIMUM_UCS2            0xFFFF
/* Values above this are rejected as illegal by the toUnicode functions. */
#define MAXIMUM_UTF             0x10FFFF

/* Constants used when combining a lead/trail surrogate pair. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Added to ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail to obtain
 * the supplementary code point; evaluates to 9216 (0x2400).
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of converter->fromUnicodeStatus that requests a BOM before output. */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
40
41/* UTF-32BE ----------------------------------------------------------------- */
42
43static void
44T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
45                                UErrorCode * err)
46{
47    const unsigned char *mySource = (unsigned char *) args->source;
48    UChar *myTarget = args->target;
49    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
50    const UChar *targetLimit = args->targetLimit;
51    unsigned char *toUBytes = args->converter->toUBytes;
52    uint32_t ch, i;
53
54    /* Restore state of current sequence */
55    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
56        i = args->converter->toULength;       /* restore # of bytes consumed */
57        args->converter->toULength = 0;
58
59        ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
60        args->converter->toUnicodeStatus = 0;
61        goto morebytes;
62    }
63
64    while (mySource < sourceLimit && myTarget < targetLimit) {
65        i = 0;
66        ch = 0;
67morebytes:
68        while (i < sizeof(uint32_t)) {
69            if (mySource < sourceLimit) {
70                ch = (ch << 8) | (uint8_t)(*mySource);
71                toUBytes[i++] = (char) *(mySource++);
72            }
73            else {
74                /* stores a partially calculated target*/
75                /* + 1 to make 0 a valid character */
76                args->converter->toUnicodeStatus = ch + 1;
77                args->converter->toULength = (int8_t) i;
78                goto donefornow;
79            }
80        }
81
82        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
83            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
84            if (ch <= MAXIMUM_UCS2)
85            {
86                /* fits in 16 bits */
87                *(myTarget++) = (UChar) ch;
88            }
89            else {
90                /* write out the surrogates */
91                *(myTarget++) = U16_LEAD(ch);
92                ch = U16_TRAIL(ch);
93                if (myTarget < targetLimit) {
94                    *(myTarget++) = (UChar)ch;
95                }
96                else {
97                    /* Put in overflow buffer (not handled here) */
98                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
99                    args->converter->UCharErrorBufferLength = 1;
100                    *err = U_BUFFER_OVERFLOW_ERROR;
101                    break;
102                }
103            }
104        }
105        else {
106            args->converter->toULength = (int8_t)i;
107            *err = U_ILLEGAL_CHAR_FOUND;
108            break;
109        }
110    }
111
112donefornow:
113    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
114        /* End of target buffer */
115        *err = U_BUFFER_OVERFLOW_ERROR;
116    }
117
118    args->target = myTarget;
119    args->source = (const char *) mySource;
120}
121
122static void
123T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
124                                             UErrorCode * err)
125{
126    const unsigned char *mySource = (unsigned char *) args->source;
127    UChar *myTarget = args->target;
128    int32_t *myOffsets = args->offsets;
129    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
130    const UChar *targetLimit = args->targetLimit;
131    unsigned char *toUBytes = args->converter->toUBytes;
132    uint32_t ch, i;
133    int32_t offsetNum = 0;
134
135    /* Restore state of current sequence */
136    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
137        i = args->converter->toULength;       /* restore # of bytes consumed */
138        args->converter->toULength = 0;
139
140        ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
141        args->converter->toUnicodeStatus = 0;
142        goto morebytes;
143    }
144
145    while (mySource < sourceLimit && myTarget < targetLimit) {
146        i = 0;
147        ch = 0;
148morebytes:
149        while (i < sizeof(uint32_t)) {
150            if (mySource < sourceLimit) {
151                ch = (ch << 8) | (uint8_t)(*mySource);
152                toUBytes[i++] = (char) *(mySource++);
153            }
154            else {
155                /* stores a partially calculated target*/
156                /* + 1 to make 0 a valid character */
157                args->converter->toUnicodeStatus = ch + 1;
158                args->converter->toULength = (int8_t) i;
159                goto donefornow;
160            }
161        }
162
163        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
164            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
165            if (ch <= MAXIMUM_UCS2) {
166                /* fits in 16 bits */
167                *(myTarget++) = (UChar) ch;
168                *(myOffsets++) = offsetNum;
169            }
170            else {
171                /* write out the surrogates */
172                *(myTarget++) = U16_LEAD(ch);
173                *myOffsets++ = offsetNum;
174                ch = U16_TRAIL(ch);
175                if (myTarget < targetLimit)
176                {
177                    *(myTarget++) = (UChar)ch;
178                    *(myOffsets++) = offsetNum;
179                }
180                else {
181                    /* Put in overflow buffer (not handled here) */
182                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
183                    args->converter->UCharErrorBufferLength = 1;
184                    *err = U_BUFFER_OVERFLOW_ERROR;
185                    break;
186                }
187            }
188        }
189        else {
190            args->converter->toULength = (int8_t)i;
191            *err = U_ILLEGAL_CHAR_FOUND;
192            break;
193        }
194        offsetNum += i;
195    }
196
197donefornow:
198    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
199    {
200        /* End of target buffer */
201        *err = U_BUFFER_OVERFLOW_ERROR;
202    }
203
204    args->target = myTarget;
205    args->source = (const char *) mySource;
206    args->offsets = myOffsets;
207}
208
209static void
210T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
211                                  UErrorCode * err)
212{
213    const UChar *mySource = args->source;
214    unsigned char *myTarget;
215    const UChar *sourceLimit = args->sourceLimit;
216    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
217    UChar32 ch, ch2;
218    unsigned int indexToWrite;
219    unsigned char temp[sizeof(uint32_t)];
220
221    if(mySource >= sourceLimit) {
222        /* no input, nothing to do */
223        return;
224    }
225
226    /* write the BOM if necessary */
227    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
228        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
229        ucnv_fromUWriteBytes(args->converter,
230                             bom, 4,
231                             &args->target, args->targetLimit,
232                             &args->offsets, -1,
233                             err);
234        args->converter->fromUnicodeStatus=0;
235    }
236
237    myTarget = (unsigned char *) args->target;
238    temp[0] = 0;
239
240    if (args->converter->fromUChar32) {
241        ch = args->converter->fromUChar32;
242        args->converter->fromUChar32 = 0;
243        goto lowsurogate;
244    }
245
246    while (mySource < sourceLimit && myTarget < targetLimit) {
247        ch = *(mySource++);
248
249        if (UTF_IS_SURROGATE(ch)) {
250            if (U_IS_LEAD(ch)) {
251lowsurogate:
252                if (mySource < sourceLimit) {
253                    ch2 = *mySource;
254                    if (U_IS_TRAIL(ch2)) {
255                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
256                        mySource++;
257                    }
258                    else {
259                        /* this is an unmatched trail code unit (2nd surrogate) */
260                        /* callback(illegal) */
261                        args->converter->fromUChar32 = ch;
262                        *err = U_ILLEGAL_CHAR_FOUND;
263                        break;
264                    }
265                }
266                else {
267                    /* ran out of source */
268                    args->converter->fromUChar32 = ch;
269                    if (args->flush) {
270                        /* this is an unmatched trail code unit (2nd surrogate) */
271                        /* callback(illegal) */
272                        *err = U_ILLEGAL_CHAR_FOUND;
273                    }
274                    break;
275                }
276            }
277            else {
278                /* this is an unmatched trail code unit (2nd surrogate) */
279                /* callback(illegal) */
280                args->converter->fromUChar32 = ch;
281                *err = U_ILLEGAL_CHAR_FOUND;
282                break;
283            }
284        }
285
286        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
287        temp[1] = (uint8_t) (ch >> 16 & 0x1F);
288        temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
289        temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
290
291        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
292            if (myTarget < targetLimit) {
293                *(myTarget++) = temp[indexToWrite];
294            }
295            else {
296                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
297                *err = U_BUFFER_OVERFLOW_ERROR;
298            }
299        }
300    }
301
302    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
303        *err = U_BUFFER_OVERFLOW_ERROR;
304    }
305
306    args->target = (char *) myTarget;
307    args->source = mySource;
308}
309
310static void
311T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
312                                               UErrorCode * err)
313{
314    const UChar *mySource = args->source;
315    unsigned char *myTarget;
316    int32_t *myOffsets;
317    const UChar *sourceLimit = args->sourceLimit;
318    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
319    UChar32 ch, ch2;
320    int32_t offsetNum = 0;
321    unsigned int indexToWrite;
322    unsigned char temp[sizeof(uint32_t)];
323
324    if(mySource >= sourceLimit) {
325        /* no input, nothing to do */
326        return;
327    }
328
329    /* write the BOM if necessary */
330    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
331        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
332        ucnv_fromUWriteBytes(args->converter,
333                             bom, 4,
334                             &args->target, args->targetLimit,
335                             &args->offsets, -1,
336                             err);
337        args->converter->fromUnicodeStatus=0;
338    }
339
340    myTarget = (unsigned char *) args->target;
341    myOffsets = args->offsets;
342    temp[0] = 0;
343
344    if (args->converter->fromUChar32) {
345        ch = args->converter->fromUChar32;
346        args->converter->fromUChar32 = 0;
347        goto lowsurogate;
348    }
349
350    while (mySource < sourceLimit && myTarget < targetLimit) {
351        ch = *(mySource++);
352
353        if (UTF_IS_SURROGATE(ch)) {
354            if (U_IS_LEAD(ch)) {
355lowsurogate:
356                if (mySource < sourceLimit) {
357                    ch2 = *mySource;
358                    if (U_IS_TRAIL(ch2)) {
359                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
360                        mySource++;
361                    }
362                    else {
363                        /* this is an unmatched trail code unit (2nd surrogate) */
364                        /* callback(illegal) */
365                        args->converter->fromUChar32 = ch;
366                        *err = U_ILLEGAL_CHAR_FOUND;
367                        break;
368                    }
369                }
370                else {
371                    /* ran out of source */
372                    args->converter->fromUChar32 = ch;
373                    if (args->flush) {
374                        /* this is an unmatched trail code unit (2nd surrogate) */
375                        /* callback(illegal) */
376                        *err = U_ILLEGAL_CHAR_FOUND;
377                    }
378                    break;
379                }
380            }
381            else {
382                /* this is an unmatched trail code unit (2nd surrogate) */
383                /* callback(illegal) */
384                args->converter->fromUChar32 = ch;
385                *err = U_ILLEGAL_CHAR_FOUND;
386                break;
387            }
388        }
389
390        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
391        temp[1] = (uint8_t) (ch >> 16 & 0x1F);
392        temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
393        temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
394
395        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
396            if (myTarget < targetLimit) {
397                *(myTarget++) = temp[indexToWrite];
398                *(myOffsets++) = offsetNum;
399            }
400            else {
401                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
402                *err = U_BUFFER_OVERFLOW_ERROR;
403            }
404        }
405        offsetNum = offsetNum + 1 + (temp[1] != 0);
406    }
407
408    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
409        *err = U_BUFFER_OVERFLOW_ERROR;
410    }
411
412    args->target = (char *) myTarget;
413    args->source = mySource;
414    args->offsets = myOffsets;
415}
416
417static UChar32
418T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
419                                   UErrorCode* err)
420{
421    const uint8_t *mySource;
422    UChar32 myUChar;
423    int32_t length;
424
425    mySource = (const uint8_t *)args->source;
426    if (mySource >= (const uint8_t *)args->sourceLimit)
427    {
428        /* no input */
429        *err = U_INDEX_OUTOFBOUNDS_ERROR;
430        return 0xffff;
431    }
432
433    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
434    if (length < 4)
435    {
436        /* got a partial character */
437        uprv_memcpy(args->converter->toUBytes, mySource, length);
438        args->converter->toULength = (int8_t)length;
439        args->source = (const char *)(mySource + length);
440        *err = U_TRUNCATED_CHAR_FOUND;
441        return 0xffff;
442    }
443
444    /* Don't even try to do a direct cast because the value may be on an odd address. */
445    myUChar = ((UChar32)mySource[0] << 24)
446            | ((UChar32)mySource[1] << 16)
447            | ((UChar32)mySource[2] << 8)
448            | ((UChar32)mySource[3]);
449
450    args->source = (const char *)(mySource + 4);
451    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
452        return myUChar;
453    }
454
455    uprv_memcpy(args->converter->toUBytes, mySource, 4);
456    args->converter->toULength = 4;
457
458    *err = U_ILLEGAL_CHAR_FOUND;
459    return 0xffff;
460}
461
462static const UConverterImpl _UTF32BEImpl = {
463    UCNV_UTF32_BigEndian,
464
465    NULL,
466    NULL,
467
468    NULL,
469    NULL,
470    NULL,
471
472    T_UConverter_toUnicode_UTF32_BE,
473    T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
474    T_UConverter_fromUnicode_UTF32_BE,
475    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
476    T_UConverter_getNextUChar_UTF32_BE,
477
478    NULL,
479    NULL,
480    NULL,
481    NULL,
482    ucnv_getNonSurrogateUnicodeSet
483};
484
485/* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
486static const UConverterStaticData _UTF32BEStaticData = {
487    sizeof(UConverterStaticData),
488    "UTF-32BE",
489    1232,
490    UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
491    { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
492    0,
493    0,
494    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
495};
496
497const UConverterSharedData _UTF32BEData = {
498    sizeof(UConverterSharedData), ~((uint32_t) 0),
499    NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
500    0
501};
502
503/* UTF-32LE ---------------------------------------------------------- */
504
505static void
506T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
507                                UErrorCode * err)
508{
509    const unsigned char *mySource = (unsigned char *) args->source;
510    UChar *myTarget = args->target;
511    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
512    const UChar *targetLimit = args->targetLimit;
513    unsigned char *toUBytes = args->converter->toUBytes;
514    uint32_t ch, i;
515
516    /* Restore state of current sequence */
517    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
518    {
519        i = args->converter->toULength;       /* restore # of bytes consumed */
520        args->converter->toULength = 0;
521
522        /* Stores the previously calculated ch from a previous call*/
523        ch = args->converter->toUnicodeStatus - 1;
524        args->converter->toUnicodeStatus = 0;
525        goto morebytes;
526    }
527
528    while (mySource < sourceLimit && myTarget < targetLimit)
529    {
530        i = 0;
531        ch = 0;
532morebytes:
533        while (i < sizeof(uint32_t))
534        {
535            if (mySource < sourceLimit)
536            {
537                ch |= ((uint8_t)(*mySource)) << (i * 8);
538                toUBytes[i++] = (char) *(mySource++);
539            }
540            else
541            {
542                /* stores a partially calculated target*/
543                /* + 1 to make 0 a valid character */
544                args->converter->toUnicodeStatus = ch + 1;
545                args->converter->toULength = (int8_t) i;
546                goto donefornow;
547            }
548        }
549
550        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
551            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
552            if (ch <= MAXIMUM_UCS2) {
553                /* fits in 16 bits */
554                *(myTarget++) = (UChar) ch;
555            }
556            else {
557                /* write out the surrogates */
558                *(myTarget++) = U16_LEAD(ch);
559                ch = U16_TRAIL(ch);
560                if (myTarget < targetLimit) {
561                    *(myTarget++) = (UChar)ch;
562                }
563                else {
564                    /* Put in overflow buffer (not handled here) */
565                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
566                    args->converter->UCharErrorBufferLength = 1;
567                    *err = U_BUFFER_OVERFLOW_ERROR;
568                    break;
569                }
570            }
571        }
572        else {
573            args->converter->toULength = (int8_t)i;
574            *err = U_ILLEGAL_CHAR_FOUND;
575            break;
576        }
577    }
578
579donefornow:
580    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
581    {
582        /* End of target buffer */
583        *err = U_BUFFER_OVERFLOW_ERROR;
584    }
585
586    args->target = myTarget;
587    args->source = (const char *) mySource;
588}
589
590static void
591T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
592                                             UErrorCode * err)
593{
594    const unsigned char *mySource = (unsigned char *) args->source;
595    UChar *myTarget = args->target;
596    int32_t *myOffsets = args->offsets;
597    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
598    const UChar *targetLimit = args->targetLimit;
599    unsigned char *toUBytes = args->converter->toUBytes;
600    uint32_t ch, i;
601    int32_t offsetNum = 0;
602
603    /* Restore state of current sequence */
604    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
605    {
606        i = args->converter->toULength;       /* restore # of bytes consumed */
607        args->converter->toULength = 0;
608
609        /* Stores the previously calculated ch from a previous call*/
610        ch = args->converter->toUnicodeStatus - 1;
611        args->converter->toUnicodeStatus = 0;
612        goto morebytes;
613    }
614
615    while (mySource < sourceLimit && myTarget < targetLimit)
616    {
617        i = 0;
618        ch = 0;
619morebytes:
620        while (i < sizeof(uint32_t))
621        {
622            if (mySource < sourceLimit)
623            {
624                ch |= ((uint8_t)(*mySource)) << (i * 8);
625                toUBytes[i++] = (char) *(mySource++);
626            }
627            else
628            {
629                /* stores a partially calculated target*/
630                /* + 1 to make 0 a valid character */
631                args->converter->toUnicodeStatus = ch + 1;
632                args->converter->toULength = (int8_t) i;
633                goto donefornow;
634            }
635        }
636
637        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
638        {
639            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
640            if (ch <= MAXIMUM_UCS2)
641            {
642                /* fits in 16 bits */
643                *(myTarget++) = (UChar) ch;
644                *(myOffsets++) = offsetNum;
645            }
646            else {
647                /* write out the surrogates */
648                *(myTarget++) = U16_LEAD(ch);
649                *(myOffsets++) = offsetNum;
650                ch = U16_TRAIL(ch);
651                if (myTarget < targetLimit)
652                {
653                    *(myTarget++) = (UChar)ch;
654                    *(myOffsets++) = offsetNum;
655                }
656                else
657                {
658                    /* Put in overflow buffer (not handled here) */
659                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
660                    args->converter->UCharErrorBufferLength = 1;
661                    *err = U_BUFFER_OVERFLOW_ERROR;
662                    break;
663                }
664            }
665        }
666        else
667        {
668            args->converter->toULength = (int8_t)i;
669            *err = U_ILLEGAL_CHAR_FOUND;
670            break;
671        }
672        offsetNum += i;
673    }
674
675donefornow:
676    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
677    {
678        /* End of target buffer */
679        *err = U_BUFFER_OVERFLOW_ERROR;
680    }
681
682    args->target = myTarget;
683    args->source = (const char *) mySource;
684    args->offsets = myOffsets;
685}
686
687static void
688T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
689                                  UErrorCode * err)
690{
691    const UChar *mySource = args->source;
692    unsigned char *myTarget;
693    const UChar *sourceLimit = args->sourceLimit;
694    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
695    UChar32 ch, ch2;
696    unsigned int indexToWrite;
697    unsigned char temp[sizeof(uint32_t)];
698
699    if(mySource >= sourceLimit) {
700        /* no input, nothing to do */
701        return;
702    }
703
704    /* write the BOM if necessary */
705    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
706        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
707        ucnv_fromUWriteBytes(args->converter,
708                             bom, 4,
709                             &args->target, args->targetLimit,
710                             &args->offsets, -1,
711                             err);
712        args->converter->fromUnicodeStatus=0;
713    }
714
715    myTarget = (unsigned char *) args->target;
716    temp[3] = 0;
717
718    if (args->converter->fromUChar32)
719    {
720        ch = args->converter->fromUChar32;
721        args->converter->fromUChar32 = 0;
722        goto lowsurogate;
723    }
724
725    while (mySource < sourceLimit && myTarget < targetLimit)
726    {
727        ch = *(mySource++);
728
729        if (UTF_IS_SURROGATE(ch)) {
730            if (U_IS_LEAD(ch))
731            {
732lowsurogate:
733                if (mySource < sourceLimit)
734                {
735                    ch2 = *mySource;
736                    if (U_IS_TRAIL(ch2)) {
737                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
738                        mySource++;
739                    }
740                    else {
741                        /* this is an unmatched trail code unit (2nd surrogate) */
742                        /* callback(illegal) */
743                        args->converter->fromUChar32 = ch;
744                        *err = U_ILLEGAL_CHAR_FOUND;
745                        break;
746                    }
747                }
748                else {
749                    /* ran out of source */
750                    args->converter->fromUChar32 = ch;
751                    if (args->flush) {
752                        /* this is an unmatched trail code unit (2nd surrogate) */
753                        /* callback(illegal) */
754                        *err = U_ILLEGAL_CHAR_FOUND;
755                    }
756                    break;
757                }
758            }
759            else {
760                /* this is an unmatched trail code unit (2nd surrogate) */
761                /* callback(illegal) */
762                args->converter->fromUChar32 = ch;
763                *err = U_ILLEGAL_CHAR_FOUND;
764                break;
765            }
766        }
767
768        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
769        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
770        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
771        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
772
773        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
774        {
775            if (myTarget < targetLimit)
776            {
777                *(myTarget++) = temp[indexToWrite];
778            }
779            else
780            {
781                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
782                *err = U_BUFFER_OVERFLOW_ERROR;
783            }
784        }
785    }
786
787    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
788    {
789        *err = U_BUFFER_OVERFLOW_ERROR;
790    }
791
792    args->target = (char *) myTarget;
793    args->source = mySource;
794}
795
796static void
797T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
798                                               UErrorCode * err)
799{
800    const UChar *mySource = args->source;
801    unsigned char *myTarget;
802    int32_t *myOffsets;
803    const UChar *sourceLimit = args->sourceLimit;
804    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
805    UChar32 ch, ch2;
806    unsigned int indexToWrite;
807    unsigned char temp[sizeof(uint32_t)];
808    int32_t offsetNum = 0;
809
810    if(mySource >= sourceLimit) {
811        /* no input, nothing to do */
812        return;
813    }
814
815    /* write the BOM if necessary */
816    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
817        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
818        ucnv_fromUWriteBytes(args->converter,
819                             bom, 4,
820                             &args->target, args->targetLimit,
821                             &args->offsets, -1,
822                             err);
823        args->converter->fromUnicodeStatus=0;
824    }
825
826    myTarget = (unsigned char *) args->target;
827    myOffsets = args->offsets;
828    temp[3] = 0;
829
830    if (args->converter->fromUChar32)
831    {
832        ch = args->converter->fromUChar32;
833        args->converter->fromUChar32 = 0;
834        goto lowsurogate;
835    }
836
837    while (mySource < sourceLimit && myTarget < targetLimit)
838    {
839        ch = *(mySource++);
840
841        if (UTF_IS_SURROGATE(ch)) {
842            if (U_IS_LEAD(ch))
843            {
844lowsurogate:
845                if (mySource < sourceLimit)
846                {
847                    ch2 = *mySource;
848                    if (U_IS_TRAIL(ch2))
849                    {
850                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
851                        mySource++;
852                    }
853                    else {
854                        /* this is an unmatched trail code unit (2nd surrogate) */
855                        /* callback(illegal) */
856                        args->converter->fromUChar32 = ch;
857                        *err = U_ILLEGAL_CHAR_FOUND;
858                        break;
859                    }
860                }
861                else {
862                    /* ran out of source */
863                    args->converter->fromUChar32 = ch;
864                    if (args->flush) {
865                        /* this is an unmatched trail code unit (2nd surrogate) */
866                        /* callback(illegal) */
867                        *err = U_ILLEGAL_CHAR_FOUND;
868                    }
869                    break;
870                }
871            }
872            else {
873                /* this is an unmatched trail code unit (2nd surrogate) */
874                /* callback(illegal) */
875                args->converter->fromUChar32 = ch;
876                *err = U_ILLEGAL_CHAR_FOUND;
877                break;
878            }
879        }
880
881        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
882        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
883        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
884        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
885
886        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
887        {
888            if (myTarget < targetLimit)
889            {
890                *(myTarget++) = temp[indexToWrite];
891                *(myOffsets++) = offsetNum;
892            }
893            else
894            {
895                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
896                *err = U_BUFFER_OVERFLOW_ERROR;
897            }
898        }
899        offsetNum = offsetNum + 1 + (temp[2] != 0);
900    }
901
902    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
903    {
904        *err = U_BUFFER_OVERFLOW_ERROR;
905    }
906
907    args->target = (char *) myTarget;
908    args->source = mySource;
909    args->offsets = myOffsets;
910}
911
912static UChar32
913T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
914                                   UErrorCode* err)
915{
916    const uint8_t *mySource;
917    UChar32 myUChar;
918    int32_t length;
919
920    mySource = (const uint8_t *)args->source;
921    if (mySource >= (const uint8_t *)args->sourceLimit)
922    {
923        /* no input */
924        *err = U_INDEX_OUTOFBOUNDS_ERROR;
925        return 0xffff;
926    }
927
928    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
929    if (length < 4)
930    {
931        /* got a partial character */
932        uprv_memcpy(args->converter->toUBytes, mySource, length);
933        args->converter->toULength = (int8_t)length;
934        args->source = (const char *)(mySource + length);
935        *err = U_TRUNCATED_CHAR_FOUND;
936        return 0xffff;
937    }
938
939    /* Don't even try to do a direct cast because the value may be on an odd address. */
940    myUChar = ((UChar32)mySource[3] << 24)
941            | ((UChar32)mySource[2] << 16)
942            | ((UChar32)mySource[1] << 8)
943            | ((UChar32)mySource[0]);
944
945    args->source = (const char *)(mySource + 4);
946    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
947        return myUChar;
948    }
949
950    uprv_memcpy(args->converter->toUBytes, mySource, 4);
951    args->converter->toULength = 4;
952
953    *err = U_ILLEGAL_CHAR_FOUND;
954    return 0xffff;
955}
956
/*
 * Implementation (function) table for the fixed-endianness UTF-32LE converter.
 * The initializer is positional and the UConverterImpl declaration is not
 * visible in this part of the file, so the slot notes below are derived from
 * the names of the functions placed in them. NULL slots are left unset by this
 * converter.
 */
957static const UConverterImpl _UTF32LEImpl = {
958    UCNV_UTF32_LittleEndian, /* converter type constant */
959
960    NULL,
961    NULL,
962
963    NULL, /* this group presumably is open/close/reset (compare _UTF32Impl) - confirm */
964    NULL,
965    NULL,
966
967    T_UConverter_toUnicode_UTF32_LE,                /* to Unicode */
968    T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,   /* to Unicode, also filling offsets */
969    T_UConverter_fromUnicode_UTF32_LE,              /* from Unicode */
970    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, /* from Unicode, also filling offsets */
971    T_UConverter_getNextUChar_UTF32_LE,             /* single code point reader */
972
973    NULL,
974    NULL,
975    NULL,
976    NULL,
977    ucnv_getNonSurrogateUnicodeSet
978};
979
980/* The 1234 CCSID refers to any version of Unicode with the little-endian form of UTF-32 */
/*
 * Static description of the UTF-32LE converter. The initializer is positional;
 * notes on individual values are hedged where the UConverterStaticData
 * declaration (not visible here) would be needed to confirm them.
 */
981static const UConverterStaticData _UTF32LEStaticData = {
982    sizeof(UConverterStaticData),
983    "UTF-32LE", /* converter name */
984    1234,       /* CCSID, see the comment above */
985    UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, /* 4, 4: presumably min/max bytes per character - confirm */
986    { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,   /* presumably the substitution character U+FFFD in LE byte order and its length - confirm */
987    0,
988    0,
989    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
990};
991
992
/*
 * Shared-data object for the UTF-32LE converter (non-static, so it is visible
 * to other translation units). It ties together the static description
 * (_UTF32LEStaticData) and the function table (_UTF32LEImpl).
 * NOTE(review): ~((uint32_t) 0) presumably is a reference-count value marking
 * built-in data that is never unloaded - confirm against UConverterSharedData.
 */
993const UConverterSharedData _UTF32LEData = {
994    sizeof(UConverterSharedData), ~((uint32_t) 0),
995    NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
996    0
997};
998
999/* UTF-32 (Detect BOM) ------------------------------------------------------ */
1000
1001/*
1002 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1003 * accordingly.
1004 *
1005 * State values:
1006 * 0    initial state
1007 * 1    saw 00
1008 * 2    saw 00 00
1009 * 3    saw 00 00 FE
1010 * 4    -
1011 * 5    saw FF
1012 * 6    saw FF FE
1013 * 7    saw FF FE 00
1014 * 8    UTF-32BE mode
1015 * 9    UTF-32LE mode
1016 *
1017 * During detection: state&3==number of matching bytes so far.
1018 *
1019 * On output, emit U+FEFF as the first code point.
1020 */
1021
1022static void
1023_UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1024    if(choice<=UCNV_RESET_TO_UNICODE) {
1025        /* reset toUnicode: state=0 */
1026        cnv->mode=0;
1027    }
1028    if(choice!=UCNV_RESET_TO_UNICODE) {
1029        /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1030        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1031    }
1032}
1033
1034static void
1035_UTF32Open(UConverter *cnv,
1036           UConverterLoadArgs *pArgs,
1037           UErrorCode *pErrorCode) {
1038    _UTF32Reset(cnv, UCNV_RESET_BOTH);
1039}
1040
/*
 * The two BOM byte sequences back to back: bytes 0..3 are 00 00 FE FF (UTF-32BE),
 * bytes 4..7 are FF FE 00 00 (UTF-32LE). _UTF32ToUnicodeWithOffsets indexes this
 * array directly with its detection state (1..3 and 5..7) to get the next
 * expected byte, and uses utf32BOM+(state&4) to select the sequence whose
 * already-matched bytes must be replayed.
 */
1041static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
1042
/*
 * To-Unicode function of the BOM-detecting "UTF-32" converter. It is installed
 * in both to-Unicode slots of _UTF32Impl and therefore copes with
 * pArgs->offsets being NULL or non-NULL.
 *
 * The detection state lives in cnv->mode between calls (see the state table in
 * the comment block before _UTF32Reset):
 *  - state 0 looks at the first byte: 00 or FF start a possible BOM, anything
 *    else selects UTF-32BE without consuming the byte;
 *  - states 1..3 and 5..7 compare the next byte with utf32BOM[state]; a
 *    complete 00 00 FE FF switches to state 8 (UTF-32BE), a complete
 *    FF FE 00 00 to state 9 (UTF-32LE), and the BOM bytes are not passed on;
 *    a mismatch falls back to UTF-32BE and feeds the bytes matched so far to
 *    the UTF-32BE function as ordinary input;
 *  - states 8 and 9 delegate the rest of the buffer to the UTF-32BE/LE
 *    functions (the OFFSET_LOGIC variants when offsets are requested).
 *
 * pArgs       source, sourceLimit, offsets, flush and converter are read;
 *             source is updated to the first unconsumed byte. sourceLimit and
 *             flush are modified temporarily while replaying BOM bytes and are
 *             restored afterwards.
 * pErrorCode  the main loop runs only while U_SUCCESS(*pErrorCode); the
 *             delegate functions receive it and may set it.
 *
 * NOTE(review): the end-of-input handling after the loop is not guarded by
 * U_SUCCESS(*pErrorCode); it runs whenever all input was consumed and flush is set.
 */
1043static void
1044_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1045                           UErrorCode *pErrorCode) {
1046    UConverter *cnv=pArgs->converter;
1047    const char *source=pArgs->source;
1048    const char *sourceLimit=pArgs->sourceLimit;
1049    int32_t *offsets=pArgs->offsets; /* start of the offsets written by this call, kept for the fix-up below */
1050
1051    int32_t state, offsetDelta;
1052    char b;
1053
1054    state=cnv->mode;
1055
1056    /*
1057     * If we detect a BOM in this buffer, then we must add the BOM size to the
1058     * offsets because the actual converter function will not see and count the BOM.
1059     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1060     */
1061    offsetDelta=0;
1062
1063    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1064        switch(state) {
1065        case 0:
1066            b=*source;
1067            if(b==0) {
1068                state=1; /* could be 00 00 FE FF */
1069            } else if(b==(char)0xff) {
1070                state=5; /* could be FF FE 00 00 */
1071            } else {
1072                state=8; /* default to UTF-32BE */
                /* the byte is not consumed: the next iteration hands it to case 8 */
1073                continue;
1074            }
1075            ++source;
1076            break;
1077        case 1:
1078        case 2:
1079        case 3:
1080        case 5:
1081        case 6:
1082        case 7:
            /* the state doubles as the index of the next expected BOM byte */
1083            if(*source==utf32BOM[state]) {
1084                ++state;
1085                ++source;
                /* 3->4 completes the BE BOM, 7->8 completes the LE BOM; the else-if keeps a fresh BE state 8 from being read as LE */
1086                if(state==4) {
1087                    state=8; /* detect UTF-32BE */
1088                    offsetDelta=(int32_t)(source-pArgs->source);
1089                } else if(state==8) {
1090                    state=9; /* detect UTF-32LE */
1091                    offsetDelta=(int32_t)(source-pArgs->source);
1092                }
1093            } else {
1094                /* switch to UTF-32BE and pass the previous bytes */
1095                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1096
1097                /* reset the source */
1098                source=pArgs->source;
1099
                /* state&3 is the number of BOM bytes matched so far (possibly across several buffers) */
1100                if(count==(state&3)) {
1101                    /* simple: all in the same buffer, just reset source */
1102                } else {
1103                    UBool oldFlush=pArgs->flush;
1104
1105                    /* some of the bytes are from a previous buffer, replay those first */
1106                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1107                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1108                    pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1109
1110                    /* no offsets: bytes from previous buffer, and not enough for output */
1111                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1112
1113                    /* restore real pointers; pArgs->source will be set in case 8/9 */
1114                    pArgs->sourceLimit=sourceLimit;
1115                    pArgs->flush=oldFlush;
1116                }
1117                state=8;
                /* re-dispatch in state 8 starting at the reset source position */
1118                continue;
1119            }
1120            break;
1121        case 8:
1122            /* call UTF-32BE */
1123            pArgs->source=source;
1124            if(offsets==NULL) {
1125                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1126            } else {
1127                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1128            }
1129            source=pArgs->source;
1130            break;
1131        case 9:
1132            /* call UTF-32LE */
1133            pArgs->source=source;
1134            if(offsets==NULL) {
1135                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1136            } else {
1137                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1138            }
1139            source=pArgs->source;
1140            break;
1141        default:
1142            break; /* does not occur */
1143        }
1144    }
1145
1146    /* add BOM size to offsets - see comment at offsetDelta declaration */
1147    if(offsets!=NULL && offsetDelta!=0) {
        /* the entries produced by this call lie between the saved start and the current pArgs->offsets */
1148        int32_t *offsetsLimit=pArgs->offsets;
1149        while(offsets<offsetsLimit) {
1150            *offsets++ += offsetDelta;
1151        }
1152    }
1153
1154    pArgs->source=source;
1155
1156    if(source==sourceLimit && pArgs->flush) {
1157        /* handle truncated input */
1158        switch(state) {
1159        case 0:
1160            break; /* no input at all, nothing to do */
1161        case 8:
            /* called with no remaining input while flush is set; presumably lets the BE function report a pending partial sequence - confirm */
1162            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1163            break;
1164        case 9:
            /* same as case 8, for the LE function */
1165            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1166            break;
1167        default:
1168            /* handle 0<state<8: call UTF-32BE with too-short input */
1169            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1170            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1171
1172            /* no offsets: not enough for output */
            /* unlike the replay inside the loop, pArgs->flush keeps the caller's value here */
1173            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1174            pArgs->source=source;
1175            pArgs->sourceLimit=sourceLimit;
1176            state=8;
1177            break;
1178        }
1179    }
1180
    /* remember the detection state for the next call */
1181    cnv->mode=state;
1182}
1183
1184static UChar32
1185_UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1186                   UErrorCode *pErrorCode) {
1187    switch(pArgs->converter->mode) {
1188    case 8:
1189        return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1190    case 9:
1191        return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1192    default:
1193        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1194    }
1195}
1196
/*
 * Implementation (function) table for the BOM-detecting "UTF-32" converter.
 * The initializer is positional and the UConverterImpl declaration is not
 * visible in this part of the file, so the slot notes below are derived from
 * the names of the functions placed in them.
 *
 * Both to-Unicode slots use _UTF32ToUnicodeWithOffsets, which detects the BOM
 * and then delegates. The from-Unicode slots are chosen at compile time:
 * the UTF-32BE functions when U_IS_BIG_ENDIAN, otherwise the UTF-32LE ones.
 */
1197static const UConverterImpl _UTF32Impl = {
1198    UCNV_UTF32, /* converter type constant */
1199
1200    NULL,
1201    NULL,
1202
1203    _UTF32Open,  /* open */
1204    NULL,
1205    _UTF32Reset, /* reset */
1206
1207    _UTF32ToUnicodeWithOffsets, /* to Unicode */
1208    _UTF32ToUnicodeWithOffsets, /* to Unicode, also filling offsets */
1209#if U_IS_BIG_ENDIAN
1210    T_UConverter_fromUnicode_UTF32_BE,
1211    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1212#else
1213    T_UConverter_fromUnicode_UTF32_LE,
1214    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1215#endif
1216    _UTF32GetNextUChar, /* single code point reader */
1217
1218    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1219    NULL,
1220    NULL,
1221    NULL,
1222    ucnv_getNonSurrogateUnicodeSet
1223};
1224
1225/* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianness of UTF-32 */
/*
 * Static description of the BOM-detecting UTF-32 converter. The initializer is
 * positional; notes on individual values are hedged where the
 * UConverterStaticData declaration (not visible here) would be needed to
 * confirm them. The four-byte array is given in the platform's byte order,
 * selected with U_IS_BIG_ENDIAN.
 */
1226static const UConverterStaticData _UTF32StaticData = {
1227    sizeof(UConverterStaticData),
1228    "UTF-32", /* converter name */
1229    1236,     /* CCSID, see the comment above */
1230    UCNV_IBM, UCNV_UTF32, 4, 4, /* 4, 4: presumably min/max bytes per character - confirm */
1231#if U_IS_BIG_ENDIAN
1232    { 0, 0, 0xff, 0xfd }, 4, /* presumably the substitution character U+FFFD, big-endian, and its length - confirm */
1233#else
1234    { 0xfd, 0xff, 0, 0 }, 4, /* same value in little-endian byte order */
1235#endif
1236    FALSE, FALSE,
1237    0,
1238    0,
1239    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1240};
1241
/*
 * Shared-data object for the BOM-detecting UTF-32 converter (non-static, so it
 * is visible to other translation units). It ties together the static
 * description (_UTF32StaticData) and the function table (_UTF32Impl).
 * NOTE(review): ~((uint32_t) 0) presumably is a reference-count value marking
 * built-in data that is never unloaded - confirm against UConverterSharedData.
 */
1242const UConverterSharedData _UTF32Data = {
1243    sizeof(UConverterSharedData), ~((uint32_t) 0),
1244    NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1245    0
1246};
1247
1248#endif
1249