1/*
2**********************************************************************
3*   Copyright (C) 2002-2015, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv_u32.c
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2002jul01
12*   created by: Markus W. Scherer
13*
14*   UTF-32 converter implementation. Used to be in ucnv_utf.c.
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
20
21#include "unicode/ucnv.h"
22#include "unicode/utf.h"
23#include "ucnv_bld.h"
24#include "ucnv_cnv.h"
25#include "cmemory.h"
26
/* Code point limits: values up to MAXIMUM_UCS2 fit into one 16-bit unit,
 * and MAXIMUM_UTF is the highest value accepted as a code point. */
#define MAXIMUM_UCS2            0xFFFF
#define MAXIMUM_UTF             0x10FFFF

/* Constants for combining a surrogate pair into one code point. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/* -SURROGATE_LOW_START + HALF_BASE, i.e. 9216 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of the converter's fromUnicodeStatus while a BOM is still to be written. */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
41
42/* UTF-32BE ----------------------------------------------------------------- */
43
44static void
45T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
46                                UErrorCode * err)
47{
48    const unsigned char *mySource = (unsigned char *) args->source;
49    UChar *myTarget = args->target;
50    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
51    const UChar *targetLimit = args->targetLimit;
52    unsigned char *toUBytes = args->converter->toUBytes;
53    uint32_t ch, i;
54
55    /* Restore state of current sequence */
56    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
57        i = args->converter->toULength;       /* restore # of bytes consumed */
58        args->converter->toULength = 0;
59
60        ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
61        args->converter->toUnicodeStatus = 0;
62        goto morebytes;
63    }
64
65    while (mySource < sourceLimit && myTarget < targetLimit) {
66        i = 0;
67        ch = 0;
68morebytes:
69        while (i < sizeof(uint32_t)) {
70            if (mySource < sourceLimit) {
71                ch = (ch << 8) | (uint8_t)(*mySource);
72                toUBytes[i++] = (char) *(mySource++);
73            }
74            else {
75                /* stores a partially calculated target*/
76                /* + 1 to make 0 a valid character */
77                args->converter->toUnicodeStatus = ch + 1;
78                args->converter->toULength = (int8_t) i;
79                goto donefornow;
80            }
81        }
82
83        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
84            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
85            if (ch <= MAXIMUM_UCS2)
86            {
87                /* fits in 16 bits */
88                *(myTarget++) = (UChar) ch;
89            }
90            else {
91                /* write out the surrogates */
92                *(myTarget++) = U16_LEAD(ch);
93                ch = U16_TRAIL(ch);
94                if (myTarget < targetLimit) {
95                    *(myTarget++) = (UChar)ch;
96                }
97                else {
98                    /* Put in overflow buffer (not handled here) */
99                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
100                    args->converter->UCharErrorBufferLength = 1;
101                    *err = U_BUFFER_OVERFLOW_ERROR;
102                    break;
103                }
104            }
105        }
106        else {
107            args->converter->toULength = (int8_t)i;
108            *err = U_ILLEGAL_CHAR_FOUND;
109            break;
110        }
111    }
112
113donefornow:
114    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
115        /* End of target buffer */
116        *err = U_BUFFER_OVERFLOW_ERROR;
117    }
118
119    args->target = myTarget;
120    args->source = (const char *) mySource;
121}
122
123static void
124T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
125                                             UErrorCode * err)
126{
127    const unsigned char *mySource = (unsigned char *) args->source;
128    UChar *myTarget = args->target;
129    int32_t *myOffsets = args->offsets;
130    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
131    const UChar *targetLimit = args->targetLimit;
132    unsigned char *toUBytes = args->converter->toUBytes;
133    uint32_t ch, i;
134    int32_t offsetNum = 0;
135
136    /* Restore state of current sequence */
137    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
138        i = args->converter->toULength;       /* restore # of bytes consumed */
139        args->converter->toULength = 0;
140
141        ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
142        args->converter->toUnicodeStatus = 0;
143        goto morebytes;
144    }
145
146    while (mySource < sourceLimit && myTarget < targetLimit) {
147        i = 0;
148        ch = 0;
149morebytes:
150        while (i < sizeof(uint32_t)) {
151            if (mySource < sourceLimit) {
152                ch = (ch << 8) | (uint8_t)(*mySource);
153                toUBytes[i++] = (char) *(mySource++);
154            }
155            else {
156                /* stores a partially calculated target*/
157                /* + 1 to make 0 a valid character */
158                args->converter->toUnicodeStatus = ch + 1;
159                args->converter->toULength = (int8_t) i;
160                goto donefornow;
161            }
162        }
163
164        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
165            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
166            if (ch <= MAXIMUM_UCS2) {
167                /* fits in 16 bits */
168                *(myTarget++) = (UChar) ch;
169                *(myOffsets++) = offsetNum;
170            }
171            else {
172                /* write out the surrogates */
173                *(myTarget++) = U16_LEAD(ch);
174                *myOffsets++ = offsetNum;
175                ch = U16_TRAIL(ch);
176                if (myTarget < targetLimit)
177                {
178                    *(myTarget++) = (UChar)ch;
179                    *(myOffsets++) = offsetNum;
180                }
181                else {
182                    /* Put in overflow buffer (not handled here) */
183                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
184                    args->converter->UCharErrorBufferLength = 1;
185                    *err = U_BUFFER_OVERFLOW_ERROR;
186                    break;
187                }
188            }
189        }
190        else {
191            args->converter->toULength = (int8_t)i;
192            *err = U_ILLEGAL_CHAR_FOUND;
193            break;
194        }
195        offsetNum += i;
196    }
197
198donefornow:
199    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
200    {
201        /* End of target buffer */
202        *err = U_BUFFER_OVERFLOW_ERROR;
203    }
204
205    args->target = myTarget;
206    args->source = (const char *) mySource;
207    args->offsets = myOffsets;
208}
209
210static void
211T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
212                                  UErrorCode * err)
213{
214    const UChar *mySource = args->source;
215    unsigned char *myTarget;
216    const UChar *sourceLimit = args->sourceLimit;
217    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
218    UChar32 ch, ch2;
219    unsigned int indexToWrite;
220    unsigned char temp[sizeof(uint32_t)];
221
222    if(mySource >= sourceLimit) {
223        /* no input, nothing to do */
224        return;
225    }
226
227    /* write the BOM if necessary */
228    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
229        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
230        ucnv_fromUWriteBytes(args->converter,
231                             bom, 4,
232                             &args->target, args->targetLimit,
233                             &args->offsets, -1,
234                             err);
235        args->converter->fromUnicodeStatus=0;
236    }
237
238    myTarget = (unsigned char *) args->target;
239    temp[0] = 0;
240
241    if (args->converter->fromUChar32) {
242        ch = args->converter->fromUChar32;
243        args->converter->fromUChar32 = 0;
244        goto lowsurogate;
245    }
246
247    while (mySource < sourceLimit && myTarget < targetLimit) {
248        ch = *(mySource++);
249
250        if (U_IS_SURROGATE(ch)) {
251            if (U_IS_LEAD(ch)) {
252lowsurogate:
253                if (mySource < sourceLimit) {
254                    ch2 = *mySource;
255                    if (U_IS_TRAIL(ch2)) {
256                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
257                        mySource++;
258                    }
259                    else {
260                        /* this is an unmatched trail code unit (2nd surrogate) */
261                        /* callback(illegal) */
262                        args->converter->fromUChar32 = ch;
263                        *err = U_ILLEGAL_CHAR_FOUND;
264                        break;
265                    }
266                }
267                else {
268                    /* ran out of source */
269                    args->converter->fromUChar32 = ch;
270                    if (args->flush) {
271                        /* this is an unmatched trail code unit (2nd surrogate) */
272                        /* callback(illegal) */
273                        *err = U_ILLEGAL_CHAR_FOUND;
274                    }
275                    break;
276                }
277            }
278            else {
279                /* this is an unmatched trail code unit (2nd surrogate) */
280                /* callback(illegal) */
281                args->converter->fromUChar32 = ch;
282                *err = U_ILLEGAL_CHAR_FOUND;
283                break;
284            }
285        }
286
287        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
288        temp[1] = (uint8_t) (ch >> 16 & 0x1F);
289        temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
290        temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
291
292        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
293            if (myTarget < targetLimit) {
294                *(myTarget++) = temp[indexToWrite];
295            }
296            else {
297                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
298                *err = U_BUFFER_OVERFLOW_ERROR;
299            }
300        }
301    }
302
303    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
304        *err = U_BUFFER_OVERFLOW_ERROR;
305    }
306
307    args->target = (char *) myTarget;
308    args->source = mySource;
309}
310
311static void
312T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
313                                               UErrorCode * err)
314{
315    const UChar *mySource = args->source;
316    unsigned char *myTarget;
317    int32_t *myOffsets;
318    const UChar *sourceLimit = args->sourceLimit;
319    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
320    UChar32 ch, ch2;
321    int32_t offsetNum = 0;
322    unsigned int indexToWrite;
323    unsigned char temp[sizeof(uint32_t)];
324
325    if(mySource >= sourceLimit) {
326        /* no input, nothing to do */
327        return;
328    }
329
330    /* write the BOM if necessary */
331    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
332        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
333        ucnv_fromUWriteBytes(args->converter,
334                             bom, 4,
335                             &args->target, args->targetLimit,
336                             &args->offsets, -1,
337                             err);
338        args->converter->fromUnicodeStatus=0;
339    }
340
341    myTarget = (unsigned char *) args->target;
342    myOffsets = args->offsets;
343    temp[0] = 0;
344
345    if (args->converter->fromUChar32) {
346        ch = args->converter->fromUChar32;
347        args->converter->fromUChar32 = 0;
348        goto lowsurogate;
349    }
350
351    while (mySource < sourceLimit && myTarget < targetLimit) {
352        ch = *(mySource++);
353
354        if (U_IS_SURROGATE(ch)) {
355            if (U_IS_LEAD(ch)) {
356lowsurogate:
357                if (mySource < sourceLimit) {
358                    ch2 = *mySource;
359                    if (U_IS_TRAIL(ch2)) {
360                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
361                        mySource++;
362                    }
363                    else {
364                        /* this is an unmatched trail code unit (2nd surrogate) */
365                        /* callback(illegal) */
366                        args->converter->fromUChar32 = ch;
367                        *err = U_ILLEGAL_CHAR_FOUND;
368                        break;
369                    }
370                }
371                else {
372                    /* ran out of source */
373                    args->converter->fromUChar32 = ch;
374                    if (args->flush) {
375                        /* this is an unmatched trail code unit (2nd surrogate) */
376                        /* callback(illegal) */
377                        *err = U_ILLEGAL_CHAR_FOUND;
378                    }
379                    break;
380                }
381            }
382            else {
383                /* this is an unmatched trail code unit (2nd surrogate) */
384                /* callback(illegal) */
385                args->converter->fromUChar32 = ch;
386                *err = U_ILLEGAL_CHAR_FOUND;
387                break;
388            }
389        }
390
391        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
392        temp[1] = (uint8_t) (ch >> 16 & 0x1F);
393        temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
394        temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
395
396        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
397            if (myTarget < targetLimit) {
398                *(myTarget++) = temp[indexToWrite];
399                *(myOffsets++) = offsetNum;
400            }
401            else {
402                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
403                *err = U_BUFFER_OVERFLOW_ERROR;
404            }
405        }
406        offsetNum = offsetNum + 1 + (temp[1] != 0);
407    }
408
409    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
410        *err = U_BUFFER_OVERFLOW_ERROR;
411    }
412
413    args->target = (char *) myTarget;
414    args->source = mySource;
415    args->offsets = myOffsets;
416}
417
418static UChar32
419T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
420                                   UErrorCode* err)
421{
422    const uint8_t *mySource;
423    UChar32 myUChar;
424    int32_t length;
425
426    mySource = (const uint8_t *)args->source;
427    if (mySource >= (const uint8_t *)args->sourceLimit)
428    {
429        /* no input */
430        *err = U_INDEX_OUTOFBOUNDS_ERROR;
431        return 0xffff;
432    }
433
434    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
435    if (length < 4)
436    {
437        /* got a partial character */
438        uprv_memcpy(args->converter->toUBytes, mySource, length);
439        args->converter->toULength = (int8_t)length;
440        args->source = (const char *)(mySource + length);
441        *err = U_TRUNCATED_CHAR_FOUND;
442        return 0xffff;
443    }
444
445    /* Don't even try to do a direct cast because the value may be on an odd address. */
446    myUChar = ((UChar32)mySource[0] << 24)
447            | ((UChar32)mySource[1] << 16)
448            | ((UChar32)mySource[2] << 8)
449            | ((UChar32)mySource[3]);
450
451    args->source = (const char *)(mySource + 4);
452    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
453        return myUChar;
454    }
455
456    uprv_memcpy(args->converter->toUBytes, mySource, 4);
457    args->converter->toULength = 4;
458
459    *err = U_ILLEGAL_CHAR_FOUND;
460    return 0xffff;
461}
462
463static const UConverterImpl _UTF32BEImpl = {
464    UCNV_UTF32_BigEndian,
465
466    NULL,
467    NULL,
468
469    NULL,
470    NULL,
471    NULL,
472
473    T_UConverter_toUnicode_UTF32_BE,
474    T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
475    T_UConverter_fromUnicode_UTF32_BE,
476    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
477    T_UConverter_getNextUChar_UTF32_BE,
478
479    NULL,
480    NULL,
481    NULL,
482    NULL,
483    ucnv_getNonSurrogateUnicodeSet
484};
485
486/* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
487static const UConverterStaticData _UTF32BEStaticData = {
488    sizeof(UConverterStaticData),
489    "UTF-32BE",
490    1232,
491    UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
492    { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
493    0,
494    0,
495    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
496};
497
498const UConverterSharedData _UTF32BEData = {
499    sizeof(UConverterSharedData), ~((uint32_t) 0),
500    NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
501    0
502};
503
504/* UTF-32LE ---------------------------------------------------------- */
505
506static void
507T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
508                                UErrorCode * err)
509{
510    const unsigned char *mySource = (unsigned char *) args->source;
511    UChar *myTarget = args->target;
512    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
513    const UChar *targetLimit = args->targetLimit;
514    unsigned char *toUBytes = args->converter->toUBytes;
515    uint32_t ch, i;
516
517    /* Restore state of current sequence */
518    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
519    {
520        i = args->converter->toULength;       /* restore # of bytes consumed */
521        args->converter->toULength = 0;
522
523        /* Stores the previously calculated ch from a previous call*/
524        ch = args->converter->toUnicodeStatus - 1;
525        args->converter->toUnicodeStatus = 0;
526        goto morebytes;
527    }
528
529    while (mySource < sourceLimit && myTarget < targetLimit)
530    {
531        i = 0;
532        ch = 0;
533morebytes:
534        while (i < sizeof(uint32_t))
535        {
536            if (mySource < sourceLimit)
537            {
538                ch |= ((uint8_t)(*mySource)) << (i * 8);
539                toUBytes[i++] = (char) *(mySource++);
540            }
541            else
542            {
543                /* stores a partially calculated target*/
544                /* + 1 to make 0 a valid character */
545                args->converter->toUnicodeStatus = ch + 1;
546                args->converter->toULength = (int8_t) i;
547                goto donefornow;
548            }
549        }
550
551        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
552            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
553            if (ch <= MAXIMUM_UCS2) {
554                /* fits in 16 bits */
555                *(myTarget++) = (UChar) ch;
556            }
557            else {
558                /* write out the surrogates */
559                *(myTarget++) = U16_LEAD(ch);
560                ch = U16_TRAIL(ch);
561                if (myTarget < targetLimit) {
562                    *(myTarget++) = (UChar)ch;
563                }
564                else {
565                    /* Put in overflow buffer (not handled here) */
566                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
567                    args->converter->UCharErrorBufferLength = 1;
568                    *err = U_BUFFER_OVERFLOW_ERROR;
569                    break;
570                }
571            }
572        }
573        else {
574            args->converter->toULength = (int8_t)i;
575            *err = U_ILLEGAL_CHAR_FOUND;
576            break;
577        }
578    }
579
580donefornow:
581    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
582    {
583        /* End of target buffer */
584        *err = U_BUFFER_OVERFLOW_ERROR;
585    }
586
587    args->target = myTarget;
588    args->source = (const char *) mySource;
589}
590
591static void
592T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
593                                             UErrorCode * err)
594{
595    const unsigned char *mySource = (unsigned char *) args->source;
596    UChar *myTarget = args->target;
597    int32_t *myOffsets = args->offsets;
598    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
599    const UChar *targetLimit = args->targetLimit;
600    unsigned char *toUBytes = args->converter->toUBytes;
601    uint32_t ch, i;
602    int32_t offsetNum = 0;
603
604    /* Restore state of current sequence */
605    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
606    {
607        i = args->converter->toULength;       /* restore # of bytes consumed */
608        args->converter->toULength = 0;
609
610        /* Stores the previously calculated ch from a previous call*/
611        ch = args->converter->toUnicodeStatus - 1;
612        args->converter->toUnicodeStatus = 0;
613        goto morebytes;
614    }
615
616    while (mySource < sourceLimit && myTarget < targetLimit)
617    {
618        i = 0;
619        ch = 0;
620morebytes:
621        while (i < sizeof(uint32_t))
622        {
623            if (mySource < sourceLimit)
624            {
625                ch |= ((uint8_t)(*mySource)) << (i * 8);
626                toUBytes[i++] = (char) *(mySource++);
627            }
628            else
629            {
630                /* stores a partially calculated target*/
631                /* + 1 to make 0 a valid character */
632                args->converter->toUnicodeStatus = ch + 1;
633                args->converter->toULength = (int8_t) i;
634                goto donefornow;
635            }
636        }
637
638        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
639        {
640            /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
641            if (ch <= MAXIMUM_UCS2)
642            {
643                /* fits in 16 bits */
644                *(myTarget++) = (UChar) ch;
645                *(myOffsets++) = offsetNum;
646            }
647            else {
648                /* write out the surrogates */
649                *(myTarget++) = U16_LEAD(ch);
650                *(myOffsets++) = offsetNum;
651                ch = U16_TRAIL(ch);
652                if (myTarget < targetLimit)
653                {
654                    *(myTarget++) = (UChar)ch;
655                    *(myOffsets++) = offsetNum;
656                }
657                else
658                {
659                    /* Put in overflow buffer (not handled here) */
660                    args->converter->UCharErrorBuffer[0] = (UChar) ch;
661                    args->converter->UCharErrorBufferLength = 1;
662                    *err = U_BUFFER_OVERFLOW_ERROR;
663                    break;
664                }
665            }
666        }
667        else
668        {
669            args->converter->toULength = (int8_t)i;
670            *err = U_ILLEGAL_CHAR_FOUND;
671            break;
672        }
673        offsetNum += i;
674    }
675
676donefornow:
677    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
678    {
679        /* End of target buffer */
680        *err = U_BUFFER_OVERFLOW_ERROR;
681    }
682
683    args->target = myTarget;
684    args->source = (const char *) mySource;
685    args->offsets = myOffsets;
686}
687
688static void
689T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
690                                  UErrorCode * err)
691{
692    const UChar *mySource = args->source;
693    unsigned char *myTarget;
694    const UChar *sourceLimit = args->sourceLimit;
695    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
696    UChar32 ch, ch2;
697    unsigned int indexToWrite;
698    unsigned char temp[sizeof(uint32_t)];
699
700    if(mySource >= sourceLimit) {
701        /* no input, nothing to do */
702        return;
703    }
704
705    /* write the BOM if necessary */
706    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
707        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
708        ucnv_fromUWriteBytes(args->converter,
709                             bom, 4,
710                             &args->target, args->targetLimit,
711                             &args->offsets, -1,
712                             err);
713        args->converter->fromUnicodeStatus=0;
714    }
715
716    myTarget = (unsigned char *) args->target;
717    temp[3] = 0;
718
719    if (args->converter->fromUChar32)
720    {
721        ch = args->converter->fromUChar32;
722        args->converter->fromUChar32 = 0;
723        goto lowsurogate;
724    }
725
726    while (mySource < sourceLimit && myTarget < targetLimit)
727    {
728        ch = *(mySource++);
729
730        if (U16_IS_SURROGATE(ch)) {
731            if (U16_IS_LEAD(ch))
732            {
733lowsurogate:
734                if (mySource < sourceLimit)
735                {
736                    ch2 = *mySource;
737                    if (U16_IS_TRAIL(ch2)) {
738                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
739                        mySource++;
740                    }
741                    else {
742                        /* this is an unmatched trail code unit (2nd surrogate) */
743                        /* callback(illegal) */
744                        args->converter->fromUChar32 = ch;
745                        *err = U_ILLEGAL_CHAR_FOUND;
746                        break;
747                    }
748                }
749                else {
750                    /* ran out of source */
751                    args->converter->fromUChar32 = ch;
752                    if (args->flush) {
753                        /* this is an unmatched trail code unit (2nd surrogate) */
754                        /* callback(illegal) */
755                        *err = U_ILLEGAL_CHAR_FOUND;
756                    }
757                    break;
758                }
759            }
760            else {
761                /* this is an unmatched trail code unit (2nd surrogate) */
762                /* callback(illegal) */
763                args->converter->fromUChar32 = ch;
764                *err = U_ILLEGAL_CHAR_FOUND;
765                break;
766            }
767        }
768
769        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
770        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
771        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
772        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
773
774        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
775        {
776            if (myTarget < targetLimit)
777            {
778                *(myTarget++) = temp[indexToWrite];
779            }
780            else
781            {
782                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
783                *err = U_BUFFER_OVERFLOW_ERROR;
784            }
785        }
786    }
787
788    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
789    {
790        *err = U_BUFFER_OVERFLOW_ERROR;
791    }
792
793    args->target = (char *) myTarget;
794    args->source = mySource;
795}
796
797static void
798T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
799                                               UErrorCode * err)
800{
801    const UChar *mySource = args->source;
802    unsigned char *myTarget;
803    int32_t *myOffsets;
804    const UChar *sourceLimit = args->sourceLimit;
805    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
806    UChar32 ch, ch2;
807    unsigned int indexToWrite;
808    unsigned char temp[sizeof(uint32_t)];
809    int32_t offsetNum = 0;
810
811    if(mySource >= sourceLimit) {
812        /* no input, nothing to do */
813        return;
814    }
815
816    /* write the BOM if necessary */
817    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
818        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
819        ucnv_fromUWriteBytes(args->converter,
820                             bom, 4,
821                             &args->target, args->targetLimit,
822                             &args->offsets, -1,
823                             err);
824        args->converter->fromUnicodeStatus=0;
825    }
826
827    myTarget = (unsigned char *) args->target;
828    myOffsets = args->offsets;
829    temp[3] = 0;
830
831    if (args->converter->fromUChar32)
832    {
833        ch = args->converter->fromUChar32;
834        args->converter->fromUChar32 = 0;
835        goto lowsurogate;
836    }
837
838    while (mySource < sourceLimit && myTarget < targetLimit)
839    {
840        ch = *(mySource++);
841
842        if (U16_IS_SURROGATE(ch)) {
843            if (U16_IS_LEAD(ch))
844            {
845lowsurogate:
846                if (mySource < sourceLimit)
847                {
848                    ch2 = *mySource;
849                    if (U16_IS_TRAIL(ch2))
850                    {
851                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
852                        mySource++;
853                    }
854                    else {
855                        /* this is an unmatched trail code unit (2nd surrogate) */
856                        /* callback(illegal) */
857                        args->converter->fromUChar32 = ch;
858                        *err = U_ILLEGAL_CHAR_FOUND;
859                        break;
860                    }
861                }
862                else {
863                    /* ran out of source */
864                    args->converter->fromUChar32 = ch;
865                    if (args->flush) {
866                        /* this is an unmatched trail code unit (2nd surrogate) */
867                        /* callback(illegal) */
868                        *err = U_ILLEGAL_CHAR_FOUND;
869                    }
870                    break;
871                }
872            }
873            else {
874                /* this is an unmatched trail code unit (2nd surrogate) */
875                /* callback(illegal) */
876                args->converter->fromUChar32 = ch;
877                *err = U_ILLEGAL_CHAR_FOUND;
878                break;
879            }
880        }
881
882        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
883        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
884        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
885        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
886
887        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
888        {
889            if (myTarget < targetLimit)
890            {
891                *(myTarget++) = temp[indexToWrite];
892                *(myOffsets++) = offsetNum;
893            }
894            else
895            {
896                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
897                *err = U_BUFFER_OVERFLOW_ERROR;
898            }
899        }
900        offsetNum = offsetNum + 1 + (temp[2] != 0);
901    }
902
903    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
904    {
905        *err = U_BUFFER_OVERFLOW_ERROR;
906    }
907
908    args->target = (char *) myTarget;
909    args->source = mySource;
910    args->offsets = myOffsets;
911}
912
913static UChar32
914T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
915                                   UErrorCode* err)
916{
917    const uint8_t *mySource;
918    UChar32 myUChar;
919    int32_t length;
920
921    mySource = (const uint8_t *)args->source;
922    if (mySource >= (const uint8_t *)args->sourceLimit)
923    {
924        /* no input */
925        *err = U_INDEX_OUTOFBOUNDS_ERROR;
926        return 0xffff;
927    }
928
929    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
930    if (length < 4)
931    {
932        /* got a partial character */
933        uprv_memcpy(args->converter->toUBytes, mySource, length);
934        args->converter->toULength = (int8_t)length;
935        args->source = (const char *)(mySource + length);
936        *err = U_TRUNCATED_CHAR_FOUND;
937        return 0xffff;
938    }
939
940    /* Don't even try to do a direct cast because the value may be on an odd address. */
941    myUChar = ((UChar32)mySource[3] << 24)
942            | ((UChar32)mySource[2] << 16)
943            | ((UChar32)mySource[1] << 8)
944            | ((UChar32)mySource[0]);
945
946    args->source = (const char *)(mySource + 4);
947    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
948        return myUChar;
949    }
950
951    uprv_memcpy(args->converter->toUBytes, mySource, 4);
952    args->converter->toULength = 4;
953
954    *err = U_ILLEGAL_CHAR_FOUND;
955    return 0xffff;
956}
957
958static const UConverterImpl _UTF32LEImpl = {
959    UCNV_UTF32_LittleEndian,
960
961    NULL,
962    NULL,
963
964    NULL,
965    NULL,
966    NULL,
967
968    T_UConverter_toUnicode_UTF32_LE,
969    T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
970    T_UConverter_fromUnicode_UTF32_LE,
971    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
972    T_UConverter_getNextUChar_UTF32_LE,
973
974    NULL,
975    NULL,
976    NULL,
977    NULL,
978    ucnv_getNonSurrogateUnicodeSet
979};
980
/* The 1234 CCSID refers to any version of Unicode with a little-endian encoding of UTF-32 */
982static const UConverterStaticData _UTF32LEStaticData = {
983    sizeof(UConverterStaticData),
984    "UTF-32LE",
985    1234,
986    UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
987    { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
988    0,
989    0,
990    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
991};
992
993
994const UConverterSharedData _UTF32LEData = {
995    sizeof(UConverterSharedData), ~((uint32_t) 0),
996    NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
997    0
998};
999
1000/* UTF-32 (Detect BOM) ------------------------------------------------------ */
1001
1002/*
1003 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1004 * accordingly.
1005 *
1006 * State values:
1007 * 0    initial state
1008 * 1    saw 00
1009 * 2    saw 00 00
1010 * 3    saw 00 00 FE
1011 * 4    -
1012 * 5    saw FF
1013 * 6    saw FF FE
1014 * 7    saw FF FE 00
1015 * 8    UTF-32BE mode
1016 * 9    UTF-32LE mode
1017 *
1018 * During detection: state&3==number of matching bytes so far.
1019 *
1020 * On output, emit U+FEFF as the first code point.
1021 */
1022
1023static void
1024_UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1025    if(choice<=UCNV_RESET_TO_UNICODE) {
1026        /* reset toUnicode: state=0 */
1027        cnv->mode=0;
1028    }
1029    if(choice!=UCNV_RESET_TO_UNICODE) {
1030        /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1031        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1032    }
1033}
1034
1035static void
1036_UTF32Open(UConverter *cnv,
1037           UConverterLoadArgs *pArgs,
1038           UErrorCode *pErrorCode) {
1039    _UTF32Reset(cnv, UCNV_RESET_BOTH);
1040}
1041
/*
 * Both UTF-32 byte order marks back to back, so that a detection state can
 * index the table directly: bytes 0..3 are the big-endian BOM, bytes 4..7
 * the little-endian BOM.
 */
static const char utf32BOM[8]={
    0, 0, (char)0xfe, (char)0xff,   /* 00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0    /* FF FE 00 00 */
};
1043
1044static void
1045_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1046                           UErrorCode *pErrorCode) {
1047    UConverter *cnv=pArgs->converter;
1048    const char *source=pArgs->source;
1049    const char *sourceLimit=pArgs->sourceLimit;
1050    int32_t *offsets=pArgs->offsets;
1051
1052    int32_t state, offsetDelta;
1053    char b;
1054
1055    state=cnv->mode;
1056
1057    /*
1058     * If we detect a BOM in this buffer, then we must add the BOM size to the
1059     * offsets because the actual converter function will not see and count the BOM.
1060     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1061     */
1062    offsetDelta=0;
1063
1064    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1065        switch(state) {
1066        case 0:
1067            b=*source;
1068            if(b==0) {
1069                state=1; /* could be 00 00 FE FF */
1070            } else if(b==(char)0xff) {
1071                state=5; /* could be FF FE 00 00 */
1072            } else {
1073                state=8; /* default to UTF-32BE */
1074                continue;
1075            }
1076            ++source;
1077            break;
1078        case 1:
1079        case 2:
1080        case 3:
1081        case 5:
1082        case 6:
1083        case 7:
1084            if(*source==utf32BOM[state]) {
1085                ++state;
1086                ++source;
1087                if(state==4) {
1088                    state=8; /* detect UTF-32BE */
1089                    offsetDelta=(int32_t)(source-pArgs->source);
1090                } else if(state==8) {
1091                    state=9; /* detect UTF-32LE */
1092                    offsetDelta=(int32_t)(source-pArgs->source);
1093                }
1094            } else {
1095                /* switch to UTF-32BE and pass the previous bytes */
1096                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1097
1098                /* reset the source */
1099                source=pArgs->source;
1100
1101                if(count==(state&3)) {
1102                    /* simple: all in the same buffer, just reset source */
1103                } else {
1104                    UBool oldFlush=pArgs->flush;
1105
1106                    /* some of the bytes are from a previous buffer, replay those first */
1107                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1108                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1109                    pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1110
1111                    /* no offsets: bytes from previous buffer, and not enough for output */
1112                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1113
1114                    /* restore real pointers; pArgs->source will be set in case 8/9 */
1115                    pArgs->sourceLimit=sourceLimit;
1116                    pArgs->flush=oldFlush;
1117                }
1118                state=8;
1119                continue;
1120            }
1121            break;
1122        case 8:
1123            /* call UTF-32BE */
1124            pArgs->source=source;
1125            if(offsets==NULL) {
1126                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1127            } else {
1128                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1129            }
1130            source=pArgs->source;
1131            break;
1132        case 9:
1133            /* call UTF-32LE */
1134            pArgs->source=source;
1135            if(offsets==NULL) {
1136                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1137            } else {
1138                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1139            }
1140            source=pArgs->source;
1141            break;
1142        default:
1143            break; /* does not occur */
1144        }
1145    }
1146
1147    /* add BOM size to offsets - see comment at offsetDelta declaration */
1148    if(offsets!=NULL && offsetDelta!=0) {
1149        int32_t *offsetsLimit=pArgs->offsets;
1150        while(offsets<offsetsLimit) {
1151            *offsets++ += offsetDelta;
1152        }
1153    }
1154
1155    pArgs->source=source;
1156
1157    if(source==sourceLimit && pArgs->flush) {
1158        /* handle truncated input */
1159        switch(state) {
1160        case 0:
1161            break; /* no input at all, nothing to do */
1162        case 8:
1163            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1164            break;
1165        case 9:
1166            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1167            break;
1168        default:
1169            /* handle 0<state<8: call UTF-32BE with too-short input */
1170            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1171            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1172
1173            /* no offsets: not enough for output */
1174            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1175            pArgs->source=source;
1176            pArgs->sourceLimit=sourceLimit;
1177            state=8;
1178            break;
1179        }
1180    }
1181
1182    cnv->mode=state;
1183}
1184
1185static UChar32
1186_UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1187                   UErrorCode *pErrorCode) {
1188    switch(pArgs->converter->mode) {
1189    case 8:
1190        return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1191    case 9:
1192        return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1193    default:
1194        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1195    }
1196}
1197
1198static const UConverterImpl _UTF32Impl = {
1199    UCNV_UTF32,
1200
1201    NULL,
1202    NULL,
1203
1204    _UTF32Open,
1205    NULL,
1206    _UTF32Reset,
1207
1208    _UTF32ToUnicodeWithOffsets,
1209    _UTF32ToUnicodeWithOffsets,
1210#if U_IS_BIG_ENDIAN
1211    T_UConverter_fromUnicode_UTF32_BE,
1212    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1213#else
1214    T_UConverter_fromUnicode_UTF32_LE,
1215    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1216#endif
1217    _UTF32GetNextUChar,
1218
1219    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1220    NULL,
1221    NULL,
1222    NULL,
1223    ucnv_getNonSurrogateUnicodeSet
1224};
1225
/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
1227static const UConverterStaticData _UTF32StaticData = {
1228    sizeof(UConverterStaticData),
1229    "UTF-32",
1230    1236,
1231    UCNV_IBM, UCNV_UTF32, 4, 4,
1232#if U_IS_BIG_ENDIAN
1233    { 0, 0, 0xff, 0xfd }, 4,
1234#else
1235    { 0xfd, 0xff, 0, 0 }, 4,
1236#endif
1237    FALSE, FALSE,
1238    0,
1239    0,
1240    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1241};
1242
1243const UConverterSharedData _UTF32Data = {
1244    sizeof(UConverterSharedData), ~((uint32_t) 0),
1245    NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1246    0
1247};
1248
1249#endif
1250