1/*
2******************************************************************************
3*
4*   Copyright (C) 2001-2013, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*
9* File ustrtrns.cpp
10*
11* Modification History:
12*
13*   Date        Name        Description
14*   9/10/2001    Ram    Creation.
15******************************************************************************
16*/
17
18/*******************************************************************************
19 *
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
22 *
23 *******************************************************************************
24 */
25
26
27#include "unicode/putil.h"
28#include "unicode/ustring.h"
29#include "unicode/utf.h"
30#include "unicode/utf8.h"
31#include "unicode/utf16.h"
32#include "cstring.h"
33#include "cmemory.h"
34#include "ustr_imp.h"
35#include "uassert.h"
36
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression in any context (e.g. as a sizeof operand).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
38
39U_CAPI UChar* U_EXPORT2
40u_strFromUTF32WithSub(UChar *dest,
41               int32_t destCapacity,
42               int32_t *pDestLength,
43               const UChar32 *src,
44               int32_t srcLength,
45               UChar32 subchar, int32_t *pNumSubstitutions,
46               UErrorCode *pErrorCode) {
47    const UChar32 *srcLimit;
48    UChar32 ch;
49    UChar *destLimit;
50    UChar *pDest;
51    int32_t reqLength;
52    int32_t numSubstitutions;
53
54    /* args check */
55    if(U_FAILURE(*pErrorCode)){
56        return NULL;
57    }
58    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61    ) {
62        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63        return NULL;
64    }
65
66    if(pNumSubstitutions != NULL) {
67        *pNumSubstitutions = 0;
68    }
69
70    pDest = dest;
71    destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72    reqLength = 0;
73    numSubstitutions = 0;
74
75    if(srcLength < 0) {
76        /* simple loop for conversion of a NUL-terminated BMP string */
77        while((ch=*src) != 0 &&
78              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79            ++src;
80            if(pDest < destLimit) {
81                *pDest++ = (UChar)ch;
82            } else {
83                ++reqLength;
84            }
85        }
86        srcLimit = src;
87        if(ch != 0) {
88            /* "complicated" case, find the end of the remaining string */
89            while(*++srcLimit != 0) {}
90        }
91    } else {
92      srcLimit = (src!=NULL)?(src + srcLength):NULL;
93    }
94
95    /* convert with length */
96    while(src < srcLimit) {
97        ch = *src++;
98        do {
99            /* usually "loops" once; twice only for writing subchar */
100            if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101                if(pDest < destLimit) {
102                    *pDest++ = (UChar)ch;
103                } else {
104                    ++reqLength;
105                }
106                break;
107            } else if(0x10000 <= ch && ch <= 0x10ffff) {
108                if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109                    *pDest++ = U16_LEAD(ch);
110                    *pDest++ = U16_TRAIL(ch);
111                } else {
112                    reqLength += 2;
113                }
114                break;
115            } else if((ch = subchar) < 0) {
116                /* surrogate code point, or not a Unicode code point at all */
117                *pErrorCode = U_INVALID_CHAR_FOUND;
118                return NULL;
119            } else {
120                ++numSubstitutions;
121            }
122        } while(TRUE);
123    }
124
125    reqLength += (int32_t)(pDest - dest);
126    if(pDestLength) {
127        *pDestLength = reqLength;
128    }
129    if(pNumSubstitutions != NULL) {
130        *pNumSubstitutions = numSubstitutions;
131    }
132
133    /* Terminate the buffer */
134    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135
136    return dest;
137}
138
139U_CAPI UChar* U_EXPORT2
140u_strFromUTF32(UChar *dest,
141               int32_t destCapacity,
142               int32_t *pDestLength,
143               const UChar32 *src,
144               int32_t srcLength,
145               UErrorCode *pErrorCode) {
146    return u_strFromUTF32WithSub(
147            dest, destCapacity, pDestLength,
148            src, srcLength,
149            U_SENTINEL, NULL,
150            pErrorCode);
151}
152
153U_CAPI UChar32* U_EXPORT2
154u_strToUTF32WithSub(UChar32 *dest,
155             int32_t destCapacity,
156             int32_t *pDestLength,
157             const UChar *src,
158             int32_t srcLength,
159             UChar32 subchar, int32_t *pNumSubstitutions,
160             UErrorCode *pErrorCode) {
161    const UChar *srcLimit;
162    UChar32 ch;
163    UChar ch2;
164    UChar32 *destLimit;
165    UChar32 *pDest;
166    int32_t reqLength;
167    int32_t numSubstitutions;
168
169    /* args check */
170    if(U_FAILURE(*pErrorCode)){
171        return NULL;
172    }
173    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176    ) {
177        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178        return NULL;
179    }
180
181    if(pNumSubstitutions != NULL) {
182        *pNumSubstitutions = 0;
183    }
184
185    pDest = dest;
186    destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187    reqLength = 0;
188    numSubstitutions = 0;
189
190    if(srcLength < 0) {
191        /* simple loop for conversion of a NUL-terminated BMP string */
192        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193            ++src;
194            if(pDest < destLimit) {
195                *pDest++ = ch;
196            } else {
197                ++reqLength;
198            }
199        }
200        srcLimit = src;
201        if(ch != 0) {
202            /* "complicated" case, find the end of the remaining string */
203            while(*++srcLimit != 0) {}
204        }
205    } else {
206        srcLimit = (src!=NULL)?(src + srcLength):NULL;
207    }
208
209    /* convert with length */
210    while(src < srcLimit) {
211        ch = *src++;
212        if(!U16_IS_SURROGATE(ch)) {
213            /* write or count ch below */
214        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215            ++src;
216            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217        } else if((ch = subchar) < 0) {
218            /* unpaired surrogate */
219            *pErrorCode = U_INVALID_CHAR_FOUND;
220            return NULL;
221        } else {
222            ++numSubstitutions;
223        }
224        if(pDest < destLimit) {
225            *pDest++ = ch;
226        } else {
227            ++reqLength;
228        }
229    }
230
231    reqLength += (int32_t)(pDest - dest);
232    if(pDestLength) {
233        *pDestLength = reqLength;
234    }
235    if(pNumSubstitutions != NULL) {
236        *pNumSubstitutions = numSubstitutions;
237    }
238
239    /* Terminate the buffer */
240    u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241
242    return dest;
243}
244
245U_CAPI UChar32* U_EXPORT2
246u_strToUTF32(UChar32 *dest,
247             int32_t destCapacity,
248             int32_t *pDestLength,
249             const UChar *src,
250             int32_t srcLength,
251             UErrorCode *pErrorCode) {
252    return u_strToUTF32WithSub(
253            dest, destCapacity, pDestLength,
254            src, srcLength,
255            U_SENTINEL, NULL,
256            pErrorCode);
257}
258
259/* for utf8_nextCharSafeBodyTerminated() */
260static const UChar32
261utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
262
263/*
264 * Version of utf8_nextCharSafeBody() with the following differences:
265 * - checks for NUL termination instead of length
266 * - works with pointers instead of indexes
267 * - always strict (strict==-1)
268 *
269 * *ps points to after the lead byte and will be moved to after the last trail byte.
270 * c is the lead byte.
271 * @return the code point, or U_SENTINEL
272 */
273static UChar32
274utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
275    const uint8_t *s=*ps;
276    uint8_t trail, illegal=0;
277    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
278    U_ASSERT(count<6);
279    U8_MASK_LEAD_BYTE((c), count);
280    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
281    switch(count) {
282    /* each branch falls through to the next one */
283    case 5:
284    case 4:
285        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
286        illegal=1;
287        break;
288    case 3:
289        trail=(uint8_t)(*s++ - 0x80);
290        c=(c<<6)|trail;
291        if(trail>0x3f || c>=0x110) {
292            /* not a trail byte, or code point>0x10ffff (outside Unicode) */
293            illegal=1;
294            break;
295        }
296    case 2: /*fall through*/
297        trail=(uint8_t)(*s++ - 0x80);
298        if(trail>0x3f) {
299            /* not a trail byte */
300            illegal=1;
301            break;
302        }
303        c=(c<<6)|trail;
304    case 1: /*fall through*/
305        trail=(uint8_t)(*s++ - 0x80);
306        if(trail>0x3f) {
307            /* not a trail byte */
308            illegal=1;
309        }
310        c=(c<<6)|trail;
311        break;
312    case 0:
313        return U_SENTINEL;
314    /* no default branch to optimize switch()  - all values are covered */
315    }
316
317    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
318    /* illegal is also set if count>=4 */
319    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
320        /* error handling */
321        /* don't go beyond this sequence */
322        s=*ps;
323        while(count>0 && U8_IS_TRAIL(*s)) {
324            ++s;
325            --count;
326        }
327        c=U_SENTINEL;
328    }
329    *ps=s;
330    return c;
331}
332
333/*
334 * Version of utf8_nextCharSafeBody() with the following differences:
335 * - works with pointers instead of indexes
336 * - always strict (strict==-1)
337 *
338 * *ps points to after the lead byte and will be moved to after the last trail byte.
339 * c is the lead byte.
340 * @return the code point, or U_SENTINEL
341 */
342static UChar32
343utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
344    const uint8_t *s=*ps;
345    uint8_t trail, illegal=0;
346    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
347    if((limit-s)>=count) {
348        U8_MASK_LEAD_BYTE((c), count);
349        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
350        switch(count) {
351        /* each branch falls through to the next one */
352        case 5:
353        case 4:
354            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
355            illegal=1;
356            break;
357        case 3:
358            trail=*s++;
359            c=(c<<6)|(trail&0x3f);
360            if(c<0x110) {
361                illegal|=(trail&0xc0)^0x80;
362            } else {
363                /* code point>0x10ffff, outside Unicode */
364                illegal=1;
365                break;
366            }
367        case 2: /*fall through*/
368            trail=*s++;
369            c=(c<<6)|(trail&0x3f);
370            illegal|=(trail&0xc0)^0x80;
371        case 1: /*fall through*/
372            trail=*s++;
373            c=(c<<6)|(trail&0x3f);
374            illegal|=(trail&0xc0)^0x80;
375            break;
376        case 0:
377            return U_SENTINEL;
378        /* no default branch to optimize switch()  - all values are covered */
379        }
380    } else {
381        illegal=1; /* too few bytes left */
382    }
383
384    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
385    /* illegal is also set if count>=4 */
386    U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
387    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
388        /* error handling */
389        /* don't go beyond this sequence */
390        s=*ps;
391        while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
392            ++s;
393            --count;
394        }
395        c=U_SENTINEL;
396    }
397    *ps=s;
398    return c;
399}
400
401U_CAPI UChar* U_EXPORT2
402u_strFromUTF8WithSub(UChar *dest,
403              int32_t destCapacity,
404              int32_t *pDestLength,
405              const char* src,
406              int32_t srcLength,
407              UChar32 subchar, int32_t *pNumSubstitutions,
408              UErrorCode *pErrorCode){
409    UChar *pDest = dest;
410    UChar *pDestLimit = dest+destCapacity;
411    UChar32 ch;
412    int32_t reqLength = 0;
413    const uint8_t* pSrc = (const uint8_t*) src;
414    uint8_t t1, t2; /* trail bytes */
415    int32_t numSubstitutions;
416
417    /* args check */
418    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
419        return NULL;
420    }
421
422    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
423        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
424        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
425    ) {
426        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
427        return NULL;
428    }
429
430    if(pNumSubstitutions!=NULL) {
431        *pNumSubstitutions=0;
432    }
433    numSubstitutions=0;
434
435    /*
436     * Inline processing of UTF-8 byte sequences:
437     *
438     * Byte sequences for the most common characters are handled inline in
439     * the conversion loops. In order to reduce the path lengths for those
440     * characters, the tests are arranged in a kind of binary search.
441     * ASCII (<=0x7f) is checked first, followed by the dividing point
442     * between 2- and 3-byte sequences (0xe0).
443     * The 3-byte branch is tested first to speed up CJK text.
444     * The compiler should combine the subtractions for the two tests for 0xe0.
445     * Each branch then tests for the other end of its range.
446     */
447
448    if(srcLength < 0){
449        /*
450         * Transform a NUL-terminated string.
451         * The code explicitly checks for NULs only in the lead byte position.
452         * A NUL byte in the trail byte position fails the trail byte range check anyway.
453         */
454        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
455            if(ch <= 0x7f){
456                *pDest++=(UChar)ch;
457                ++pSrc;
458            } else {
459                if(ch > 0xe0) {
460                    if( /* handle U+1000..U+CFFF inline */
461                        ch <= 0xec &&
462                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
463                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
464                    ) {
465                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
466                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
467                        pSrc += 3;
468                        continue;
469                    }
470                } else if(ch < 0xe0) {
471                    if( /* handle U+0080..U+07FF inline */
472                        ch >= 0xc2 &&
473                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
474                    ) {
475                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
476                        pSrc += 2;
477                        continue;
478                    }
479                }
480
481                /* function call for "complicated" and error cases */
482                ++pSrc; /* continue after the lead byte */
483                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
484                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
485                    *pErrorCode = U_INVALID_CHAR_FOUND;
486                    return NULL;
487                } else if(ch<=0xFFFF) {
488                    *(pDest++)=(UChar)ch;
489                } else {
490                    *(pDest++)=U16_LEAD(ch);
491                    if(pDest<pDestLimit) {
492                        *(pDest++)=U16_TRAIL(ch);
493                    } else {
494                        reqLength++;
495                        break;
496                    }
497                }
498            }
499        }
500
501        /* Pre-flight the rest of the string. */
502        while((ch = *pSrc) != 0) {
503            if(ch <= 0x7f){
504                ++reqLength;
505                ++pSrc;
506            } else {
507                if(ch > 0xe0) {
508                    if( /* handle U+1000..U+CFFF inline */
509                        ch <= 0xec &&
510                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
511                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
512                    ) {
513                        ++reqLength;
514                        pSrc += 3;
515                        continue;
516                    }
517                } else if(ch < 0xe0) {
518                    if( /* handle U+0080..U+07FF inline */
519                        ch >= 0xc2 &&
520                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
521                    ) {
522                        ++reqLength;
523                        pSrc += 2;
524                        continue;
525                    }
526                }
527
528                /* function call for "complicated" and error cases */
529                ++pSrc; /* continue after the lead byte */
530                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
531                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
532                    *pErrorCode = U_INVALID_CHAR_FOUND;
533                    return NULL;
534                }
535                reqLength += U16_LENGTH(ch);
536            }
537        }
538    } else /* srcLength >= 0 */ {
539        const uint8_t *pSrcLimit = pSrc + srcLength;
540        int32_t count;
541
542        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
543        for(;;) {
544            /*
545             * Each iteration of the inner loop progresses by at most 3 UTF-8
546             * bytes and one UChar, for most characters.
547             * For supplementary code points (4 & 2), which are rare,
548             * there is an additional adjustment.
549             */
550            count = (int32_t)(pDestLimit - pDest);
551            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
552            if(count > srcLength) {
553                count = srcLength; /* min(remaining dest, remaining src/3) */
554            }
555            if(count < 3) {
556                /*
557                 * Too much overhead if we get near the end of the string,
558                 * continue with the next loop.
559                 */
560                break;
561            }
562
563            do {
564                ch = *pSrc;
565                if(ch <= 0x7f){
566                    *pDest++=(UChar)ch;
567                    ++pSrc;
568                } else {
569                    if(ch > 0xe0) {
570                        if( /* handle U+1000..U+CFFF inline */
571                            ch <= 0xec &&
572                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
573                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
574                        ) {
575                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
576                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
577                            pSrc += 3;
578                            continue;
579                        }
580                    } else if(ch < 0xe0) {
581                        if( /* handle U+0080..U+07FF inline */
582                            ch >= 0xc2 &&
583                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
584                        ) {
585                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
586                            pSrc += 2;
587                            continue;
588                        }
589                    }
590
591                    if(ch >= 0xf0 || subchar > 0xffff) {
592                        /*
593                         * We may read up to six bytes and write up to two UChars,
594                         * which we didn't account for with computing count,
595                         * so we adjust it here.
596                         */
597                        if(--count == 0) {
598                            break;
599                        }
600                    }
601
602                    /* function call for "complicated" and error cases */
603                    ++pSrc; /* continue after the lead byte */
604                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
605                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
606                        *pErrorCode = U_INVALID_CHAR_FOUND;
607                        return NULL;
608                    }else if(ch<=0xFFFF){
609                        *(pDest++)=(UChar)ch;
610                    }else{
611                        *(pDest++)=U16_LEAD(ch);
612                        *(pDest++)=U16_TRAIL(ch);
613                    }
614                }
615            } while(--count > 0);
616        }
617
618        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
619            ch = *pSrc;
620            if(ch <= 0x7f){
621                *pDest++=(UChar)ch;
622                ++pSrc;
623            } else {
624                if(ch > 0xe0) {
625                    if( /* handle U+1000..U+CFFF inline */
626                        ch <= 0xec &&
627                        ((pSrcLimit - pSrc) >= 3) &&
628                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
629                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
630                    ) {
631                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
632                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
633                        pSrc += 3;
634                        continue;
635                    }
636                } else if(ch < 0xe0) {
637                    if( /* handle U+0080..U+07FF inline */
638                        ch >= 0xc2 &&
639                        ((pSrcLimit - pSrc) >= 2) &&
640                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
641                    ) {
642                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
643                        pSrc += 2;
644                        continue;
645                    }
646                }
647
648                /* function call for "complicated" and error cases */
649                ++pSrc; /* continue after the lead byte */
650                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
651                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
652                    *pErrorCode = U_INVALID_CHAR_FOUND;
653                    return NULL;
654                }else if(ch<=0xFFFF){
655                    *(pDest++)=(UChar)ch;
656                }else{
657                    *(pDest++)=U16_LEAD(ch);
658                    if(pDest<pDestLimit){
659                        *(pDest++)=U16_TRAIL(ch);
660                    }else{
661                        reqLength++;
662                        break;
663                    }
664                }
665            }
666        }
667        /* do not fill the dest buffer just count the UChars needed */
668        while(pSrc < pSrcLimit){
669            ch = *pSrc;
670            if(ch <= 0x7f){
671                reqLength++;
672                ++pSrc;
673            } else {
674                if(ch > 0xe0) {
675                    if( /* handle U+1000..U+CFFF inline */
676                        ch <= 0xec &&
677                        ((pSrcLimit - pSrc) >= 3) &&
678                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
679                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
680                    ) {
681                        reqLength++;
682                        pSrc += 3;
683                        continue;
684                    }
685                } else if(ch < 0xe0) {
686                    if( /* handle U+0080..U+07FF inline */
687                        ch >= 0xc2 &&
688                        ((pSrcLimit - pSrc) >= 2) &&
689                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
690                    ) {
691                        reqLength++;
692                        pSrc += 2;
693                        continue;
694                    }
695                }
696
697                /* function call for "complicated" and error cases */
698                ++pSrc; /* continue after the lead byte */
699                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
700                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
701                    *pErrorCode = U_INVALID_CHAR_FOUND;
702                    return NULL;
703                }
704                reqLength+=U16_LENGTH(ch);
705            }
706        }
707    }
708
709    reqLength+=(int32_t)(pDest - dest);
710
711    if(pNumSubstitutions!=NULL) {
712        *pNumSubstitutions=numSubstitutions;
713    }
714
715    if(pDestLength){
716        *pDestLength = reqLength;
717    }
718
719    /* Terminate the buffer */
720    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
721
722    return dest;
723}
724
725U_CAPI UChar* U_EXPORT2
726u_strFromUTF8(UChar *dest,
727              int32_t destCapacity,
728              int32_t *pDestLength,
729              const char* src,
730              int32_t srcLength,
731              UErrorCode *pErrorCode){
732    return u_strFromUTF8WithSub(
733            dest, destCapacity, pDestLength,
734            src, srcLength,
735            U_SENTINEL, NULL,
736            pErrorCode);
737}
738
739U_CAPI UChar * U_EXPORT2
740u_strFromUTF8Lenient(UChar *dest,
741                     int32_t destCapacity,
742                     int32_t *pDestLength,
743                     const char *src,
744                     int32_t srcLength,
745                     UErrorCode *pErrorCode) {
746    UChar *pDest = dest;
747    UChar32 ch;
748    int32_t reqLength = 0;
749    uint8_t* pSrc = (uint8_t*) src;
750
751    /* args check */
752    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
753        return NULL;
754    }
755
756    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
757        (destCapacity<0) || (dest == NULL && destCapacity > 0)
758    ) {
759        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
760        return NULL;
761    }
762
763    if(srcLength < 0) {
764        /* Transform a NUL-terminated string. */
765        UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
766        uint8_t t1, t2, t3; /* trail bytes */
767
768        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
769            if(ch < 0xc0) {
770                /*
771                 * ASCII, or a trail byte in lead position which is treated like
772                 * a single-byte sequence for better character boundary
773                 * resynchronization after illegal sequences.
774                 */
775                *pDest++=(UChar)ch;
776                ++pSrc;
777                continue;
778            } else if(ch < 0xe0) { /* U+0080..U+07FF */
779                if((t1 = pSrc[1]) != 0) {
780                    /* 0x3080 = (0xc0 << 6) + 0x80 */
781                    *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
782                    pSrc += 2;
783                    continue;
784                }
785            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
786                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
787                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
788                    /* 0x2080 = (0x80 << 6) + 0x80 */
789                    *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
790                    pSrc += 3;
791                    continue;
792                }
793            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
794                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
795                    pSrc += 4;
796                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
797                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
798                    *(pDest++) = U16_LEAD(ch);
799                    if(pDest < pDestLimit) {
800                        *(pDest++) = U16_TRAIL(ch);
801                    } else {
802                        reqLength = 1;
803                        break;
804                    }
805                    continue;
806                }
807            }
808
809            /* truncated character at the end */
810            *pDest++ = 0xfffd;
811            while(*++pSrc != 0) {}
812            break;
813        }
814
815        /* Pre-flight the rest of the string. */
816        while((ch = *pSrc) != 0) {
817            if(ch < 0xc0) {
818                /*
819                 * ASCII, or a trail byte in lead position which is treated like
820                 * a single-byte sequence for better character boundary
821                 * resynchronization after illegal sequences.
822                 */
823                ++reqLength;
824                ++pSrc;
825                continue;
826            } else if(ch < 0xe0) { /* U+0080..U+07FF */
827                if(pSrc[1] != 0) {
828                    ++reqLength;
829                    pSrc += 2;
830                    continue;
831                }
832            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
833                if(pSrc[1] != 0 && pSrc[2] != 0) {
834                    ++reqLength;
835                    pSrc += 3;
836                    continue;
837                }
838            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
839                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
840                    reqLength += 2;
841                    pSrc += 4;
842                    continue;
843                }
844            }
845
846            /* truncated character at the end */
847            ++reqLength;
848            break;
849        }
850    } else /* srcLength >= 0 */ {
851      const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
852
853        /*
854         * This function requires that if srcLength is given, then it must be
855         * destCapatity >= srcLength so that we need not check for
856         * destination buffer overflow in the loop.
857         */
858        if(destCapacity < srcLength) {
859            if(pDestLength != NULL) {
860                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
861            }
862            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
863            return NULL;
864        }
865
866        if((pSrcLimit - pSrc) >= 4) {
867            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
868
869            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
870            do {
871                ch = *pSrc++;
872                if(ch < 0xc0) {
873                    /*
874                     * ASCII, or a trail byte in lead position which is treated like
875                     * a single-byte sequence for better character boundary
876                     * resynchronization after illegal sequences.
877                     */
878                    *pDest++=(UChar)ch;
879                } else if(ch < 0xe0) { /* U+0080..U+07FF */
880                    /* 0x3080 = (0xc0 << 6) + 0x80 */
881                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
882                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
883                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
884                    /* 0x2080 = (0x80 << 6) + 0x80 */
885                    ch = (ch << 12) + (*pSrc++ << 6);
886                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
887                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
888                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
889                    ch = (ch << 18) + (*pSrc++ << 12);
890                    ch += *pSrc++ << 6;
891                    ch += *pSrc++ - 0x3c82080;
892                    *(pDest++) = U16_LEAD(ch);
893                    *(pDest++) = U16_TRAIL(ch);
894                }
895            } while(pSrc < pSrcLimit);
896
897            pSrcLimit += 3; /* restore original pSrcLimit */
898        }
899
900        while(pSrc < pSrcLimit) {
901            ch = *pSrc++;
902            if(ch < 0xc0) {
903                /*
904                 * ASCII, or a trail byte in lead position which is treated like
905                 * a single-byte sequence for better character boundary
906                 * resynchronization after illegal sequences.
907                 */
908                *pDest++=(UChar)ch;
909                continue;
910            } else if(ch < 0xe0) { /* U+0080..U+07FF */
911                if(pSrc < pSrcLimit) {
912                    /* 0x3080 = (0xc0 << 6) + 0x80 */
913                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
914                    continue;
915                }
916            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
917                if((pSrcLimit - pSrc) >= 2) {
918                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
919                    /* 0x2080 = (0x80 << 6) + 0x80 */
920                    ch = (ch << 12) + (*pSrc++ << 6);
921                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
922                    pSrc += 3;
923                    continue;
924                }
925            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
926                if((pSrcLimit - pSrc) >= 3) {
927                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
928                    ch = (ch << 18) + (*pSrc++ << 12);
929                    ch += *pSrc++ << 6;
930                    ch += *pSrc++ - 0x3c82080;
931                    *(pDest++) = U16_LEAD(ch);
932                    *(pDest++) = U16_TRAIL(ch);
933                    pSrc += 4;
934                    continue;
935                }
936            }
937
938            /* truncated character at the end */
939            *pDest++ = 0xfffd;
940            break;
941        }
942    }
943
944    reqLength+=(int32_t)(pDest - dest);
945
946    if(pDestLength){
947        *pDestLength = reqLength;
948    }
949
950    /* Terminate the buffer */
951    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
952
953    return dest;
954}
955
956static inline uint8_t *
957_appendUTF8(uint8_t *pDest, UChar32 c) {
958    /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
959    if((c)<=0x7f) {
960        *pDest++=(uint8_t)c;
961    } else if(c<=0x7ff) {
962        *pDest++=(uint8_t)((c>>6)|0xc0);
963        *pDest++=(uint8_t)((c&0x3f)|0x80);
964    } else if(c<=0xffff) {
965        *pDest++=(uint8_t)((c>>12)|0xe0);
966        *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
967        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
968    } else /* if((uint32_t)(c)<=0x10ffff) */ {
969        *pDest++=(uint8_t)(((c)>>18)|0xf0);
970        *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
971        *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
972        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
973    }
974    return pDest;
975}
976
977
978U_CAPI char* U_EXPORT2
979u_strToUTF8WithSub(char *dest,
980            int32_t destCapacity,
981            int32_t *pDestLength,
982            const UChar *pSrc,
983            int32_t srcLength,
984            UChar32 subchar, int32_t *pNumSubstitutions,
985            UErrorCode *pErrorCode){
986    int32_t reqLength=0;
987    uint32_t ch=0,ch2=0;
988    uint8_t *pDest = (uint8_t *)dest;
989    uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
990    int32_t numSubstitutions;
991
992    /* args check */
993    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
994        return NULL;
995    }
996
997    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
998        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
999        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1000    ) {
1001        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1002        return NULL;
1003    }
1004
1005    if(pNumSubstitutions!=NULL) {
1006        *pNumSubstitutions=0;
1007    }
1008    numSubstitutions=0;
1009
1010    if(srcLength==-1) {
1011        while((ch=*pSrc)!=0) {
1012            ++pSrc;
1013            if(ch <= 0x7f) {
1014                if(pDest<pDestLimit) {
1015                    *pDest++ = (uint8_t)ch;
1016                } else {
1017                    reqLength = 1;
1018                    break;
1019                }
1020            } else if(ch <= 0x7ff) {
1021                if((pDestLimit - pDest) >= 2) {
1022                    *pDest++=(uint8_t)((ch>>6)|0xc0);
1023                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1024                } else {
1025                    reqLength = 2;
1026                    break;
1027                }
1028            } else if(ch <= 0xd7ff || ch >= 0xe000) {
1029                if((pDestLimit - pDest) >= 3) {
1030                    *pDest++=(uint8_t)((ch>>12)|0xe0);
1031                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1032                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1033                } else {
1034                    reqLength = 3;
1035                    break;
1036                }
1037            } else /* ch is a surrogate */ {
1038                int32_t length;
1039
1040                /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1041                if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1042                    ++pSrc;
1043                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1044                } else if(subchar>=0) {
1045                    ch=subchar;
1046                    ++numSubstitutions;
1047                } else {
1048                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1049                    *pErrorCode = U_INVALID_CHAR_FOUND;
1050                    return NULL;
1051                }
1052
1053                length = U8_LENGTH(ch);
1054                if((pDestLimit - pDest) >= length) {
1055                    /* convert and append*/
1056                    pDest=_appendUTF8(pDest, ch);
1057                } else {
1058                    reqLength = length;
1059                    break;
1060                }
1061            }
1062        }
1063        while((ch=*pSrc++)!=0) {
1064            if(ch<=0x7f) {
1065                ++reqLength;
1066            } else if(ch<=0x7ff) {
1067                reqLength+=2;
1068            } else if(!U16_IS_SURROGATE(ch)) {
1069                reqLength+=3;
1070            } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1071                ++pSrc;
1072                reqLength+=4;
1073            } else if(subchar>=0) {
1074                reqLength+=U8_LENGTH(subchar);
1075                ++numSubstitutions;
1076            } else {
1077                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1078                *pErrorCode = U_INVALID_CHAR_FOUND;
1079                return NULL;
1080            }
1081        }
1082    } else {
1083        const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1084        int32_t count;
1085
1086        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1087        for(;;) {
1088            /*
1089             * Each iteration of the inner loop progresses by at most 3 UTF-8
1090             * bytes and one UChar, for most characters.
1091             * For supplementary code points (4 & 2), which are rare,
1092             * there is an additional adjustment.
1093             */
1094            count = (int32_t)((pDestLimit - pDest) / 3);
1095            srcLength = (int32_t)(pSrcLimit - pSrc);
1096            if(count > srcLength) {
1097                count = srcLength; /* min(remaining dest/3, remaining src) */
1098            }
1099            if(count < 3) {
1100                /*
1101                 * Too much overhead if we get near the end of the string,
1102                 * continue with the next loop.
1103                 */
1104                break;
1105            }
1106            do {
1107                ch=*pSrc++;
1108                if(ch <= 0x7f) {
1109                    *pDest++ = (uint8_t)ch;
1110                } else if(ch <= 0x7ff) {
1111                    *pDest++=(uint8_t)((ch>>6)|0xc0);
1112                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1113                } else if(ch <= 0xd7ff || ch >= 0xe000) {
1114                    *pDest++=(uint8_t)((ch>>12)|0xe0);
1115                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1116                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117                } else /* ch is a surrogate */ {
1118                    /*
1119                     * We will read two UChars and probably output four bytes,
1120                     * which we didn't account for with computing count,
1121                     * so we adjust it here.
1122                     */
1123                    if(--count == 0) {
1124                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1125                        break;  /* recompute count */
1126                    }
1127
1128                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1129                        ++pSrc;
1130                        ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1131
1132                        /* writing 4 bytes per 2 UChars is ok */
1133                        *pDest++=(uint8_t)((ch>>18)|0xf0);
1134                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1135                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1136                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
1137                    } else  {
1138                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1139                        if(subchar>=0) {
1140                            ch=subchar;
1141                            ++numSubstitutions;
1142                        } else {
1143                            *pErrorCode = U_INVALID_CHAR_FOUND;
1144                            return NULL;
1145                        }
1146
1147                        /* convert and append*/
1148                        pDest=_appendUTF8(pDest, ch);
1149                    }
1150                }
1151            } while(--count > 0);
1152        }
1153
1154        while(pSrc<pSrcLimit) {
1155            ch=*pSrc++;
1156            if(ch <= 0x7f) {
1157                if(pDest<pDestLimit) {
1158                    *pDest++ = (uint8_t)ch;
1159                } else {
1160                    reqLength = 1;
1161                    break;
1162                }
1163            } else if(ch <= 0x7ff) {
1164                if((pDestLimit - pDest) >= 2) {
1165                    *pDest++=(uint8_t)((ch>>6)|0xc0);
1166                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1167                } else {
1168                    reqLength = 2;
1169                    break;
1170                }
1171            } else if(ch <= 0xd7ff || ch >= 0xe000) {
1172                if((pDestLimit - pDest) >= 3) {
1173                    *pDest++=(uint8_t)((ch>>12)|0xe0);
1174                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1175                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1176                } else {
1177                    reqLength = 3;
1178                    break;
1179                }
1180            } else /* ch is a surrogate */ {
1181                int32_t length;
1182
1183                if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1184                    ++pSrc;
1185                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1186                } else if(subchar>=0) {
1187                    ch=subchar;
1188                    ++numSubstitutions;
1189                } else {
1190                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1191                    *pErrorCode = U_INVALID_CHAR_FOUND;
1192                    return NULL;
1193                }
1194
1195                length = U8_LENGTH(ch);
1196                if((pDestLimit - pDest) >= length) {
1197                    /* convert and append*/
1198                    pDest=_appendUTF8(pDest, ch);
1199                } else {
1200                    reqLength = length;
1201                    break;
1202                }
1203            }
1204        }
1205        while(pSrc<pSrcLimit) {
1206            ch=*pSrc++;
1207            if(ch<=0x7f) {
1208                ++reqLength;
1209            } else if(ch<=0x7ff) {
1210                reqLength+=2;
1211            } else if(!U16_IS_SURROGATE(ch)) {
1212                reqLength+=3;
1213            } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1214                ++pSrc;
1215                reqLength+=4;
1216            } else if(subchar>=0) {
1217                reqLength+=U8_LENGTH(subchar);
1218                ++numSubstitutions;
1219            } else {
1220                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1221                *pErrorCode = U_INVALID_CHAR_FOUND;
1222                return NULL;
1223            }
1224        }
1225    }
1226
1227    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1228
1229    if(pNumSubstitutions!=NULL) {
1230        *pNumSubstitutions=numSubstitutions;
1231    }
1232
1233    if(pDestLength){
1234        *pDestLength = reqLength;
1235    }
1236
1237    /* Terminate the buffer */
1238    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1239    return dest;
1240}
1241
1242U_CAPI char* U_EXPORT2
1243u_strToUTF8(char *dest,
1244            int32_t destCapacity,
1245            int32_t *pDestLength,
1246            const UChar *pSrc,
1247            int32_t srcLength,
1248            UErrorCode *pErrorCode){
1249    return u_strToUTF8WithSub(
1250            dest, destCapacity, pDestLength,
1251            pSrc, srcLength,
1252            U_SENTINEL, NULL,
1253            pErrorCode);
1254}
1255
1256U_CAPI UChar* U_EXPORT2
1257u_strFromJavaModifiedUTF8WithSub(
1258        UChar *dest,
1259        int32_t destCapacity,
1260        int32_t *pDestLength,
1261        const char *src,
1262        int32_t srcLength,
1263        UChar32 subchar, int32_t *pNumSubstitutions,
1264        UErrorCode *pErrorCode) {
1265    UChar *pDest = dest;
1266    UChar *pDestLimit = dest+destCapacity;
1267    UChar32 ch;
1268    int32_t reqLength = 0;
1269    const uint8_t* pSrc = (const uint8_t*) src;
1270    const uint8_t *pSrcLimit;
1271    int32_t count;
1272    uint8_t t1, t2; /* trail bytes */
1273    int32_t numSubstitutions;
1274
1275    /* args check */
1276    if(U_FAILURE(*pErrorCode)){
1277        return NULL;
1278    }
1279    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1280        (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1281        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1282    ) {
1283        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1284        return NULL;
1285    }
1286
1287    if(pNumSubstitutions!=NULL) {
1288        *pNumSubstitutions=0;
1289    }
1290    numSubstitutions=0;
1291
1292    if(srcLength < 0) {
1293        /*
1294         * Transform a NUL-terminated ASCII string.
1295         * Handle non-ASCII strings with slower code.
1296         */
1297        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1298            *pDest++=(UChar)ch;
1299            ++pSrc;
1300        }
1301        if(ch == 0) {
1302            reqLength=(int32_t)(pDest - dest);
1303            if(pDestLength) {
1304                *pDestLength = reqLength;
1305            }
1306
1307            /* Terminate the buffer */
1308            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1309            return dest;
1310        }
1311        srcLength = uprv_strlen((const char *)pSrc);
1312    }
1313
1314    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1315    pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1316    for(;;) {
1317        count = (int32_t)(pDestLimit - pDest);
1318        srcLength = (int32_t)(pSrcLimit - pSrc);
1319        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1320            /* fast ASCII loop */
1321            const uint8_t *prevSrc = pSrc;
1322            int32_t delta;
1323            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1324                *pDest++=(UChar)ch;
1325                ++pSrc;
1326            }
1327            delta = (int32_t)(pSrc - prevSrc);
1328            count -= delta;
1329            srcLength -= delta;
1330        }
1331        /*
1332         * Each iteration of the inner loop progresses by at most 3 UTF-8
1333         * bytes and one UChar.
1334         */
1335        srcLength /= 3;
1336        if(count > srcLength) {
1337            count = srcLength; /* min(remaining dest, remaining src/3) */
1338        }
1339        if(count < 3) {
1340            /*
1341             * Too much overhead if we get near the end of the string,
1342             * continue with the next loop.
1343             */
1344            break;
1345        }
1346        do {
1347            ch = *pSrc;
1348            if(ch <= 0x7f){
1349                *pDest++=(UChar)ch;
1350                ++pSrc;
1351            } else {
1352                if(ch >= 0xe0) {
1353                    if( /* handle U+0000..U+FFFF inline */
1354                        ch <= 0xef &&
1355                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1356                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1357                    ) {
1358                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1359                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1360                        pSrc += 3;
1361                        continue;
1362                    }
1363                } else {
1364                    if( /* handle U+0000..U+07FF inline */
1365                        ch >= 0xc0 &&
1366                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1367                    ) {
1368                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1369                        pSrc += 2;
1370                        continue;
1371                    }
1372                }
1373
1374                if(subchar < 0) {
1375                    *pErrorCode = U_INVALID_CHAR_FOUND;
1376                    return NULL;
1377                } else if(subchar > 0xffff && --count == 0) {
1378                    /*
1379                     * We need to write two UChars, adjusted count for that,
1380                     * and ran out of space.
1381                     */
1382                    break;
1383                } else {
1384                    /* function call for error cases */
1385                    ++pSrc; /* continue after the lead byte */
1386                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1387                    ++numSubstitutions;
1388                    if(subchar<=0xFFFF) {
1389                        *(pDest++)=(UChar)subchar;
1390                    } else {
1391                        *(pDest++)=U16_LEAD(subchar);
1392                        *(pDest++)=U16_TRAIL(subchar);
1393                    }
1394                }
1395            }
1396        } while(--count > 0);
1397    }
1398
1399    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1400        ch = *pSrc;
1401        if(ch <= 0x7f){
1402            *pDest++=(UChar)ch;
1403            ++pSrc;
1404        } else {
1405            if(ch >= 0xe0) {
1406                if( /* handle U+0000..U+FFFF inline */
1407                    ch <= 0xef &&
1408                    ((pSrcLimit - pSrc) >= 3) &&
1409                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1410                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1411                ) {
1412                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1413                    *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1414                    pSrc += 3;
1415                    continue;
1416                }
1417            } else {
1418                if( /* handle U+0000..U+07FF inline */
1419                    ch >= 0xc0 &&
1420                    ((pSrcLimit - pSrc) >= 2) &&
1421                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1422                ) {
1423                    *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1424                    pSrc += 2;
1425                    continue;
1426                }
1427            }
1428
1429            if(subchar < 0) {
1430                *pErrorCode = U_INVALID_CHAR_FOUND;
1431                return NULL;
1432            } else {
1433                /* function call for error cases */
1434                ++pSrc; /* continue after the lead byte */
1435                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1436                ++numSubstitutions;
1437                if(subchar<=0xFFFF) {
1438                    *(pDest++)=(UChar)subchar;
1439                } else {
1440                    *(pDest++)=U16_LEAD(subchar);
1441                    if(pDest<pDestLimit) {
1442                        *(pDest++)=U16_TRAIL(subchar);
1443                    } else {
1444                        reqLength++;
1445                        break;
1446                    }
1447                }
1448            }
1449        }
1450    }
1451
1452    /* do not fill the dest buffer just count the UChars needed */
1453    while(pSrc < pSrcLimit){
1454        ch = *pSrc;
1455        if(ch <= 0x7f) {
1456            reqLength++;
1457            ++pSrc;
1458        } else {
1459            if(ch >= 0xe0) {
1460                if( /* handle U+0000..U+FFFF inline */
1461                    ch <= 0xef &&
1462                    ((pSrcLimit - pSrc) >= 3) &&
1463                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1464                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1465                ) {
1466                    reqLength++;
1467                    pSrc += 3;
1468                    continue;
1469                }
1470            } else {
1471                if( /* handle U+0000..U+07FF inline */
1472                    ch >= 0xc0 &&
1473                    ((pSrcLimit - pSrc) >= 2) &&
1474                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1475                ) {
1476                    reqLength++;
1477                    pSrc += 2;
1478                    continue;
1479                }
1480            }
1481
1482            if(subchar < 0) {
1483                *pErrorCode = U_INVALID_CHAR_FOUND;
1484                return NULL;
1485            } else {
1486                /* function call for error cases */
1487                ++pSrc; /* continue after the lead byte */
1488                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1489                ++numSubstitutions;
1490                reqLength+=U16_LENGTH(ch);
1491            }
1492        }
1493    }
1494
1495    if(pNumSubstitutions!=NULL) {
1496        *pNumSubstitutions=numSubstitutions;
1497    }
1498
1499    reqLength+=(int32_t)(pDest - dest);
1500    if(pDestLength) {
1501        *pDestLength = reqLength;
1502    }
1503
1504    /* Terminate the buffer */
1505    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1506    return dest;
1507}
1508
1509U_CAPI char* U_EXPORT2
1510u_strToJavaModifiedUTF8(
1511        char *dest,
1512        int32_t destCapacity,
1513        int32_t *pDestLength,
1514        const UChar *src,
1515        int32_t srcLength,
1516        UErrorCode *pErrorCode) {
1517    int32_t reqLength=0;
1518    uint32_t ch=0;
1519    uint8_t *pDest = (uint8_t *)dest;
1520    uint8_t *pDestLimit = pDest + destCapacity;
1521    const UChar *pSrcLimit;
1522    int32_t count;
1523
1524    /* args check */
1525    if(U_FAILURE(*pErrorCode)){
1526        return NULL;
1527    }
1528    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1529        (dest==NULL && destCapacity!=0) || destCapacity<0
1530    ) {
1531        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1532        return NULL;
1533    }
1534
1535    if(srcLength==-1) {
1536        /* Convert NUL-terminated ASCII, then find the string length. */
1537        while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1538            *pDest++ = (uint8_t)ch;
1539            ++src;
1540        }
1541        if(ch == 0) {
1542            reqLength=(int32_t)(pDest - (uint8_t *)dest);
1543            if(pDestLength) {
1544                *pDestLength = reqLength;
1545            }
1546
1547            /* Terminate the buffer */
1548            u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1549            return dest;
1550        }
1551        srcLength = u_strlen(src);
1552    }
1553
1554    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1555    pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1556    for(;;) {
1557        count = (int32_t)(pDestLimit - pDest);
1558        srcLength = (int32_t)(pSrcLimit - src);
1559        if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1560            /* fast ASCII loop */
1561            const UChar *prevSrc = src;
1562            int32_t delta;
1563            while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1564                *pDest++=(uint8_t)ch;
1565                ++src;
1566            }
1567            delta = (int32_t)(src - prevSrc);
1568            count -= delta;
1569            srcLength -= delta;
1570        }
1571        /*
1572         * Each iteration of the inner loop progresses by at most 3 UTF-8
1573         * bytes and one UChar.
1574         */
1575        count /= 3;
1576        if(count > srcLength) {
1577            count = srcLength; /* min(remaining dest/3, remaining src) */
1578        }
1579        if(count < 3) {
1580            /*
1581             * Too much overhead if we get near the end of the string,
1582             * continue with the next loop.
1583             */
1584            break;
1585        }
1586        do {
1587            ch=*src++;
1588            if(ch <= 0x7f && ch != 0) {
1589                *pDest++ = (uint8_t)ch;
1590            } else if(ch <= 0x7ff) {
1591                *pDest++=(uint8_t)((ch>>6)|0xc0);
1592                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1593            } else {
1594                *pDest++=(uint8_t)((ch>>12)|0xe0);
1595                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1596                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597            }
1598        } while(--count > 0);
1599    }
1600
1601    while(src<pSrcLimit) {
1602        ch=*src++;
1603        if(ch <= 0x7f && ch != 0) {
1604            if(pDest<pDestLimit) {
1605                *pDest++ = (uint8_t)ch;
1606            } else {
1607                reqLength = 1;
1608                break;
1609            }
1610        } else if(ch <= 0x7ff) {
1611            if((pDestLimit - pDest) >= 2) {
1612                *pDest++=(uint8_t)((ch>>6)|0xc0);
1613                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1614            } else {
1615                reqLength = 2;
1616                break;
1617            }
1618        } else {
1619            if((pDestLimit - pDest) >= 3) {
1620                *pDest++=(uint8_t)((ch>>12)|0xe0);
1621                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1622                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1623            } else {
1624                reqLength = 3;
1625                break;
1626            }
1627        }
1628    }
1629    while(src<pSrcLimit) {
1630        ch=*src++;
1631        if(ch <= 0x7f && ch != 0) {
1632            ++reqLength;
1633        } else if(ch<=0x7ff) {
1634            reqLength+=2;
1635        } else {
1636            reqLength+=3;
1637        }
1638    }
1639
1640    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1641    if(pDestLength){
1642        *pDestLength = reqLength;
1643    }
1644
1645    /* Terminate the buffer */
1646    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1647    return dest;
1648}
1649