1/*
2******************************************************************************
3*
4*   Copyright (C) 2001-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*
9* File ustrtrns.c
10*
11* Modification History:
12*
13*   Date        Name        Description
14*   9/10/2001    Ram    Creation.
15******************************************************************************
16*/
17
18/*******************************************************************************
19 *
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
22 *
23 *******************************************************************************
24 */
25
26
27#include "unicode/putil.h"
28#include "unicode/ustring.h"
29#include "cstring.h"
30#include "cmemory.h"
31#include "ustr_imp.h"
32
33U_CAPI UChar* U_EXPORT2
34u_strFromUTF32WithSub(UChar *dest,
35               int32_t destCapacity,
36               int32_t *pDestLength,
37               const UChar32 *src,
38               int32_t srcLength,
39               UChar32 subchar, int32_t *pNumSubstitutions,
40               UErrorCode *pErrorCode) {
41    const UChar32 *srcLimit;
42    UChar32 ch;
43    UChar *destLimit;
44    UChar *pDest;
45    int32_t reqLength;
46    int32_t numSubstitutions;
47
48    /* args check */
49    if(U_FAILURE(*pErrorCode)){
50        return NULL;
51    }
52    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
53        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
54        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
55    ) {
56        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
57        return NULL;
58    }
59
60    if(pNumSubstitutions != NULL) {
61        *pNumSubstitutions = 0;
62    }
63
64    pDest = dest;
65    destLimit = dest + destCapacity;
66    reqLength = 0;
67    numSubstitutions = 0;
68
69    if(srcLength < 0) {
70        /* simple loop for conversion of a NUL-terminated BMP string */
71        while((ch=*src) != 0 &&
72              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
73            ++src;
74            if(pDest < destLimit) {
75                *pDest++ = (UChar)ch;
76            } else {
77                ++reqLength;
78            }
79        }
80        srcLimit = src;
81        if(ch != 0) {
82            /* "complicated" case, find the end of the remaining string */
83            while(*++srcLimit != 0) {}
84        }
85    } else {
86        srcLimit = src + srcLength;
87    }
88
89    /* convert with length */
90    while(src < srcLimit) {
91        ch = *src++;
92        do {
93            /* usually "loops" once; twice only for writing subchar */
94            if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
95                if(pDest < destLimit) {
96                    *pDest++ = (UChar)ch;
97                } else {
98                    ++reqLength;
99                }
100                break;
101            } else if(0x10000 <= ch && ch <= 0x10ffff) {
102                if((pDest + 2) <= destLimit) {
103                    *pDest++ = U16_LEAD(ch);
104                    *pDest++ = U16_TRAIL(ch);
105                } else {
106                    reqLength += 2;
107                }
108                break;
109            } else if((ch = subchar) < 0) {
110                /* surrogate code point, or not a Unicode code point at all */
111                *pErrorCode = U_INVALID_CHAR_FOUND;
112                return NULL;
113            } else {
114                ++numSubstitutions;
115            }
116        } while(TRUE);
117    }
118
119    reqLength += (int32_t)(pDest - dest);
120    if(pDestLength) {
121        *pDestLength = reqLength;
122    }
123    if(pNumSubstitutions != NULL) {
124        *pNumSubstitutions = numSubstitutions;
125    }
126
127    /* Terminate the buffer */
128    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
129
130    return dest;
131}
132
133U_CAPI UChar* U_EXPORT2
134u_strFromUTF32(UChar *dest,
135               int32_t destCapacity,
136               int32_t *pDestLength,
137               const UChar32 *src,
138               int32_t srcLength,
139               UErrorCode *pErrorCode) {
140    return u_strFromUTF32WithSub(
141            dest, destCapacity, pDestLength,
142            src, srcLength,
143            U_SENTINEL, NULL,
144            pErrorCode);
145}
146
147U_CAPI UChar32* U_EXPORT2
148u_strToUTF32WithSub(UChar32 *dest,
149             int32_t destCapacity,
150             int32_t *pDestLength,
151             const UChar *src,
152             int32_t srcLength,
153             UChar32 subchar, int32_t *pNumSubstitutions,
154             UErrorCode *pErrorCode) {
155    const UChar *srcLimit;
156    UChar32 ch;
157    UChar ch2;
158    UChar32 *destLimit;
159    UChar32 *pDest;
160    int32_t reqLength;
161    int32_t numSubstitutions;
162
163    /* args check */
164    if(U_FAILURE(*pErrorCode)){
165        return NULL;
166    }
167    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
168        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
169        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
170    ) {
171        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
172        return NULL;
173    }
174
175    if(pNumSubstitutions != NULL) {
176        *pNumSubstitutions = 0;
177    }
178
179    pDest = dest;
180    destLimit = dest + destCapacity;
181    reqLength = 0;
182    numSubstitutions = 0;
183
184    if(srcLength < 0) {
185        /* simple loop for conversion of a NUL-terminated BMP string */
186        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
187            ++src;
188            if(pDest < destLimit) {
189                *pDest++ = ch;
190            } else {
191                ++reqLength;
192            }
193        }
194        srcLimit = src;
195        if(ch != 0) {
196            /* "complicated" case, find the end of the remaining string */
197            while(*++srcLimit != 0) {}
198        }
199    } else {
200        srcLimit = src + srcLength;
201    }
202
203    /* convert with length */
204    while(src < srcLimit) {
205        ch = *src++;
206        if(!U16_IS_SURROGATE(ch)) {
207            /* write or count ch below */
208        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
209            ++src;
210            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
211        } else if((ch = subchar) < 0) {
212            /* unpaired surrogate */
213            *pErrorCode = U_INVALID_CHAR_FOUND;
214            return NULL;
215        } else {
216            ++numSubstitutions;
217        }
218        if(pDest < destLimit) {
219            *pDest++ = ch;
220        } else {
221            ++reqLength;
222        }
223    }
224
225    reqLength += (int32_t)(pDest - dest);
226    if(pDestLength) {
227        *pDestLength = reqLength;
228    }
229    if(pNumSubstitutions != NULL) {
230        *pNumSubstitutions = numSubstitutions;
231    }
232
233    /* Terminate the buffer */
234    u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
235
236    return dest;
237}
238
239U_CAPI UChar32* U_EXPORT2
240u_strToUTF32(UChar32 *dest,
241             int32_t destCapacity,
242             int32_t *pDestLength,
243             const UChar *src,
244             int32_t srcLength,
245             UErrorCode *pErrorCode) {
246    return u_strToUTF32WithSub(
247            dest, destCapacity, pDestLength,
248            src, srcLength,
249            U_SENTINEL, NULL,
250            pErrorCode);
251}
252
253/* for utf8_nextCharSafeBodyTerminated() */
254static const UChar32
255utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
256
257/*
258 * Version of utf8_nextCharSafeBody() with the following differences:
259 * - checks for NUL termination instead of length
260 * - works with pointers instead of indexes
261 * - always strict (strict==-1)
262 *
263 * *ps points to after the lead byte and will be moved to after the last trail byte.
264 * c is the lead byte.
265 * @return the code point, or U_SENTINEL
266 */
267static UChar32
268utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
269    const uint8_t *s=*ps;
270    uint8_t trail, illegal=0;
271    uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
272    UTF8_MASK_LEAD_BYTE((c), count);
273    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
274    switch(count) {
275    /* each branch falls through to the next one */
276    case 5:
277    case 4:
278        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
279        illegal=1;
280        break;
281    case 3:
282        trail=(uint8_t)(*s++ - 0x80);
283        c=(c<<6)|trail;
284        if(trail>0x3f || c>=0x110) {
285            /* not a trail byte, or code point>0x10ffff (outside Unicode) */
286            illegal=1;
287            break;
288        }
289    case 2:
290        trail=(uint8_t)(*s++ - 0x80);
291        if(trail>0x3f) {
292            /* not a trail byte */
293            illegal=1;
294            break;
295        }
296        c=(c<<6)|trail;
297    case 1:
298        trail=(uint8_t)(*s++ - 0x80);
299        if(trail>0x3f) {
300            /* not a trail byte */
301            illegal=1;
302        }
303        c=(c<<6)|trail;
304        break;
305    case 0:
306        return U_SENTINEL;
307    /* no default branch to optimize switch()  - all values are covered */
308    }
309
310    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
311    /* illegal is also set if count>=4 */
312    if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
313        /* error handling */
314        /* don't go beyond this sequence */
315        s=*ps;
316        while(count>0 && UTF8_IS_TRAIL(*s)) {
317            ++s;
318            --count;
319        }
320        c=U_SENTINEL;
321    }
322    *ps=s;
323    return c;
324}
325
326/*
327 * Version of utf8_nextCharSafeBody() with the following differences:
328 * - works with pointers instead of indexes
329 * - always strict (strict==-1)
330 *
331 * *ps points to after the lead byte and will be moved to after the last trail byte.
332 * c is the lead byte.
333 * @return the code point, or U_SENTINEL
334 */
335static UChar32
336utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
337    const uint8_t *s=*ps;
338    uint8_t trail, illegal=0;
339    uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
340    if((limit-s)>=count) {
341        UTF8_MASK_LEAD_BYTE((c), count);
342        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
343        switch(count) {
344        /* each branch falls through to the next one */
345        case 5:
346        case 4:
347            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
348            illegal=1;
349            break;
350        case 3:
351            trail=*s++;
352            c=(c<<6)|(trail&0x3f);
353            if(c<0x110) {
354                illegal|=(trail&0xc0)^0x80;
355            } else {
356                /* code point>0x10ffff, outside Unicode */
357                illegal=1;
358                break;
359            }
360        case 2:
361            trail=*s++;
362            c=(c<<6)|(trail&0x3f);
363            illegal|=(trail&0xc0)^0x80;
364        case 1:
365            trail=*s++;
366            c=(c<<6)|(trail&0x3f);
367            illegal|=(trail&0xc0)^0x80;
368            break;
369        case 0:
370            return U_SENTINEL;
371        /* no default branch to optimize switch()  - all values are covered */
372        }
373    } else {
374        illegal=1; /* too few bytes left */
375    }
376
377    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
378    /* illegal is also set if count>=4 */
379    if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
380        /* error handling */
381        /* don't go beyond this sequence */
382        s=*ps;
383        while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
384            ++s;
385            --count;
386        }
387        c=U_SENTINEL;
388    }
389    *ps=s;
390    return c;
391}
392
393U_CAPI UChar* U_EXPORT2
394u_strFromUTF8WithSub(UChar *dest,
395              int32_t destCapacity,
396              int32_t *pDestLength,
397              const char* src,
398              int32_t srcLength,
399              UChar32 subchar, int32_t *pNumSubstitutions,
400              UErrorCode *pErrorCode){
401    UChar *pDest = dest;
402    UChar *pDestLimit = dest+destCapacity;
403    UChar32 ch;
404    int32_t reqLength = 0;
405    const uint8_t* pSrc = (const uint8_t*) src;
406    uint8_t t1, t2; /* trail bytes */
407    int32_t numSubstitutions;
408
409    /* args check */
410    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
411        return NULL;
412    }
413
414    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
415        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
416        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
417    ) {
418        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
419        return NULL;
420    }
421
422    if(pNumSubstitutions!=NULL) {
423        *pNumSubstitutions=0;
424    }
425    numSubstitutions=0;
426
427    /*
428     * Inline processing of UTF-8 byte sequences:
429     *
430     * Byte sequences for the most common characters are handled inline in
431     * the conversion loops. In order to reduce the path lengths for those
432     * characters, the tests are arranged in a kind of binary search.
433     * ASCII (<=0x7f) is checked first, followed by the dividing point
434     * between 2- and 3-byte sequences (0xe0).
435     * The 3-byte branch is tested first to speed up CJK text.
436     * The compiler should combine the subtractions for the two tests for 0xe0.
437     * Each branch then tests for the other end of its range.
438     */
439
440    if(srcLength < 0){
441        /*
442         * Transform a NUL-terminated string.
443         * The code explicitly checks for NULs only in the lead byte position.
444         * A NUL byte in the trail byte position fails the trail byte range check anyway.
445         */
446        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
447            if(ch <= 0x7f){
448                *pDest++=(UChar)ch;
449                ++pSrc;
450            } else {
451                if(ch > 0xe0) {
452                    if( /* handle U+1000..U+CFFF inline */
453                        ch <= 0xec &&
454                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
455                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
456                    ) {
457                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
458                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
459                        pSrc += 3;
460                        continue;
461                    }
462                } else if(ch < 0xe0) {
463                    if( /* handle U+0080..U+07FF inline */
464                        ch >= 0xc2 &&
465                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
466                    ) {
467                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
468                        pSrc += 2;
469                        continue;
470                    }
471                }
472
473                /* function call for "complicated" and error cases */
474                ++pSrc; /* continue after the lead byte */
475                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
476                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
477                    *pErrorCode = U_INVALID_CHAR_FOUND;
478                    return NULL;
479                } else if(ch<=0xFFFF) {
480                    *(pDest++)=(UChar)ch;
481                } else {
482                    *(pDest++)=UTF16_LEAD(ch);
483                    if(pDest<pDestLimit) {
484                        *(pDest++)=UTF16_TRAIL(ch);
485                    } else {
486                        reqLength++;
487                        break;
488                    }
489                }
490            }
491        }
492
493        /* Pre-flight the rest of the string. */
494        while((ch = *pSrc) != 0) {
495            if(ch <= 0x7f){
496                ++reqLength;
497                ++pSrc;
498            } else {
499                if(ch > 0xe0) {
500                    if( /* handle U+1000..U+CFFF inline */
501                        ch <= 0xec &&
502                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
503                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
504                    ) {
505                        ++reqLength;
506                        pSrc += 3;
507                        continue;
508                    }
509                } else if(ch < 0xe0) {
510                    if( /* handle U+0080..U+07FF inline */
511                        ch >= 0xc2 &&
512                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
513                    ) {
514                        ++reqLength;
515                        pSrc += 2;
516                        continue;
517                    }
518                }
519
520                /* function call for "complicated" and error cases */
521                ++pSrc; /* continue after the lead byte */
522                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
523                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
524                    *pErrorCode = U_INVALID_CHAR_FOUND;
525                    return NULL;
526                }
527                reqLength += U16_LENGTH(ch);
528            }
529        }
530    } else /* srcLength >= 0 */ {
531        const uint8_t *pSrcLimit = pSrc + srcLength;
532        int32_t count;
533
534        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
535        for(;;) {
536            /*
537             * Each iteration of the inner loop progresses by at most 3 UTF-8
538             * bytes and one UChar, for most characters.
539             * For supplementary code points (4 & 2), which are rare,
540             * there is an additional adjustment.
541             */
542            count = (int32_t)(pDestLimit - pDest);
543            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
544            if(count > srcLength) {
545                count = srcLength; /* min(remaining dest, remaining src/3) */
546            }
547            if(count < 3) {
548                /*
549                 * Too much overhead if we get near the end of the string,
550                 * continue with the next loop.
551                 */
552                break;
553            }
554
555            do {
556                ch = *pSrc;
557                if(ch <= 0x7f){
558                    *pDest++=(UChar)ch;
559                    ++pSrc;
560                } else {
561                    if(ch > 0xe0) {
562                        if( /* handle U+1000..U+CFFF inline */
563                            ch <= 0xec &&
564                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
565                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
566                        ) {
567                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
568                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
569                            pSrc += 3;
570                            continue;
571                        }
572                    } else if(ch < 0xe0) {
573                        if( /* handle U+0080..U+07FF inline */
574                            ch >= 0xc2 &&
575                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
576                        ) {
577                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
578                            pSrc += 2;
579                            continue;
580                        }
581                    }
582
583                    if(ch >= 0xf0 || subchar > 0xffff) {
584                        /*
585                         * We may read up to six bytes and write up to two UChars,
586                         * which we didn't account for with computing count,
587                         * so we adjust it here.
588                         */
589                        if(--count == 0) {
590                            break;
591                        }
592                    }
593
594                    /* function call for "complicated" and error cases */
595                    ++pSrc; /* continue after the lead byte */
596                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
597                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
598                        *pErrorCode = U_INVALID_CHAR_FOUND;
599                        return NULL;
600                    }else if(ch<=0xFFFF){
601                        *(pDest++)=(UChar)ch;
602                    }else{
603                        *(pDest++)=UTF16_LEAD(ch);
604                        *(pDest++)=UTF16_TRAIL(ch);
605                    }
606                }
607            } while(--count > 0);
608        }
609
610        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
611            ch = *pSrc;
612            if(ch <= 0x7f){
613                *pDest++=(UChar)ch;
614                ++pSrc;
615            } else {
616                if(ch > 0xe0) {
617                    if( /* handle U+1000..U+CFFF inline */
618                        ch <= 0xec &&
619                        ((pSrcLimit - pSrc) >= 3) &&
620                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
621                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
622                    ) {
623                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
624                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
625                        pSrc += 3;
626                        continue;
627                    }
628                } else if(ch < 0xe0) {
629                    if( /* handle U+0080..U+07FF inline */
630                        ch >= 0xc2 &&
631                        ((pSrcLimit - pSrc) >= 2) &&
632                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
633                    ) {
634                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
635                        pSrc += 2;
636                        continue;
637                    }
638                }
639
640                /* function call for "complicated" and error cases */
641                ++pSrc; /* continue after the lead byte */
642                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
643                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
644                    *pErrorCode = U_INVALID_CHAR_FOUND;
645                    return NULL;
646                }else if(ch<=0xFFFF){
647                    *(pDest++)=(UChar)ch;
648                }else{
649                    *(pDest++)=UTF16_LEAD(ch);
650                    if(pDest<pDestLimit){
651                        *(pDest++)=UTF16_TRAIL(ch);
652                    }else{
653                        reqLength++;
654                        break;
655                    }
656                }
657            }
658        }
659        /* do not fill the dest buffer just count the UChars needed */
660        while(pSrc < pSrcLimit){
661            ch = *pSrc;
662            if(ch <= 0x7f){
663                reqLength++;
664                ++pSrc;
665            } else {
666                if(ch > 0xe0) {
667                    if( /* handle U+1000..U+CFFF inline */
668                        ch <= 0xec &&
669                        ((pSrcLimit - pSrc) >= 3) &&
670                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
671                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
672                    ) {
673                        reqLength++;
674                        pSrc += 3;
675                        continue;
676                    }
677                } else if(ch < 0xe0) {
678                    if( /* handle U+0080..U+07FF inline */
679                        ch >= 0xc2 &&
680                        ((pSrcLimit - pSrc) >= 2) &&
681                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
682                    ) {
683                        reqLength++;
684                        pSrc += 2;
685                        continue;
686                    }
687                }
688
689                /* function call for "complicated" and error cases */
690                ++pSrc; /* continue after the lead byte */
691                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
692                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
693                    *pErrorCode = U_INVALID_CHAR_FOUND;
694                    return NULL;
695                }
696                reqLength+=UTF_CHAR_LENGTH(ch);
697            }
698        }
699    }
700
701    reqLength+=(int32_t)(pDest - dest);
702
703    if(pNumSubstitutions!=NULL) {
704        *pNumSubstitutions=numSubstitutions;
705    }
706
707    if(pDestLength){
708        *pDestLength = reqLength;
709    }
710
711    /* Terminate the buffer */
712    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
713
714    return dest;
715}
716
717U_CAPI UChar* U_EXPORT2
718u_strFromUTF8(UChar *dest,
719              int32_t destCapacity,
720              int32_t *pDestLength,
721              const char* src,
722              int32_t srcLength,
723              UErrorCode *pErrorCode){
724    return u_strFromUTF8WithSub(
725            dest, destCapacity, pDestLength,
726            src, srcLength,
727            U_SENTINEL, NULL,
728            pErrorCode);
729}
730
731U_CAPI UChar * U_EXPORT2
732u_strFromUTF8Lenient(UChar *dest,
733                     int32_t destCapacity,
734                     int32_t *pDestLength,
735                     const char *src,
736                     int32_t srcLength,
737                     UErrorCode *pErrorCode) {
738    UChar *pDest = dest;
739    UChar32 ch;
740    int32_t reqLength = 0;
741    uint8_t* pSrc = (uint8_t*) src;
742
743    /* args check */
744    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
745        return NULL;
746    }
747
748    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
749        (destCapacity<0) || (dest == NULL && destCapacity > 0)
750    ) {
751        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
752        return NULL;
753    }
754
755    if(srcLength < 0) {
756        /* Transform a NUL-terminated string. */
757        UChar *pDestLimit = dest+destCapacity;
758        uint8_t t1, t2, t3; /* trail bytes */
759
760        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
761            if(ch < 0xc0) {
762                /*
763                 * ASCII, or a trail byte in lead position which is treated like
764                 * a single-byte sequence for better character boundary
765                 * resynchronization after illegal sequences.
766                 */
767                *pDest++=(UChar)ch;
768                ++pSrc;
769                continue;
770            } else if(ch < 0xe0) { /* U+0080..U+07FF */
771                if((t1 = pSrc[1]) != 0) {
772                    /* 0x3080 = (0xc0 << 6) + 0x80 */
773                    *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
774                    pSrc += 2;
775                    continue;
776                }
777            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
778                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
779                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
780                    /* 0x2080 = (0x80 << 6) + 0x80 */
781                    *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
782                    pSrc += 3;
783                    continue;
784                }
785            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
786                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
787                    pSrc += 4;
788                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
789                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
790                    *(pDest++) = U16_LEAD(ch);
791                    if(pDest < pDestLimit) {
792                        *(pDest++) = U16_TRAIL(ch);
793                    } else {
794                        reqLength = 1;
795                        break;
796                    }
797                    continue;
798                }
799            }
800
801            /* truncated character at the end */
802            *pDest++ = 0xfffd;
803            while(*++pSrc != 0) {}
804            break;
805        }
806
807        /* Pre-flight the rest of the string. */
808        while((ch = *pSrc) != 0) {
809            if(ch < 0xc0) {
810                /*
811                 * ASCII, or a trail byte in lead position which is treated like
812                 * a single-byte sequence for better character boundary
813                 * resynchronization after illegal sequences.
814                 */
815                ++reqLength;
816                ++pSrc;
817                continue;
818            } else if(ch < 0xe0) { /* U+0080..U+07FF */
819                if(pSrc[1] != 0) {
820                    ++reqLength;
821                    pSrc += 2;
822                    continue;
823                }
824            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
825                if(pSrc[1] != 0 && pSrc[2] != 0) {
826                    ++reqLength;
827                    pSrc += 3;
828                    continue;
829                }
830            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
831                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
832                    reqLength += 2;
833                    pSrc += 4;
834                    continue;
835                }
836            }
837
838            /* truncated character at the end */
839            ++reqLength;
840            break;
841        }
842    } else /* srcLength >= 0 */ {
843        const uint8_t *pSrcLimit = pSrc + srcLength;
844
845        /*
846         * This function requires that if srcLength is given, then it must be
847         * destCapatity >= srcLength so that we need not check for
848         * destination buffer overflow in the loop.
849         */
850        if(destCapacity < srcLength) {
851            if(pDestLength != NULL) {
852                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
853            }
854            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
855            return NULL;
856        }
857
858        if((pSrcLimit - pSrc) >= 4) {
859            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
860
861            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
862            do {
863                ch = *pSrc++;
864                if(ch < 0xc0) {
865                    /*
866                     * ASCII, or a trail byte in lead position which is treated like
867                     * a single-byte sequence for better character boundary
868                     * resynchronization after illegal sequences.
869                     */
870                    *pDest++=(UChar)ch;
871                } else if(ch < 0xe0) { /* U+0080..U+07FF */
872                    /* 0x3080 = (0xc0 << 6) + 0x80 */
873                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
874                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
875                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
876                    /* 0x2080 = (0x80 << 6) + 0x80 */
877                    ch = (ch << 12) + (*pSrc++ << 6);
878                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
879                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
880                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
881                    ch = (ch << 18) + (*pSrc++ << 12);
882                    ch += *pSrc++ << 6;
883                    ch += *pSrc++ - 0x3c82080;
884                    *(pDest++) = U16_LEAD(ch);
885                    *(pDest++) = U16_TRAIL(ch);
886                }
887            } while(pSrc < pSrcLimit);
888
889            pSrcLimit += 3; /* restore original pSrcLimit */
890        }
891
892        while(pSrc < pSrcLimit) {
893            ch = *pSrc++;
894            if(ch < 0xc0) {
895                /*
896                 * ASCII, or a trail byte in lead position which is treated like
897                 * a single-byte sequence for better character boundary
898                 * resynchronization after illegal sequences.
899                 */
900                *pDest++=(UChar)ch;
901                continue;
902            } else if(ch < 0xe0) { /* U+0080..U+07FF */
903                if(pSrc < pSrcLimit) {
904                    /* 0x3080 = (0xc0 << 6) + 0x80 */
905                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
906                    continue;
907                }
908            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
909                if((pSrcLimit - pSrc) >= 2) {
910                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
911                    /* 0x2080 = (0x80 << 6) + 0x80 */
912                    ch = (ch << 12) + (*pSrc++ << 6);
913                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
914                    pSrc += 3;
915                    continue;
916                }
917            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
918                if((pSrcLimit - pSrc) >= 3) {
919                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
920                    ch = (ch << 18) + (*pSrc++ << 12);
921                    ch += *pSrc++ << 6;
922                    ch += *pSrc++ - 0x3c82080;
923                    *(pDest++) = U16_LEAD(ch);
924                    *(pDest++) = U16_TRAIL(ch);
925                    pSrc += 4;
926                    continue;
927                }
928            }
929
930            /* truncated character at the end */
931            *pDest++ = 0xfffd;
932            break;
933        }
934    }
935
936    reqLength+=(int32_t)(pDest - dest);
937
938    if(pDestLength){
939        *pDestLength = reqLength;
940    }
941
942    /* Terminate the buffer */
943    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
944
945    return dest;
946}
947
948static U_INLINE uint8_t *
949_appendUTF8(uint8_t *pDest, UChar32 c) {
950    /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
951    if((c)<=0x7f) {
952        *pDest++=(uint8_t)c;
953    } else if(c<=0x7ff) {
954        *pDest++=(uint8_t)((c>>6)|0xc0);
955        *pDest++=(uint8_t)((c&0x3f)|0x80);
956    } else if(c<=0xffff) {
957        *pDest++=(uint8_t)((c>>12)|0xe0);
958        *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
959        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
960    } else /* if((uint32_t)(c)<=0x10ffff) */ {
961        *pDest++=(uint8_t)(((c)>>18)|0xf0);
962        *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
963        *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
964        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
965    }
966    return pDest;
967}
968
969
/*
 * Convert a UTF-16 string to UTF-8, optionally substituting a replacement
 * code point for each unpaired surrogate.
 *
 * dest, destCapacity  Output buffer and its capacity in bytes.
 *                     dest may be NULL only if destCapacity is 0.
 * pDestLength         If not NULL, receives the number of bytes of the
 *                     complete output, even if that is more than destCapacity.
 * pSrc, srcLength     Input; srcLength==-1 means that pSrc is NUL-terminated.
 * subchar             If >=0, this code point is written for each unpaired
 *                     surrogate. If negative, an unpaired surrogate sets
 *                     U_INVALID_CHAR_FOUND and NULL is returned.
 *                     Rejected if >0x10ffff or if U_IS_SURROGATE(subchar).
 * pNumSubstitutions   If not NULL, set to 0 after the argument checks and to
 *                     the number of substitutions when the function reaches
 *                     its normal end (not on an early error return).
 * pErrorCode          In/out status. Nothing is done if it is NULL or if it
 *                     already indicates a failure.
 *
 * Returns dest, or NULL on an early return (bad status, illegal argument,
 * unpaired surrogate without a subchar).
 */
U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const UChar *pSrc,
            int32_t srcLength,
            UChar32 subchar, int32_t *pNumSubstitutions,
            UErrorCode *pErrorCode){
    /*
     * While converting, reqLength only holds the byte count of input that did
     * not fit into dest; the bytes actually written are added at the end.
     */
    int32_t reqLength=0;
    uint32_t ch=0,ch2=0;
    uint8_t *pDest = (uint8_t *)dest;
    uint8_t *pDestLimit = pDest + destCapacity;
    int32_t numSubstitutions;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength==-1) {
        /*
         * NUL-terminated input: convert until the terminator or until one
         * character does not fit any more. In the latter case the character
         * has already been consumed, so its length is stored in reqLength
         * and the counting loop below continues after it.
         */
        while((ch=*pSrc)!=0) {
            ++pSrc;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                /* BMP code unit outside of the surrogate range d800..dfff */
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
                if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=UTF16_GET_PAIR_VALUE(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                /* ch is now either the supplementary code point or subchar */
                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /*
         * Preflighting: count the bytes for the rest of the input without
         * writing anything. Does nothing if the loop above reached the NUL.
         */
        while((ch=*pSrc++)!=0) {
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!UTF_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    } else {
        const UChar *pSrcLimit = pSrc+srcLength;
        int32_t count;

        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            count = (int32_t)((pDestLimit - pDest) / 3);
            srcLength = (int32_t)(pSrcLimit - pSrc);
            if(count > srcLength) {
                count = srcLength; /* min(remaining dest/3, remaining src) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }
            /*
             * count is the number of UChars that can be converted without
             * looking at either limit; it is decremented once per UChar read.
             */
            do {
                ch=*pSrc++;
                if(ch <= 0x7f) {
                    *pDest++ = (uint8_t)ch;
                } else if(ch <= 0x7ff) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else if(ch <= 0xd7ff || ch >= 0xe000) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else /* ch is a surrogate */ {
                    /*
                     * We will read two UChars and probably output four bytes,
                     * which we didn't account for with computing count,
                     * so we adjust it here.
                     */
                    if(--count == 0) {
                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
                        break;  /* recompute count */
                    }

                    /* count>0 here, so *pSrc is still inside the counted range */
                    if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                        ++pSrc;
                        ch=UTF16_GET_PAIR_VALUE(ch, ch2);

                        /* writing 4 bytes per 2 UChars is ok */
                        *pDest++=(uint8_t)((ch>>18)|0xf0);
                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
                    } else  {
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                        if(subchar>=0) {
                            ch=subchar;
                            ++numSubstitutions;
                        } else {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return NULL;
                        }

                        /*
                         * convert and append;
                         * the extra --count above reserved room for up to
                         * six bytes, the subchar needs at most four
                         */
                        pDest=_appendUTF8(pDest, ch);
                    }
                }
            } while(--count > 0);
        }

        /*
         * Convert the rest with a check of both limits for each character.
         * As in the NUL-terminated case, a character that does not fit has
         * already been consumed and only its length is recorded in reqLength.
         */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /* the trail surrogate is only read if it is inside the input */
                if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=UTF16_GET_PAIR_VALUE(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /* Preflighting: only count the bytes for whatever input is left. */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!UTF_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    }

    /* total = bytes that did not fit (counted above) + bytes written */
    reqLength+=(int32_t)(pDest - (uint8_t *)dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}
1233
1234U_CAPI char* U_EXPORT2
1235u_strToUTF8(char *dest,
1236            int32_t destCapacity,
1237            int32_t *pDestLength,
1238            const UChar *pSrc,
1239            int32_t srcLength,
1240            UErrorCode *pErrorCode){
1241    return u_strToUTF8WithSub(
1242            dest, destCapacity, pDestLength,
1243            pSrc, srcLength,
1244            U_SENTINEL, NULL,
1245            pErrorCode);
1246}
1247
1248U_CAPI UChar* U_EXPORT2
1249u_strFromJavaModifiedUTF8WithSub(
1250        UChar *dest,
1251        int32_t destCapacity,
1252        int32_t *pDestLength,
1253        const char *src,
1254        int32_t srcLength,
1255        UChar32 subchar, int32_t *pNumSubstitutions,
1256        UErrorCode *pErrorCode) {
1257    UChar *pDest = dest;
1258    UChar *pDestLimit = dest+destCapacity;
1259    UChar32 ch;
1260    int32_t reqLength = 0;
1261    const uint8_t* pSrc = (const uint8_t*) src;
1262    const uint8_t *pSrcLimit;
1263    int32_t count;
1264    uint8_t t1, t2; /* trail bytes */
1265    int32_t numSubstitutions;
1266
1267    /* args check */
1268    if(U_FAILURE(*pErrorCode)){
1269        return NULL;
1270    }
1271    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1272        (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1273        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1274    ) {
1275        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1276        return NULL;
1277    }
1278
1279    if(pNumSubstitutions!=NULL) {
1280        *pNumSubstitutions=0;
1281    }
1282    numSubstitutions=0;
1283
1284    if(srcLength < 0) {
1285        /*
1286         * Transform a NUL-terminated ASCII string.
1287         * Handle non-ASCII strings with slower code.
1288         */
1289        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1290            *pDest++=(UChar)ch;
1291            ++pSrc;
1292        }
1293        if(ch == 0) {
1294            reqLength=(int32_t)(pDest - dest);
1295            if(pDestLength) {
1296                *pDestLength = reqLength;
1297            }
1298
1299            /* Terminate the buffer */
1300            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1301            return dest;
1302        }
1303        srcLength = uprv_strlen((const char *)pSrc);
1304    }
1305
1306    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1307    pSrcLimit = pSrc + srcLength;
1308    for(;;) {
1309        count = (int32_t)(pDestLimit - pDest);
1310        srcLength = (int32_t)(pSrcLimit - pSrc);
1311        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1312            /* fast ASCII loop */
1313            const uint8_t *prevSrc = pSrc;
1314            int32_t delta;
1315            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1316                *pDest++=(UChar)ch;
1317                ++pSrc;
1318            }
1319            delta = (int32_t)(pSrc - prevSrc);
1320            count -= delta;
1321            srcLength -= delta;
1322        }
1323        /*
1324         * Each iteration of the inner loop progresses by at most 3 UTF-8
1325         * bytes and one UChar.
1326         */
1327        srcLength /= 3;
1328        if(count > srcLength) {
1329            count = srcLength; /* min(remaining dest, remaining src/3) */
1330        }
1331        if(count < 3) {
1332            /*
1333             * Too much overhead if we get near the end of the string,
1334             * continue with the next loop.
1335             */
1336            break;
1337        }
1338        do {
1339            ch = *pSrc;
1340            if(ch <= 0x7f){
1341                *pDest++=(UChar)ch;
1342                ++pSrc;
1343            } else {
1344                if(ch >= 0xe0) {
1345                    if( /* handle U+0000..U+FFFF inline */
1346                        ch <= 0xef &&
1347                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1348                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1349                    ) {
1350                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1351                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1352                        pSrc += 3;
1353                        continue;
1354                    }
1355                } else {
1356                    if( /* handle U+0000..U+07FF inline */
1357                        ch >= 0xc0 &&
1358                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1359                    ) {
1360                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1361                        pSrc += 2;
1362                        continue;
1363                    }
1364                }
1365
1366                if(subchar < 0) {
1367                    *pErrorCode = U_INVALID_CHAR_FOUND;
1368                    return NULL;
1369                } else if(subchar > 0xffff && --count == 0) {
1370                    /*
1371                     * We need to write two UChars, adjusted count for that,
1372                     * and ran out of space.
1373                     */
1374                    break;
1375                } else {
1376                    /* function call for error cases */
1377                    ++pSrc; /* continue after the lead byte */
1378                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1379                    ++numSubstitutions;
1380                    if(subchar<=0xFFFF) {
1381                        *(pDest++)=(UChar)subchar;
1382                    } else {
1383                        *(pDest++)=U16_LEAD(subchar);
1384                        *(pDest++)=U16_TRAIL(subchar);
1385                    }
1386                }
1387            }
1388        } while(--count > 0);
1389    }
1390
1391    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1392        ch = *pSrc;
1393        if(ch <= 0x7f){
1394            *pDest++=(UChar)ch;
1395            ++pSrc;
1396        } else {
1397            if(ch >= 0xe0) {
1398                if( /* handle U+0000..U+FFFF inline */
1399                    ch <= 0xef &&
1400                    ((pSrcLimit - pSrc) >= 3) &&
1401                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1402                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1403                ) {
1404                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1405                    *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1406                    pSrc += 3;
1407                    continue;
1408                }
1409            } else {
1410                if( /* handle U+0000..U+07FF inline */
1411                    ch >= 0xc0 &&
1412                    ((pSrcLimit - pSrc) >= 2) &&
1413                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1414                ) {
1415                    *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1416                    pSrc += 2;
1417                    continue;
1418                }
1419            }
1420
1421            if(subchar < 0) {
1422                *pErrorCode = U_INVALID_CHAR_FOUND;
1423                return NULL;
1424            } else {
1425                /* function call for error cases */
1426                ++pSrc; /* continue after the lead byte */
1427                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1428                ++numSubstitutions;
1429                if(subchar<=0xFFFF) {
1430                    *(pDest++)=(UChar)subchar;
1431                } else {
1432                    *(pDest++)=U16_LEAD(subchar);
1433                    if(pDest<pDestLimit) {
1434                        *(pDest++)=U16_TRAIL(subchar);
1435                    } else {
1436                        reqLength++;
1437                        break;
1438                    }
1439                }
1440            }
1441        }
1442    }
1443
1444    /* do not fill the dest buffer just count the UChars needed */
1445    while(pSrc < pSrcLimit){
1446        ch = *pSrc;
1447        if(ch <= 0x7f) {
1448            reqLength++;
1449            ++pSrc;
1450        } else {
1451            if(ch >= 0xe0) {
1452                if( /* handle U+0000..U+FFFF inline */
1453                    ch <= 0xef &&
1454                    ((pSrcLimit - pSrc) >= 3) &&
1455                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1456                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1457                ) {
1458                    reqLength++;
1459                    pSrc += 3;
1460                    continue;
1461                }
1462            } else {
1463                if( /* handle U+0000..U+07FF inline */
1464                    ch >= 0xc0 &&
1465                    ((pSrcLimit - pSrc) >= 2) &&
1466                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1467                ) {
1468                    reqLength++;
1469                    pSrc += 2;
1470                    continue;
1471                }
1472            }
1473
1474            if(subchar < 0) {
1475                *pErrorCode = U_INVALID_CHAR_FOUND;
1476                return NULL;
1477            } else {
1478                /* function call for error cases */
1479                ++pSrc; /* continue after the lead byte */
1480                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1481                ++numSubstitutions;
1482                reqLength+=U16_LENGTH(ch);
1483            }
1484        }
1485    }
1486
1487    if(pNumSubstitutions!=NULL) {
1488        *pNumSubstitutions=numSubstitutions;
1489    }
1490
1491    reqLength+=(int32_t)(pDest - dest);
1492    if(pDestLength) {
1493        *pDestLength = reqLength;
1494    }
1495
1496    /* Terminate the buffer */
1497    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1498    return dest;
1499}
1500
1501U_CAPI char* U_EXPORT2
1502u_strToJavaModifiedUTF8(
1503        char *dest,
1504        int32_t destCapacity,
1505        int32_t *pDestLength,
1506        const UChar *src,
1507        int32_t srcLength,
1508        UErrorCode *pErrorCode) {
1509    int32_t reqLength=0;
1510    uint32_t ch=0;
1511    uint8_t *pDest = (uint8_t *)dest;
1512    uint8_t *pDestLimit = pDest + destCapacity;
1513    const UChar *pSrcLimit;
1514    int32_t count;
1515
1516    /* args check */
1517    if(U_FAILURE(*pErrorCode)){
1518        return NULL;
1519    }
1520    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1521        (dest==NULL && destCapacity!=0) || destCapacity<0
1522    ) {
1523        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1524        return NULL;
1525    }
1526
1527    if(srcLength==-1) {
1528        /* Convert NUL-terminated ASCII, then find the string length. */
1529        while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1530            *pDest++ = (uint8_t)ch;
1531            ++src;
1532        }
1533        if(ch == 0) {
1534            reqLength=(int32_t)(pDest - (uint8_t *)dest);
1535            if(pDestLength) {
1536                *pDestLength = reqLength;
1537            }
1538
1539            /* Terminate the buffer */
1540            u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1541            return dest;
1542        }
1543        srcLength = u_strlen(src);
1544    }
1545
1546    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1547    pSrcLimit = src+srcLength;
1548    for(;;) {
1549        count = (int32_t)(pDestLimit - pDest);
1550        srcLength = (int32_t)(pSrcLimit - src);
1551        if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1552            /* fast ASCII loop */
1553            const UChar *prevSrc = src;
1554            int32_t delta;
1555            while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1556                *pDest++=(uint8_t)ch;
1557                ++src;
1558            }
1559            delta = (int32_t)(src - prevSrc);
1560            count -= delta;
1561            srcLength -= delta;
1562        }
1563        /*
1564         * Each iteration of the inner loop progresses by at most 3 UTF-8
1565         * bytes and one UChar.
1566         */
1567        count /= 3;
1568        if(count > srcLength) {
1569            count = srcLength; /* min(remaining dest/3, remaining src) */
1570        }
1571        if(count < 3) {
1572            /*
1573             * Too much overhead if we get near the end of the string,
1574             * continue with the next loop.
1575             */
1576            break;
1577        }
1578        do {
1579            ch=*src++;
1580            if(ch <= 0x7f && ch != 0) {
1581                *pDest++ = (uint8_t)ch;
1582            } else if(ch <= 0x7ff) {
1583                *pDest++=(uint8_t)((ch>>6)|0xc0);
1584                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1585            } else {
1586                *pDest++=(uint8_t)((ch>>12)|0xe0);
1587                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1588                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1589            }
1590        } while(--count > 0);
1591    }
1592
1593    while(src<pSrcLimit) {
1594        ch=*src++;
1595        if(ch <= 0x7f && ch != 0) {
1596            if(pDest<pDestLimit) {
1597                *pDest++ = (uint8_t)ch;
1598            } else {
1599                reqLength = 1;
1600                break;
1601            }
1602        } else if(ch <= 0x7ff) {
1603            if((pDestLimit - pDest) >= 2) {
1604                *pDest++=(uint8_t)((ch>>6)|0xc0);
1605                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1606            } else {
1607                reqLength = 2;
1608                break;
1609            }
1610        } else {
1611            if((pDestLimit - pDest) >= 3) {
1612                *pDest++=(uint8_t)((ch>>12)|0xe0);
1613                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1614                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1615            } else {
1616                reqLength = 3;
1617                break;
1618            }
1619        }
1620    }
1621    while(src<pSrcLimit) {
1622        ch=*src++;
1623        if(ch <= 0x7f && ch != 0) {
1624            ++reqLength;
1625        } else if(ch<=0x7ff) {
1626            reqLength+=2;
1627        } else {
1628            reqLength+=3;
1629        }
1630    }
1631
1632    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1633    if(pDestLength){
1634        *pDestLength = reqLength;
1635    }
1636
1637    /* Terminate the buffer */
1638    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1639    return dest;
1640}
1641