1/*
2******************************************************************************
3*
4*   Copyright (C) 1998-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*
9* File ustring.h
10*
11* Modification History:
12*
13*   Date        Name        Description
14*   12/07/98    bertrand    Creation.
15******************************************************************************
16*/
17
18#include "unicode/utypes.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "cstring.h"
22#include "cwchar.h"
23#include "cmemory.h"
24#include "ustr_imp.h"
25
26/* ANSI string.h - style functions ------------------------------------------ */
27
28/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
29#define U_BMP_MAX 0xffff
30
31/* Forward binary string search functions ----------------------------------- */
32
33/*
34 * Test if a substring match inside a string is at code point boundaries.
35 * All pointers refer to the same buffer.
36 * The limit pointer may be NULL, all others must be real pointers.
37 */
38static U_INLINE UBool
39isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
40    if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
41        /* the leading edge of the match is in the middle of a surrogate pair */
42        return FALSE;
43    }
44    if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
45        /* the trailing edge of the match is in the middle of a surrogate pair */
46        return FALSE;
47    }
48    return TRUE;
49}
50
51U_CAPI UChar * U_EXPORT2
52u_strFindFirst(const UChar *s, int32_t length,
53               const UChar *sub, int32_t subLength) {
54    const UChar *start, *p, *q, *subLimit;
55    UChar c, cs, cq;
56
57    if(sub==NULL || subLength<-1) {
58        return (UChar *)s;
59    }
60    if(s==NULL || length<-1) {
61        return NULL;
62    }
63
64    start=s;
65
66    if(length<0 && subLength<0) {
67        /* both strings are NUL-terminated */
68        if((cs=*sub++)==0) {
69            return (UChar *)s;
70        }
71        if(*sub==0 && !U16_IS_SURROGATE(cs)) {
72            /* the substring consists of a single, non-surrogate BMP code point */
73            return u_strchr(s, cs);
74        }
75
76        while((c=*s++)!=0) {
77            if(c==cs) {
78                /* found first substring UChar, compare rest */
79                p=s;
80                q=sub;
81                for(;;) {
82                    if((cq=*q)==0) {
83                        if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
84                            return (UChar *)(s-1); /* well-formed match */
85                        } else {
86                            break; /* no match because surrogate pair is split */
87                        }
88                    }
89                    if((c=*p)==0) {
90                        return NULL; /* no match, and none possible after s */
91                    }
92                    if(c!=cq) {
93                        break; /* no match */
94                    }
95                    ++p;
96                    ++q;
97                }
98            }
99        }
100
101        /* not found */
102        return NULL;
103    }
104
105    if(subLength<0) {
106        subLength=u_strlen(sub);
107    }
108    if(subLength==0) {
109        return (UChar *)s;
110    }
111
112    /* get sub[0] to search for it fast */
113    cs=*sub++;
114    --subLength;
115    subLimit=sub+subLength;
116
117    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
118        /* the substring consists of a single, non-surrogate BMP code point */
119        return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
120    }
121
122    if(length<0) {
123        /* s is NUL-terminated */
124        while((c=*s++)!=0) {
125            if(c==cs) {
126                /* found first substring UChar, compare rest */
127                p=s;
128                q=sub;
129                for(;;) {
130                    if(q==subLimit) {
131                        if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
132                            return (UChar *)(s-1); /* well-formed match */
133                        } else {
134                            break; /* no match because surrogate pair is split */
135                        }
136                    }
137                    if((c=*p)==0) {
138                        return NULL; /* no match, and none possible after s */
139                    }
140                    if(c!=*q) {
141                        break; /* no match */
142                    }
143                    ++p;
144                    ++q;
145                }
146            }
147        }
148    } else {
149        const UChar *limit, *preLimit;
150
151        /* subLength was decremented above */
152        if(length<=subLength) {
153            return NULL; /* s is shorter than sub */
154        }
155
156        limit=s+length;
157
158        /* the substring must start before preLimit */
159        preLimit=limit-subLength;
160
161        while(s!=preLimit) {
162            c=*s++;
163            if(c==cs) {
164                /* found first substring UChar, compare rest */
165                p=s;
166                q=sub;
167                for(;;) {
168                    if(q==subLimit) {
169                        if(isMatchAtCPBoundary(start, s-1, p, limit)) {
170                            return (UChar *)(s-1); /* well-formed match */
171                        } else {
172                            break; /* no match because surrogate pair is split */
173                        }
174                    }
175                    if(*p!=*q) {
176                        break; /* no match */
177                    }
178                    ++p;
179                    ++q;
180                }
181            }
182        }
183    }
184
185    /* not found */
186    return NULL;
187}
188
189U_CAPI UChar * U_EXPORT2
190u_strstr(const UChar *s, const UChar *substring) {
191    return u_strFindFirst(s, -1, substring, -1);
192}
193
194U_CAPI UChar * U_EXPORT2
195u_strchr(const UChar *s, UChar c) {
196    if(U16_IS_SURROGATE(c)) {
197        /* make sure to not find half of a surrogate pair */
198        return u_strFindFirst(s, -1, &c, 1);
199    } else {
200        UChar cs;
201
202        /* trivial search for a BMP code point */
203        for(;;) {
204            if((cs=*s)==c) {
205                return (UChar *)s;
206            }
207            if(cs==0) {
208                return NULL;
209            }
210            ++s;
211        }
212    }
213}
214
215U_CAPI UChar * U_EXPORT2
216u_strchr32(const UChar *s, UChar32 c) {
217    if((uint32_t)c<=U_BMP_MAX) {
218        /* find BMP code point */
219        return u_strchr(s, (UChar)c);
220    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
221        /* find supplementary code point as surrogate pair */
222        UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
223
224        while((cs=*s++)!=0) {
225            if(cs==lead && *s==trail) {
226                return (UChar *)(s-1);
227            }
228        }
229        return NULL;
230    } else {
231        /* not a Unicode code point, not findable */
232        return NULL;
233    }
234}
235
236U_CAPI UChar * U_EXPORT2
237u_memchr(const UChar *s, UChar c, int32_t count) {
238    if(count<=0) {
239        return NULL; /* no string */
240    } else if(U16_IS_SURROGATE(c)) {
241        /* make sure to not find half of a surrogate pair */
242        return u_strFindFirst(s, count, &c, 1);
243    } else {
244        /* trivial search for a BMP code point */
245        const UChar *limit=s+count;
246        do {
247            if(*s==c) {
248                return (UChar *)s;
249            }
250        } while(++s!=limit);
251        return NULL;
252    }
253}
254
255U_CAPI UChar * U_EXPORT2
256u_memchr32(const UChar *s, UChar32 c, int32_t count) {
257    if((uint32_t)c<=U_BMP_MAX) {
258        /* find BMP code point */
259        return u_memchr(s, (UChar)c, count);
260    } else if(count<2) {
261        /* too short for a surrogate pair */
262        return NULL;
263    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
264        /* find supplementary code point as surrogate pair */
265        const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
266        UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
267
268        do {
269            if(*s==lead && *(s+1)==trail) {
270                return (UChar *)s;
271            }
272        } while(++s!=limit);
273        return NULL;
274    } else {
275        /* not a Unicode code point, not findable */
276        return NULL;
277    }
278}
279
280/* Backward binary string search functions ---------------------------------- */
281
282U_CAPI UChar * U_EXPORT2
283u_strFindLast(const UChar *s, int32_t length,
284              const UChar *sub, int32_t subLength) {
285    const UChar *start, *limit, *p, *q, *subLimit;
286    UChar c, cs;
287
288    if(sub==NULL || subLength<-1) {
289        return (UChar *)s;
290    }
291    if(s==NULL || length<-1) {
292        return NULL;
293    }
294
295    /*
296     * This implementation is more lazy than the one for u_strFindFirst():
297     * There is no special search code for NUL-terminated strings.
298     * It does not seem to be worth it for searching substrings to
299     * search forward and find all matches like in u_strrchr() and similar.
300     * Therefore, we simply get both string lengths and search backward.
301     *
302     * markus 2002oct23
303     */
304
305    if(subLength<0) {
306        subLength=u_strlen(sub);
307    }
308    if(subLength==0) {
309        return (UChar *)s;
310    }
311
312    /* get sub[subLength-1] to search for it fast */
313    subLimit=sub+subLength;
314    cs=*(--subLimit);
315    --subLength;
316
317    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
318        /* the substring consists of a single, non-surrogate BMP code point */
319        return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
320    }
321
322    if(length<0) {
323        length=u_strlen(s);
324    }
325
326    /* subLength was decremented above */
327    if(length<=subLength) {
328        return NULL; /* s is shorter than sub */
329    }
330
331    start=s;
332    limit=s+length;
333
334    /* the substring must start no later than s+subLength */
335    s+=subLength;
336
337    while(s!=limit) {
338        c=*(--limit);
339        if(c==cs) {
340            /* found last substring UChar, compare rest */
341            p=limit;
342            q=subLimit;
343            for(;;) {
344                if(q==sub) {
345                    if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
346                        return (UChar *)p; /* well-formed match */
347                    } else {
348                        break; /* no match because surrogate pair is split */
349                    }
350                }
351                if(*(--p)!=*(--q)) {
352                    break; /* no match */
353                }
354            }
355        }
356    }
357
358    /* not found */
359    return NULL;
360}
361
362U_CAPI UChar * U_EXPORT2
363u_strrstr(const UChar *s, const UChar *substring) {
364    return u_strFindLast(s, -1, substring, -1);
365}
366
367U_CAPI UChar * U_EXPORT2
368u_strrchr(const UChar *s, UChar c) {
369    if(U16_IS_SURROGATE(c)) {
370        /* make sure to not find half of a surrogate pair */
371        return u_strFindLast(s, -1, &c, 1);
372    } else {
373        const UChar *result=NULL;
374        UChar cs;
375
376        /* trivial search for a BMP code point */
377        for(;;) {
378            if((cs=*s)==c) {
379                result=s;
380            }
381            if(cs==0) {
382                return (UChar *)result;
383            }
384            ++s;
385        }
386    }
387}
388
389U_CAPI UChar * U_EXPORT2
390u_strrchr32(const UChar *s, UChar32 c) {
391    if((uint32_t)c<=U_BMP_MAX) {
392        /* find BMP code point */
393        return u_strrchr(s, (UChar)c);
394    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
395        /* find supplementary code point as surrogate pair */
396        const UChar *result=NULL;
397        UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
398
399        while((cs=*s++)!=0) {
400            if(cs==lead && *s==trail) {
401                result=s-1;
402            }
403        }
404        return (UChar *)result;
405    } else {
406        /* not a Unicode code point, not findable */
407        return NULL;
408    }
409}
410
411U_CAPI UChar * U_EXPORT2
412u_memrchr(const UChar *s, UChar c, int32_t count) {
413    if(count<=0) {
414        return NULL; /* no string */
415    } else if(U16_IS_SURROGATE(c)) {
416        /* make sure to not find half of a surrogate pair */
417        return u_strFindLast(s, count, &c, 1);
418    } else {
419        /* trivial search for a BMP code point */
420        const UChar *limit=s+count;
421        do {
422            if(*(--limit)==c) {
423                return (UChar *)limit;
424            }
425        } while(s!=limit);
426        return NULL;
427    }
428}
429
430U_CAPI UChar * U_EXPORT2
431u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
432    if((uint32_t)c<=U_BMP_MAX) {
433        /* find BMP code point */
434        return u_memrchr(s, (UChar)c, count);
435    } else if(count<2) {
436        /* too short for a surrogate pair */
437        return NULL;
438    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
439        /* find supplementary code point as surrogate pair */
440        const UChar *limit=s+count-1;
441        UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
442
443        do {
444            if(*limit==trail && *(limit-1)==lead) {
445                return (UChar *)(limit-1);
446            }
447        } while(s!=--limit);
448        return NULL;
449    } else {
450        /* not a Unicode code point, not findable */
451        return NULL;
452    }
453}
454
455/* Tokenization functions --------------------------------------------------- */
456
457/*
458 * Match each code point in a string against each code point in the matchSet.
459 * Return the index of the first string code point that
460 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
461 * Return -(string length)-1 if there is no such code point.
462 */
463static int32_t
464_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
465    int32_t matchLen, matchBMPLen, strItr, matchItr;
466    UChar32 stringCh, matchCh;
467    UChar c, c2;
468
469    /* first part of matchSet contains only BMP code points */
470    matchBMPLen = 0;
471    while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
472        ++matchBMPLen;
473    }
474
475    /* second part of matchSet contains BMP and supplementary code points */
476    matchLen = matchBMPLen;
477    while(matchSet[matchLen] != 0) {
478        ++matchLen;
479    }
480
481    for(strItr = 0; (c = string[strItr]) != 0;) {
482        ++strItr;
483        if(U16_IS_SINGLE(c)) {
484            if(polarity) {
485                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
486                    if(c == matchSet[matchItr]) {
487                        return strItr - 1; /* one matches */
488                    }
489                }
490            } else {
491                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
492                    if(c == matchSet[matchItr]) {
493                        goto endloop;
494                    }
495                }
496                return strItr - 1; /* none matches */
497            }
498        } else {
499            /*
500             * No need to check for string length before U16_IS_TRAIL
501             * because c2 could at worst be the terminating NUL.
502             */
503            if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
504                ++strItr;
505                stringCh = U16_GET_SUPPLEMENTARY(c, c2);
506            } else {
507                stringCh = c; /* unpaired trail surrogate */
508            }
509
510            if(polarity) {
511                for(matchItr = matchBMPLen; matchItr < matchLen;) {
512                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
513                    if(stringCh == matchCh) {
514                        return strItr - U16_LENGTH(stringCh); /* one matches */
515                    }
516                }
517            } else {
518                for(matchItr = matchBMPLen; matchItr < matchLen;) {
519                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
520                    if(stringCh == matchCh) {
521                        goto endloop;
522                    }
523                }
524                return strItr - U16_LENGTH(stringCh); /* none matches */
525            }
526        }
527endloop:
528        /* wish C had continue with labels like Java... */;
529    }
530
531    /* Didn't find it. */
532    return -strItr-1;
533}
534
535/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
536U_CAPI UChar * U_EXPORT2
537u_strpbrk(const UChar *string, const UChar *matchSet)
538{
539    int32_t idx = _matchFromSet(string, matchSet, TRUE);
540    if(idx >= 0) {
541        return (UChar *)string + idx;
542    } else {
543        return NULL;
544    }
545}
546
547/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
548U_CAPI int32_t U_EXPORT2
549u_strcspn(const UChar *string, const UChar *matchSet)
550{
551    int32_t idx = _matchFromSet(string, matchSet, TRUE);
552    if(idx >= 0) {
553        return idx;
554    } else {
555        return -idx - 1; /* == u_strlen(string) */
556    }
557}
558
559/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
560U_CAPI int32_t U_EXPORT2
561u_strspn(const UChar *string, const UChar *matchSet)
562{
563    int32_t idx = _matchFromSet(string, matchSet, FALSE);
564    if(idx >= 0) {
565        return idx;
566    } else {
567        return -idx - 1; /* == u_strlen(string) */
568    }
569}
570
571/* ----- Text manipulation functions --- */
572
573U_CAPI UChar* U_EXPORT2
574u_strtok_r(UChar    *src,
575     const UChar    *delim,
576           UChar   **saveState)
577{
578    UChar *tokSource;
579    UChar *nextToken;
580    uint32_t nonDelimIdx;
581
582    /* If saveState is NULL, the user messed up. */
583    if (src != NULL) {
584        tokSource = src;
585        *saveState = src; /* Set to "src" in case there are no delimiters */
586    }
587    else if (*saveState) {
588        tokSource = *saveState;
589    }
590    else {
591        /* src == NULL && *saveState == NULL */
592        /* This shouldn't happen. We already finished tokenizing. */
593        return NULL;
594    }
595
596    /* Skip initial delimiters */
597    nonDelimIdx = u_strspn(tokSource, delim);
598    tokSource = &tokSource[nonDelimIdx];
599
600    if (*tokSource) {
601        nextToken = u_strpbrk(tokSource, delim);
602        if (nextToken != NULL) {
603            /* Create a token */
604            *(nextToken++) = 0;
605            *saveState = nextToken;
606            return tokSource;
607        }
608        else if (*saveState) {
609            /* Return the last token */
610            *saveState = NULL;
611            return tokSource;
612        }
613    }
614    else {
615        /* No tokens were found. Only delimiters were left. */
616        *saveState = NULL;
617    }
618    return NULL;
619}
620
621/* Miscellaneous functions -------------------------------------------------- */
622
623U_CAPI UChar* U_EXPORT2
624u_strcat(UChar     *dst,
625    const UChar     *src)
626{
627    UChar *anchor = dst;            /* save a pointer to start of dst */
628
629    while(*dst != 0) {              /* To end of first string          */
630        ++dst;
631    }
632    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
633    }
634
635    return anchor;
636}
637
638U_CAPI UChar*  U_EXPORT2
639u_strncat(UChar     *dst,
640     const UChar     *src,
641     int32_t     n )
642{
643    if(n > 0) {
644        UChar *anchor = dst;            /* save a pointer to start of dst */
645
646        while(*dst != 0) {              /* To end of first string          */
647            ++dst;
648        }
649        while((*dst = *src) != 0) {     /* copy string 2 over              */
650            ++dst;
651            if(--n == 0) {
652                *dst = 0;
653                break;
654            }
655            ++src;
656        }
657
658        return anchor;
659    } else {
660        return dst;
661    }
662}
663
664/* ----- Text property functions --- */
665
666U_CAPI int32_t   U_EXPORT2
667u_strcmp(const UChar *s1,
668    const UChar *s2)
669{
670    UChar  c1, c2;
671
672    for(;;) {
673        c1=*s1++;
674        c2=*s2++;
675        if (c1 != c2 || c1 == 0) {
676            break;
677        }
678    }
679    return (int32_t)c1 - (int32_t)c2;
680}
681
682U_CFUNC int32_t U_EXPORT2
683uprv_strCompare(const UChar *s1, int32_t length1,
684                const UChar *s2, int32_t length2,
685                UBool strncmpStyle, UBool codePointOrder) {
686    const UChar *start1, *start2, *limit1, *limit2;
687    UChar c1, c2;
688
689    /* setup for fix-up */
690    start1=s1;
691    start2=s2;
692
693    /* compare identical prefixes - they do not need to be fixed up */
694    if(length1<0 && length2<0) {
695        /* strcmp style, both NUL-terminated */
696        if(s1==s2) {
697            return 0;
698        }
699
700        for(;;) {
701            c1=*s1;
702            c2=*s2;
703            if(c1!=c2) {
704                break;
705            }
706            if(c1==0) {
707                return 0;
708            }
709            ++s1;
710            ++s2;
711        }
712
713        /* setup for fix-up */
714        limit1=limit2=NULL;
715    } else if(strncmpStyle) {
716        /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
717        if(s1==s2) {
718            return 0;
719        }
720
721        limit1=start1+length1;
722
723        for(;;) {
724            /* both lengths are same, check only one limit */
725            if(s1==limit1) {
726                return 0;
727            }
728
729            c1=*s1;
730            c2=*s2;
731            if(c1!=c2) {
732                break;
733            }
734            if(c1==0) {
735                return 0;
736            }
737            ++s1;
738            ++s2;
739        }
740
741        /* setup for fix-up */
742        limit2=start2+length1; /* use length1 here, too, to enforce assumption */
743    } else {
744        /* memcmp/UnicodeString style, both length-specified */
745        int32_t lengthResult;
746
747        if(length1<0) {
748            length1=u_strlen(s1);
749        }
750        if(length2<0) {
751            length2=u_strlen(s2);
752        }
753
754        /* limit1=start1+min(lenght1, length2) */
755        if(length1<length2) {
756            lengthResult=-1;
757            limit1=start1+length1;
758        } else if(length1==length2) {
759            lengthResult=0;
760            limit1=start1+length1;
761        } else /* length1>length2 */ {
762            lengthResult=1;
763            limit1=start1+length2;
764        }
765
766        if(s1==s2) {
767            return lengthResult;
768        }
769
770        for(;;) {
771            /* check pseudo-limit */
772            if(s1==limit1) {
773                return lengthResult;
774            }
775
776            c1=*s1;
777            c2=*s2;
778            if(c1!=c2) {
779                break;
780            }
781            ++s1;
782            ++s2;
783        }
784
785        /* setup for fix-up */
786        limit1=start1+length1;
787        limit2=start2+length2;
788    }
789
790    /* if both values are in or above the surrogate range, fix them up */
791    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
792        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
793        if(
794            (c1<=0xdbff && (s1+1)!=limit1 && UTF_IS_TRAIL(*(s1+1))) ||
795            (UTF_IS_TRAIL(c1) && start1!=s1 && UTF_IS_LEAD(*(s1-1)))
796        ) {
797            /* part of a surrogate pair, leave >=d800 */
798        } else {
799            /* BMP code point - may be surrogate code point - make <d800 */
800            c1-=0x2800;
801        }
802
803        if(
804            (c2<=0xdbff && (s2+1)!=limit2 && UTF_IS_TRAIL(*(s2+1))) ||
805            (UTF_IS_TRAIL(c2) && start2!=s2 && UTF_IS_LEAD(*(s2-1)))
806        ) {
807            /* part of a surrogate pair, leave >=d800 */
808        } else {
809            /* BMP code point - may be surrogate code point - make <d800 */
810            c2-=0x2800;
811        }
812    }
813
814    /* now c1 and c2 are in the requested (code unit or code point) order */
815    return (int32_t)c1-(int32_t)c2;
816}
817
818/*
819 * Compare two strings as presented by UCharIterators.
820 * Use code unit or code point order.
821 * When the function returns, it is undefined where the iterators
822 * have stopped.
823 */
824U_CAPI int32_t U_EXPORT2
825u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
826    UChar32 c1, c2;
827
828    /* argument checking */
829    if(iter1==NULL || iter2==NULL) {
830        return 0; /* bad arguments */
831    }
832    if(iter1==iter2) {
833        return 0; /* identical iterators */
834    }
835
836    /* reset iterators to start? */
837    iter1->move(iter1, 0, UITER_START);
838    iter2->move(iter2, 0, UITER_START);
839
840    /* compare identical prefixes - they do not need to be fixed up */
841    for(;;) {
842        c1=iter1->next(iter1);
843        c2=iter2->next(iter2);
844        if(c1!=c2) {
845            break;
846        }
847        if(c1==-1) {
848            return 0;
849        }
850    }
851
852    /* if both values are in or above the surrogate range, fix them up */
853    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
854        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
855        if(
856            (c1<=0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) ||
857            (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1))))
858        ) {
859            /* part of a surrogate pair, leave >=d800 */
860        } else {
861            /* BMP code point - may be surrogate code point - make <d800 */
862            c1-=0x2800;
863        }
864
865        if(
866            (c2<=0xdbff && UTF_IS_TRAIL(iter2->current(iter2))) ||
867            (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2))))
868        ) {
869            /* part of a surrogate pair, leave >=d800 */
870        } else {
871            /* BMP code point - may be surrogate code point - make <d800 */
872            c2-=0x2800;
873        }
874    }
875
876    /* now c1 and c2 are in the requested (code unit or code point) order */
877    return (int32_t)c1-(int32_t)c2;
878}
879
880#if 0
881/*
882 * u_strCompareIter() does not leave the iterators _on_ the different units.
883 * This is possible but would cost a few extra indirect function calls to back
884 * up if the last unit (c1 or c2 respectively) was >=0.
885 *
886 * Consistently leaving them _behind_ the different units is not an option
887 * because the current "unit" is the end of the string if that is reached,
888 * and in such a case the iterator does not move.
889 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
890 * of their strings. Calling previous() on each does not move them to where
891 * the comparison fails.
892 *
893 * So the simplest semantics is to not define where the iterators end up.
894 *
895 * The following fragment is part of what would need to be done for backing up.
896 */
897void fragment {
898        /* iff a surrogate is part of a surrogate pair, leave >=d800 */
899        if(c1<=0xdbff) {
900            if(!UTF_IS_TRAIL(iter1->current(iter1))) {
901                /* lead surrogate code point - make <d800 */
902                c1-=0x2800;
903            }
904        } else if(c1<=0xdfff) {
905            int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
906            iter1->previous(iter1); /* ==c1 */
907            if(!UTF_IS_LEAD(iter1->previous(iter1))) {
908                /* trail surrogate code point - make <d800 */
909                c1-=0x2800;
910            }
911            /* go back to behind where the difference is */
912            iter1->move(iter1, idx, UITER_ZERO);
913        } else /* 0xe000<=c1<=0xffff */ {
914            /* BMP code point - make <d800 */
915            c1-=0x2800;
916        }
917}
918#endif
919
920U_CAPI int32_t U_EXPORT2
921u_strCompare(const UChar *s1, int32_t length1,
922             const UChar *s2, int32_t length2,
923             UBool codePointOrder) {
924    /* argument checking */
925    if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
926        return 0;
927    }
928    return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
929}
930
931/* String compare in code point order - u_strcmp() compares in code unit order. */
932U_CAPI int32_t U_EXPORT2
933u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
934    return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
935}
936
937U_CAPI int32_t   U_EXPORT2
938u_strncmp(const UChar     *s1,
939     const UChar     *s2,
940     int32_t     n)
941{
942    if(n > 0) {
943        int32_t rc;
944        for(;;) {
945            rc = (int32_t)*s1 - (int32_t)*s2;
946            if(rc != 0 || *s1 == 0 || --n == 0) {
947                return rc;
948            }
949            ++s1;
950            ++s2;
951        }
952    } else {
953        return 0;
954    }
955}
956
957U_CAPI int32_t U_EXPORT2
958u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
959    return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
960}
961
962U_CAPI UChar* U_EXPORT2
963u_strcpy(UChar     *dst,
964    const UChar     *src)
965{
966    UChar *anchor = dst;            /* save a pointer to start of dst */
967
968    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
969    }
970
971    return anchor;
972}
973
974U_CAPI UChar*  U_EXPORT2
975u_strncpy(UChar     *dst,
976     const UChar     *src,
977     int32_t     n)
978{
979    UChar *anchor = dst;            /* save a pointer to start of dst */
980
981    /* copy string 2 over */
982    while(n > 0 && (*(dst++) = *(src++)) != 0) {
983        --n;
984    }
985
986    return anchor;
987}
988
989U_CAPI int32_t   U_EXPORT2
990u_strlen(const UChar *s)
991{
992#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
993    return (int32_t)uprv_wcslen(s);
994#else
995    const UChar *t = s;
996    while(*t != 0) {
997      ++t;
998    }
999    return t - s;
1000#endif
1001}
1002
1003U_CAPI int32_t U_EXPORT2
1004u_countChar32(const UChar *s, int32_t length) {
1005    int32_t count;
1006
1007    if(s==NULL || length<-1) {
1008        return 0;
1009    }
1010
1011    count=0;
1012    if(length>=0) {
1013        while(length>0) {
1014            ++count;
1015            if(UTF_IS_LEAD(*s) && length>=2 && UTF_IS_TRAIL(*(s+1))) {
1016                s+=2;
1017                length-=2;
1018            } else {
1019                ++s;
1020                --length;
1021            }
1022        }
1023    } else /* length==-1 */ {
1024        UChar c;
1025
1026        for(;;) {
1027            if((c=*s++)==0) {
1028                break;
1029            }
1030            ++count;
1031
1032            /*
1033             * sufficient to look ahead one because of UTF-16;
1034             * safe to look ahead one because at worst that would be the terminating NUL
1035             */
1036            if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(*s)) {
1037                ++s;
1038            }
1039        }
1040    }
1041    return count;
1042}
1043
1044U_CAPI UBool U_EXPORT2
1045u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1046
1047    if(number<0) {
1048        return TRUE;
1049    }
1050    if(s==NULL || length<-1) {
1051        return FALSE;
1052    }
1053
1054    if(length==-1) {
1055        /* s is NUL-terminated */
1056        UChar c;
1057
1058        /* count code points until they exceed */
1059        for(;;) {
1060            if((c=*s++)==0) {
1061                return FALSE;
1062            }
1063            if(number==0) {
1064                return TRUE;
1065            }
1066            if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1067                ++s;
1068            }
1069            --number;
1070        }
1071    } else {
1072        /* length>=0 known */
1073        const UChar *limit;
1074        int32_t maxSupplementary;
1075
1076        /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1077        if(((length+1)/2)>number) {
1078            return TRUE;
1079        }
1080
1081        /* check if s does not even contain enough UChars */
1082        maxSupplementary=length-number;
1083        if(maxSupplementary<=0) {
1084            return FALSE;
1085        }
1086        /* there are maxSupplementary=length-number more UChars than asked-for code points */
1087
1088        /*
1089         * count code points until they exceed and also check that there are
1090         * no more than maxSupplementary supplementary code points (UChar pairs)
1091         */
1092        limit=s+length;
1093        for(;;) {
1094            if(s==limit) {
1095                return FALSE;
1096            }
1097            if(number==0) {
1098                return TRUE;
1099            }
1100            if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1101                ++s;
1102                if(--maxSupplementary<=0) {
1103                    /* too many pairs - too few code points */
1104                    return FALSE;
1105                }
1106            }
1107            --number;
1108        }
1109    }
1110}
1111
1112U_CAPI UChar * U_EXPORT2
1113u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1114    return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR);
1115}
1116
1117U_CAPI UChar * U_EXPORT2
1118u_memmove(UChar *dest, const UChar *src, int32_t count) {
1119    return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR);
1120}
1121
1122U_CAPI UChar * U_EXPORT2
1123u_memset(UChar *dest, UChar c, int32_t count) {
1124    if(count > 0) {
1125        UChar *ptr = dest;
1126        UChar *limit = dest + count;
1127
1128        while (ptr < limit) {
1129            *(ptr++) = c;
1130        }
1131    }
1132    return dest;
1133}
1134
1135U_CAPI int32_t U_EXPORT2
1136u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1137    if(count > 0) {
1138        const UChar *limit = buf1 + count;
1139        int32_t result;
1140
1141        while (buf1 < limit) {
1142            result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1143            if (result != 0) {
1144                return result;
1145            }
1146            buf1++;
1147            buf2++;
1148        }
1149    }
1150    return 0;
1151}
1152
1153U_CAPI int32_t U_EXPORT2
1154u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
1155    return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
1156}
1157
1158/* u_unescape & support fns ------------------------------------------------- */
1159
1160/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1161static const UChar UNESCAPE_MAP[] = {
1162    /*"   0x22, 0x22 */
1163    /*'   0x27, 0x27 */
1164    /*?   0x3F, 0x3F */
1165    /*\   0x5C, 0x5C */
1166    /*a*/ 0x61, 0x07,
1167    /*b*/ 0x62, 0x08,
1168    /*e*/ 0x65, 0x1b,
1169    /*f*/ 0x66, 0x0c,
1170    /*n*/ 0x6E, 0x0a,
1171    /*r*/ 0x72, 0x0d,
1172    /*t*/ 0x74, 0x09,
1173    /*v*/ 0x76, 0x0b
1174};
1175enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) };
1176
1177/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1178static int8_t _digit8(UChar c) {
1179    if (c >= 0x0030 && c <= 0x0037) {
1180        return (int8_t)(c - 0x0030);
1181    }
1182    return -1;
1183}
1184
1185/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1186static int8_t _digit16(UChar c) {
1187    if (c >= 0x0030 && c <= 0x0039) {
1188        return (int8_t)(c - 0x0030);
1189    }
1190    if (c >= 0x0041 && c <= 0x0046) {
1191        return (int8_t)(c - (0x0041 - 10));
1192    }
1193    if (c >= 0x0061 && c <= 0x0066) {
1194        return (int8_t)(c - (0x0061 - 10));
1195    }
1196    return -1;
1197}
1198
1199/* Parse a single escape sequence.  Although this method deals in
1200 * UChars, it does not use C++ or UnicodeString.  This allows it to
1201 * be used from C contexts. */
1202U_CAPI UChar32 U_EXPORT2
1203u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1204             int32_t *offset,
1205             int32_t length,
1206             void *context) {
1207
1208    int32_t start = *offset;
1209    UChar c;
1210    UChar32 result = 0;
1211    int8_t n = 0;
1212    int8_t minDig = 0;
1213    int8_t maxDig = 0;
1214    int8_t bitsPerDigit = 4;
1215    int8_t dig;
1216    int32_t i;
1217    UBool braces = FALSE;
1218
1219    /* Check that offset is in range */
1220    if (*offset < 0 || *offset >= length) {
1221        goto err;
1222    }
1223
1224    /* Fetch first UChar after '\\' */
1225    c = charAt((*offset)++, context);
1226
1227    /* Convert hexadecimal and octal escapes */
1228    switch (c) {
1229    case 0x0075 /*'u'*/:
1230        minDig = maxDig = 4;
1231        break;
1232    case 0x0055 /*'U'*/:
1233        minDig = maxDig = 8;
1234        break;
1235    case 0x0078 /*'x'*/:
1236        minDig = 1;
1237        if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1238            ++(*offset);
1239            braces = TRUE;
1240            maxDig = 8;
1241        } else {
1242            maxDig = 2;
1243        }
1244        break;
1245    default:
1246        dig = _digit8(c);
1247        if (dig >= 0) {
1248            minDig = 1;
1249            maxDig = 3;
1250            n = 1; /* Already have first octal digit */
1251            bitsPerDigit = 3;
1252            result = dig;
1253        }
1254        break;
1255    }
1256    if (minDig != 0) {
1257        while (*offset < length && n < maxDig) {
1258            c = charAt(*offset, context);
1259            dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1260            if (dig < 0) {
1261                break;
1262            }
1263            result = (result << bitsPerDigit) | dig;
1264            ++(*offset);
1265            ++n;
1266        }
1267        if (n < minDig) {
1268            goto err;
1269        }
1270        if (braces) {
1271            if (c != 0x7D /*}*/) {
1272                goto err;
1273            }
1274            ++(*offset);
1275        }
1276        if (result < 0 || result >= 0x110000) {
1277            goto err;
1278        }
1279        /* If an escape sequence specifies a lead surrogate, see if
1280         * there is a trail surrogate after it, either as an escape or
1281         * as a literal.  If so, join them up into a supplementary.
1282         */
1283        if (*offset < length && U16_IS_LEAD(result)) {
1284            int32_t ahead = *offset + 1;
1285            c = charAt(*offset, context);
1286            if (c == 0x5C /*'\\'*/ && ahead < length) {
1287                c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1288            }
1289            if (U16_IS_TRAIL(c)) {
1290                *offset = ahead;
1291                result = U16_GET_SUPPLEMENTARY(result, c);
1292            }
1293        }
1294        return result;
1295    }
1296
1297    /* Convert C-style escapes in table */
1298    for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1299        if (c == UNESCAPE_MAP[i]) {
1300            return UNESCAPE_MAP[i+1];
1301        } else if (c < UNESCAPE_MAP[i]) {
1302            break;
1303        }
1304    }
1305
1306    /* Map \cX to control-X: X & 0x1F */
1307    if (c == 0x0063 /*'c'*/ && *offset < length) {
1308        c = charAt((*offset)++, context);
1309        if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
1310            UChar c2 = charAt(*offset, context);
1311            if (UTF_IS_SECOND_SURROGATE(c2)) {
1312                ++(*offset);
1313                c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */
1314            }
1315        }
1316        return 0x1F & c;
1317    }
1318
1319    /* If no special forms are recognized, then consider
1320     * the backslash to generically escape the next character.
1321     * Deal with surrogate pairs. */
1322    if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
1323        UChar c2 = charAt(*offset, context);
1324        if (UTF_IS_SECOND_SURROGATE(c2)) {
1325            ++(*offset);
1326            return UTF16_GET_PAIR_VALUE(c, c2);
1327        }
1328    }
1329    return c;
1330
1331 err:
1332    /* Invalid escape sequence */
1333    *offset = start; /* Reset to initial value */
1334    return (UChar32)0xFFFFFFFF;
1335}
1336
1337/* u_unescapeAt() callback to return a UChar from a char* */
1338static UChar U_CALLCONV
1339_charPtr_charAt(int32_t offset, void *context) {
1340    UChar c16;
1341    /* It would be more efficient to access the invariant tables
1342     * directly but there is no API for that. */
1343    u_charsToUChars(((char*) context) + offset, &c16, 1);
1344    return c16;
1345}
1346
1347/* Append an escape-free segment of the text; used by u_unescape() */
1348static void _appendUChars(UChar *dest, int32_t destCapacity,
1349                          const char *src, int32_t srcLen) {
1350    if (destCapacity < 0) {
1351        destCapacity = 0;
1352    }
1353    if (srcLen > destCapacity) {
1354        srcLen = destCapacity;
1355    }
1356    u_charsToUChars(src, dest, srcLen);
1357}
1358
1359/* Do an invariant conversion of char* -> UChar*, with escape parsing */
1360U_CAPI int32_t U_EXPORT2
1361u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1362    const char *segment = src;
1363    int32_t i = 0;
1364    char c;
1365
1366    while ((c=*src) != 0) {
1367        /* '\\' intentionally written as compiler-specific
1368         * character constant to correspond to compiler-specific
1369         * char* constants. */
1370        if (c == '\\') {
1371            int32_t lenParsed = 0;
1372            UChar32 c32;
1373            if (src != segment) {
1374                if (dest != NULL) {
1375                    _appendUChars(dest + i, destCapacity - i,
1376                                  segment, (int32_t)(src - segment));
1377                }
1378                i += (int32_t)(src - segment);
1379            }
1380            ++src; /* advance past '\\' */
1381            c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1382            if (lenParsed == 0) {
1383                goto err;
1384            }
1385            src += lenParsed; /* advance past escape seq. */
1386            if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) {
1387                UTF_APPEND_CHAR_UNSAFE(dest, i, c32);
1388            } else {
1389                i += UTF_CHAR_LENGTH(c32);
1390            }
1391            segment = src;
1392        } else {
1393            ++src;
1394        }
1395    }
1396    if (src != segment) {
1397        if (dest != NULL) {
1398            _appendUChars(dest + i, destCapacity - i,
1399                          segment, (int32_t)(src - segment));
1400        }
1401        i += (int32_t)(src - segment);
1402    }
1403    if (dest != NULL && i < destCapacity) {
1404        dest[i] = 0;
1405    }
1406    return i;
1407
1408 err:
1409    if (dest != NULL && destCapacity > 0) {
1410        *dest = 0;
1411    }
1412    return 0;
1413}
1414
1415/* NUL-termination of strings ----------------------------------------------- */
1416
1417/**
1418 * NUL-terminate a string no matter what its type.
1419 * Set warning and error codes accordingly.
1420 */
1421#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
1422    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {                    \
1423        /* not a public function, so no complete argument checking */   \
1424                                                                        \
1425        if(length<0) {                                                  \
1426            /* assume that the caller handles this */                   \
1427        } else if(length<destCapacity) {                                \
1428            /* NUL-terminate the string, the NUL fits */                \
1429            dest[length]=0;                                             \
1430            /* unset the not-terminated warning but leave all others */ \
1431            if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {          \
1432                *pErrorCode=U_ZERO_ERROR;                               \
1433            }                                                           \
1434        } else if(length==destCapacity) {                               \
1435            /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1436            *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;                \
1437        } else /* length>destCapacity */ {                              \
1438            /* even the string itself did not fit - set an error code */ \
1439            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;                        \
1440        }                                                               \
1441    }
1442
1443U_CAPI int32_t U_EXPORT2
1444u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1445    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1446    return length;
1447}
1448
1449U_CAPI int32_t U_EXPORT2
1450u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1451    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1452    return length;
1453}
1454
1455U_CAPI int32_t U_EXPORT2
1456u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1457    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1458    return length;
1459}
1460
1461U_CAPI int32_t U_EXPORT2
1462u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1463    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1464    return length;
1465}
1466