1/*
2******************************************************************************
3*
4*   Copyright (C) 1998-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*
9* File ustring.cpp
10*
11* Modification History:
12*
13*   Date        Name        Description
14*   12/07/98    bertrand    Creation.
15******************************************************************************
16*/
17
18#include "unicode/utypes.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "unicode/utf16.h"
22#include "cstring.h"
23#include "cwchar.h"
24#include "cmemory.h"
25#include "ustr_imp.h"
26
27/* ANSI string.h - style functions ------------------------------------------ */
28
29/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
30#define U_BMP_MAX 0xffff
31
32/* Forward binary string search functions ----------------------------------- */
33
34/*
35 * Test if a substring match inside a string is at code point boundaries.
36 * All pointers refer to the same buffer.
37 * The limit pointer may be NULL, all others must be real pointers.
38 */
39static inline UBool
40isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
41    if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
42        /* the leading edge of the match is in the middle of a surrogate pair */
43        return FALSE;
44    }
45    if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
46        /* the trailing edge of the match is in the middle of a surrogate pair */
47        return FALSE;
48    }
49    return TRUE;
50}
51
52U_CAPI UChar * U_EXPORT2
53u_strFindFirst(const UChar *s, int32_t length,
54               const UChar *sub, int32_t subLength) {
55    const UChar *start, *p, *q, *subLimit;
56    UChar c, cs, cq;
57
58    if(sub==NULL || subLength<-1) {
59        return (UChar *)s;
60    }
61    if(s==NULL || length<-1) {
62        return NULL;
63    }
64
65    start=s;
66
67    if(length<0 && subLength<0) {
68        /* both strings are NUL-terminated */
69        if((cs=*sub++)==0) {
70            return (UChar *)s;
71        }
72        if(*sub==0 && !U16_IS_SURROGATE(cs)) {
73            /* the substring consists of a single, non-surrogate BMP code point */
74            return u_strchr(s, cs);
75        }
76
77        while((c=*s++)!=0) {
78            if(c==cs) {
79                /* found first substring UChar, compare rest */
80                p=s;
81                q=sub;
82                for(;;) {
83                    if((cq=*q)==0) {
84                        if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
85                            return (UChar *)(s-1); /* well-formed match */
86                        } else {
87                            break; /* no match because surrogate pair is split */
88                        }
89                    }
90                    if((c=*p)==0) {
91                        return NULL; /* no match, and none possible after s */
92                    }
93                    if(c!=cq) {
94                        break; /* no match */
95                    }
96                    ++p;
97                    ++q;
98                }
99            }
100        }
101
102        /* not found */
103        return NULL;
104    }
105
106    if(subLength<0) {
107        subLength=u_strlen(sub);
108    }
109    if(subLength==0) {
110        return (UChar *)s;
111    }
112
113    /* get sub[0] to search for it fast */
114    cs=*sub++;
115    --subLength;
116    subLimit=sub+subLength;
117
118    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
119        /* the substring consists of a single, non-surrogate BMP code point */
120        return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
121    }
122
123    if(length<0) {
124        /* s is NUL-terminated */
125        while((c=*s++)!=0) {
126            if(c==cs) {
127                /* found first substring UChar, compare rest */
128                p=s;
129                q=sub;
130                for(;;) {
131                    if(q==subLimit) {
132                        if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
133                            return (UChar *)(s-1); /* well-formed match */
134                        } else {
135                            break; /* no match because surrogate pair is split */
136                        }
137                    }
138                    if((c=*p)==0) {
139                        return NULL; /* no match, and none possible after s */
140                    }
141                    if(c!=*q) {
142                        break; /* no match */
143                    }
144                    ++p;
145                    ++q;
146                }
147            }
148        }
149    } else {
150        const UChar *limit, *preLimit;
151
152        /* subLength was decremented above */
153        if(length<=subLength) {
154            return NULL; /* s is shorter than sub */
155        }
156
157        limit=s+length;
158
159        /* the substring must start before preLimit */
160        preLimit=limit-subLength;
161
162        while(s!=preLimit) {
163            c=*s++;
164            if(c==cs) {
165                /* found first substring UChar, compare rest */
166                p=s;
167                q=sub;
168                for(;;) {
169                    if(q==subLimit) {
170                        if(isMatchAtCPBoundary(start, s-1, p, limit)) {
171                            return (UChar *)(s-1); /* well-formed match */
172                        } else {
173                            break; /* no match because surrogate pair is split */
174                        }
175                    }
176                    if(*p!=*q) {
177                        break; /* no match */
178                    }
179                    ++p;
180                    ++q;
181                }
182            }
183        }
184    }
185
186    /* not found */
187    return NULL;
188}
189
190U_CAPI UChar * U_EXPORT2
191u_strstr(const UChar *s, const UChar *substring) {
192    return u_strFindFirst(s, -1, substring, -1);
193}
194
195U_CAPI UChar * U_EXPORT2
196u_strchr(const UChar *s, UChar c) {
197    if(U16_IS_SURROGATE(c)) {
198        /* make sure to not find half of a surrogate pair */
199        return u_strFindFirst(s, -1, &c, 1);
200    } else {
201        UChar cs;
202
203        /* trivial search for a BMP code point */
204        for(;;) {
205            if((cs=*s)==c) {
206                return (UChar *)s;
207            }
208            if(cs==0) {
209                return NULL;
210            }
211            ++s;
212        }
213    }
214}
215
216U_CAPI UChar * U_EXPORT2
217u_strchr32(const UChar *s, UChar32 c) {
218    if((uint32_t)c<=U_BMP_MAX) {
219        /* find BMP code point */
220        return u_strchr(s, (UChar)c);
221    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
222        /* find supplementary code point as surrogate pair */
223        UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
224
225        while((cs=*s++)!=0) {
226            if(cs==lead && *s==trail) {
227                return (UChar *)(s-1);
228            }
229        }
230        return NULL;
231    } else {
232        /* not a Unicode code point, not findable */
233        return NULL;
234    }
235}
236
237U_CAPI UChar * U_EXPORT2
238u_memchr(const UChar *s, UChar c, int32_t count) {
239    if(count<=0) {
240        return NULL; /* no string */
241    } else if(U16_IS_SURROGATE(c)) {
242        /* make sure to not find half of a surrogate pair */
243        return u_strFindFirst(s, count, &c, 1);
244    } else {
245        /* trivial search for a BMP code point */
246        const UChar *limit=s+count;
247        do {
248            if(*s==c) {
249                return (UChar *)s;
250            }
251        } while(++s!=limit);
252        return NULL;
253    }
254}
255
256U_CAPI UChar * U_EXPORT2
257u_memchr32(const UChar *s, UChar32 c, int32_t count) {
258    if((uint32_t)c<=U_BMP_MAX) {
259        /* find BMP code point */
260        return u_memchr(s, (UChar)c, count);
261    } else if(count<2) {
262        /* too short for a surrogate pair */
263        return NULL;
264    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
265        /* find supplementary code point as surrogate pair */
266        const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
267        UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
268
269        do {
270            if(*s==lead && *(s+1)==trail) {
271                return (UChar *)s;
272            }
273        } while(++s!=limit);
274        return NULL;
275    } else {
276        /* not a Unicode code point, not findable */
277        return NULL;
278    }
279}
280
281/* Backward binary string search functions ---------------------------------- */
282
283U_CAPI UChar * U_EXPORT2
284u_strFindLast(const UChar *s, int32_t length,
285              const UChar *sub, int32_t subLength) {
286    const UChar *start, *limit, *p, *q, *subLimit;
287    UChar c, cs;
288
289    if(sub==NULL || subLength<-1) {
290        return (UChar *)s;
291    }
292    if(s==NULL || length<-1) {
293        return NULL;
294    }
295
296    /*
297     * This implementation is more lazy than the one for u_strFindFirst():
298     * There is no special search code for NUL-terminated strings.
299     * It does not seem to be worth it for searching substrings to
300     * search forward and find all matches like in u_strrchr() and similar.
301     * Therefore, we simply get both string lengths and search backward.
302     *
303     * markus 2002oct23
304     */
305
306    if(subLength<0) {
307        subLength=u_strlen(sub);
308    }
309    if(subLength==0) {
310        return (UChar *)s;
311    }
312
313    /* get sub[subLength-1] to search for it fast */
314    subLimit=sub+subLength;
315    cs=*(--subLimit);
316    --subLength;
317
318    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
319        /* the substring consists of a single, non-surrogate BMP code point */
320        return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
321    }
322
323    if(length<0) {
324        length=u_strlen(s);
325    }
326
327    /* subLength was decremented above */
328    if(length<=subLength) {
329        return NULL; /* s is shorter than sub */
330    }
331
332    start=s;
333    limit=s+length;
334
335    /* the substring must start no later than s+subLength */
336    s+=subLength;
337
338    while(s!=limit) {
339        c=*(--limit);
340        if(c==cs) {
341            /* found last substring UChar, compare rest */
342            p=limit;
343            q=subLimit;
344            for(;;) {
345                if(q==sub) {
346                    if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
347                        return (UChar *)p; /* well-formed match */
348                    } else {
349                        break; /* no match because surrogate pair is split */
350                    }
351                }
352                if(*(--p)!=*(--q)) {
353                    break; /* no match */
354                }
355            }
356        }
357    }
358
359    /* not found */
360    return NULL;
361}
362
363U_CAPI UChar * U_EXPORT2
364u_strrstr(const UChar *s, const UChar *substring) {
365    return u_strFindLast(s, -1, substring, -1);
366}
367
368U_CAPI UChar * U_EXPORT2
369u_strrchr(const UChar *s, UChar c) {
370    if(U16_IS_SURROGATE(c)) {
371        /* make sure to not find half of a surrogate pair */
372        return u_strFindLast(s, -1, &c, 1);
373    } else {
374        const UChar *result=NULL;
375        UChar cs;
376
377        /* trivial search for a BMP code point */
378        for(;;) {
379            if((cs=*s)==c) {
380                result=s;
381            }
382            if(cs==0) {
383                return (UChar *)result;
384            }
385            ++s;
386        }
387    }
388}
389
390U_CAPI UChar * U_EXPORT2
391u_strrchr32(const UChar *s, UChar32 c) {
392    if((uint32_t)c<=U_BMP_MAX) {
393        /* find BMP code point */
394        return u_strrchr(s, (UChar)c);
395    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
396        /* find supplementary code point as surrogate pair */
397        const UChar *result=NULL;
398        UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
399
400        while((cs=*s++)!=0) {
401            if(cs==lead && *s==trail) {
402                result=s-1;
403            }
404        }
405        return (UChar *)result;
406    } else {
407        /* not a Unicode code point, not findable */
408        return NULL;
409    }
410}
411
412U_CAPI UChar * U_EXPORT2
413u_memrchr(const UChar *s, UChar c, int32_t count) {
414    if(count<=0) {
415        return NULL; /* no string */
416    } else if(U16_IS_SURROGATE(c)) {
417        /* make sure to not find half of a surrogate pair */
418        return u_strFindLast(s, count, &c, 1);
419    } else {
420        /* trivial search for a BMP code point */
421        const UChar *limit=s+count;
422        do {
423            if(*(--limit)==c) {
424                return (UChar *)limit;
425            }
426        } while(s!=limit);
427        return NULL;
428    }
429}
430
431U_CAPI UChar * U_EXPORT2
432u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
433    if((uint32_t)c<=U_BMP_MAX) {
434        /* find BMP code point */
435        return u_memrchr(s, (UChar)c, count);
436    } else if(count<2) {
437        /* too short for a surrogate pair */
438        return NULL;
439    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
440        /* find supplementary code point as surrogate pair */
441        const UChar *limit=s+count-1;
442        UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
443
444        do {
445            if(*limit==trail && *(limit-1)==lead) {
446                return (UChar *)(limit-1);
447            }
448        } while(s!=--limit);
449        return NULL;
450    } else {
451        /* not a Unicode code point, not findable */
452        return NULL;
453    }
454}
455
456/* Tokenization functions --------------------------------------------------- */
457
458/*
459 * Match each code point in a string against each code point in the matchSet.
460 * Return the index of the first string code point that
461 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
462 * Return -(string length)-1 if there is no such code point.
463 */
464static int32_t
465_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
466    int32_t matchLen, matchBMPLen, strItr, matchItr;
467    UChar32 stringCh, matchCh;
468    UChar c, c2;
469
470    /* first part of matchSet contains only BMP code points */
471    matchBMPLen = 0;
472    while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
473        ++matchBMPLen;
474    }
475
476    /* second part of matchSet contains BMP and supplementary code points */
477    matchLen = matchBMPLen;
478    while(matchSet[matchLen] != 0) {
479        ++matchLen;
480    }
481
482    for(strItr = 0; (c = string[strItr]) != 0;) {
483        ++strItr;
484        if(U16_IS_SINGLE(c)) {
485            if(polarity) {
486                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
487                    if(c == matchSet[matchItr]) {
488                        return strItr - 1; /* one matches */
489                    }
490                }
491            } else {
492                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
493                    if(c == matchSet[matchItr]) {
494                        goto endloop;
495                    }
496                }
497                return strItr - 1; /* none matches */
498            }
499        } else {
500            /*
501             * No need to check for string length before U16_IS_TRAIL
502             * because c2 could at worst be the terminating NUL.
503             */
504            if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
505                ++strItr;
506                stringCh = U16_GET_SUPPLEMENTARY(c, c2);
507            } else {
508                stringCh = c; /* unpaired trail surrogate */
509            }
510
511            if(polarity) {
512                for(matchItr = matchBMPLen; matchItr < matchLen;) {
513                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
514                    if(stringCh == matchCh) {
515                        return strItr - U16_LENGTH(stringCh); /* one matches */
516                    }
517                }
518            } else {
519                for(matchItr = matchBMPLen; matchItr < matchLen;) {
520                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
521                    if(stringCh == matchCh) {
522                        goto endloop;
523                    }
524                }
525                return strItr - U16_LENGTH(stringCh); /* none matches */
526            }
527        }
528endloop:
529        /* wish C had continue with labels like Java... */;
530    }
531
532    /* Didn't find it. */
533    return -strItr-1;
534}
535
536/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
537U_CAPI UChar * U_EXPORT2
538u_strpbrk(const UChar *string, const UChar *matchSet)
539{
540    int32_t idx = _matchFromSet(string, matchSet, TRUE);
541    if(idx >= 0) {
542        return (UChar *)string + idx;
543    } else {
544        return NULL;
545    }
546}
547
548/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
549U_CAPI int32_t U_EXPORT2
550u_strcspn(const UChar *string, const UChar *matchSet)
551{
552    int32_t idx = _matchFromSet(string, matchSet, TRUE);
553    if(idx >= 0) {
554        return idx;
555    } else {
556        return -idx - 1; /* == u_strlen(string) */
557    }
558}
559
560/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
561U_CAPI int32_t U_EXPORT2
562u_strspn(const UChar *string, const UChar *matchSet)
563{
564    int32_t idx = _matchFromSet(string, matchSet, FALSE);
565    if(idx >= 0) {
566        return idx;
567    } else {
568        return -idx - 1; /* == u_strlen(string) */
569    }
570}
571
572/* ----- Text manipulation functions --- */
573
574U_CAPI UChar* U_EXPORT2
575u_strtok_r(UChar    *src,
576     const UChar    *delim,
577           UChar   **saveState)
578{
579    UChar *tokSource;
580    UChar *nextToken;
581    uint32_t nonDelimIdx;
582
583    /* If saveState is NULL, the user messed up. */
584    if (src != NULL) {
585        tokSource = src;
586        *saveState = src; /* Set to "src" in case there are no delimiters */
587    }
588    else if (*saveState) {
589        tokSource = *saveState;
590    }
591    else {
592        /* src == NULL && *saveState == NULL */
593        /* This shouldn't happen. We already finished tokenizing. */
594        return NULL;
595    }
596
597    /* Skip initial delimiters */
598    nonDelimIdx = u_strspn(tokSource, delim);
599    tokSource = &tokSource[nonDelimIdx];
600
601    if (*tokSource) {
602        nextToken = u_strpbrk(tokSource, delim);
603        if (nextToken != NULL) {
604            /* Create a token */
605            *(nextToken++) = 0;
606            *saveState = nextToken;
607            return tokSource;
608        }
609        else if (*saveState) {
610            /* Return the last token */
611            *saveState = NULL;
612            return tokSource;
613        }
614    }
615    else {
616        /* No tokens were found. Only delimiters were left. */
617        *saveState = NULL;
618    }
619    return NULL;
620}
621
622/* Miscellaneous functions -------------------------------------------------- */
623
624U_CAPI UChar* U_EXPORT2
625u_strcat(UChar     *dst,
626    const UChar     *src)
627{
628    UChar *anchor = dst;            /* save a pointer to start of dst */
629
630    while(*dst != 0) {              /* To end of first string          */
631        ++dst;
632    }
633    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
634    }
635
636    return anchor;
637}
638
639U_CAPI UChar*  U_EXPORT2
640u_strncat(UChar     *dst,
641     const UChar     *src,
642     int32_t     n )
643{
644    if(n > 0) {
645        UChar *anchor = dst;            /* save a pointer to start of dst */
646
647        while(*dst != 0) {              /* To end of first string          */
648            ++dst;
649        }
650        while((*dst = *src) != 0) {     /* copy string 2 over              */
651            ++dst;
652            if(--n == 0) {
653                *dst = 0;
654                break;
655            }
656            ++src;
657        }
658
659        return anchor;
660    } else {
661        return dst;
662    }
663}
664
665/* ----- Text property functions --- */
666
667U_CAPI int32_t   U_EXPORT2
668u_strcmp(const UChar *s1,
669    const UChar *s2)
670{
671    UChar  c1, c2;
672
673    for(;;) {
674        c1=*s1++;
675        c2=*s2++;
676        if (c1 != c2 || c1 == 0) {
677            break;
678        }
679    }
680    return (int32_t)c1 - (int32_t)c2;
681}
682
683U_CFUNC int32_t U_EXPORT2
684uprv_strCompare(const UChar *s1, int32_t length1,
685                const UChar *s2, int32_t length2,
686                UBool strncmpStyle, UBool codePointOrder) {
687    const UChar *start1, *start2, *limit1, *limit2;
688    UChar c1, c2;
689
690    /* setup for fix-up */
691    start1=s1;
692    start2=s2;
693
694    /* compare identical prefixes - they do not need to be fixed up */
695    if(length1<0 && length2<0) {
696        /* strcmp style, both NUL-terminated */
697        if(s1==s2) {
698            return 0;
699        }
700
701        for(;;) {
702            c1=*s1;
703            c2=*s2;
704            if(c1!=c2) {
705                break;
706            }
707            if(c1==0) {
708                return 0;
709            }
710            ++s1;
711            ++s2;
712        }
713
714        /* setup for fix-up */
715        limit1=limit2=NULL;
716    } else if(strncmpStyle) {
717        /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
718        if(s1==s2) {
719            return 0;
720        }
721
722        limit1=start1+length1;
723
724        for(;;) {
725            /* both lengths are same, check only one limit */
726            if(s1==limit1) {
727                return 0;
728            }
729
730            c1=*s1;
731            c2=*s2;
732            if(c1!=c2) {
733                break;
734            }
735            if(c1==0) {
736                return 0;
737            }
738            ++s1;
739            ++s2;
740        }
741
742        /* setup for fix-up */
743        limit2=start2+length1; /* use length1 here, too, to enforce assumption */
744    } else {
745        /* memcmp/UnicodeString style, both length-specified */
746        int32_t lengthResult;
747
748        if(length1<0) {
749            length1=u_strlen(s1);
750        }
751        if(length2<0) {
752            length2=u_strlen(s2);
753        }
754
755        /* limit1=start1+min(lenght1, length2) */
756        if(length1<length2) {
757            lengthResult=-1;
758            limit1=start1+length1;
759        } else if(length1==length2) {
760            lengthResult=0;
761            limit1=start1+length1;
762        } else /* length1>length2 */ {
763            lengthResult=1;
764            limit1=start1+length2;
765        }
766
767        if(s1==s2) {
768            return lengthResult;
769        }
770
771        for(;;) {
772            /* check pseudo-limit */
773            if(s1==limit1) {
774                return lengthResult;
775            }
776
777            c1=*s1;
778            c2=*s2;
779            if(c1!=c2) {
780                break;
781            }
782            ++s1;
783            ++s2;
784        }
785
786        /* setup for fix-up */
787        limit1=start1+length1;
788        limit2=start2+length2;
789    }
790
791    /* if both values are in or above the surrogate range, fix them up */
792    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
793        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
794        if(
795            (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
796            (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
797        ) {
798            /* part of a surrogate pair, leave >=d800 */
799        } else {
800            /* BMP code point - may be surrogate code point - make <d800 */
801            c1-=0x2800;
802        }
803
804        if(
805            (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
806            (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
807        ) {
808            /* part of a surrogate pair, leave >=d800 */
809        } else {
810            /* BMP code point - may be surrogate code point - make <d800 */
811            c2-=0x2800;
812        }
813    }
814
815    /* now c1 and c2 are in the requested (code unit or code point) order */
816    return (int32_t)c1-(int32_t)c2;
817}
818
819/*
820 * Compare two strings as presented by UCharIterators.
821 * Use code unit or code point order.
822 * When the function returns, it is undefined where the iterators
823 * have stopped.
824 */
825U_CAPI int32_t U_EXPORT2
826u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
827    UChar32 c1, c2;
828
829    /* argument checking */
830    if(iter1==NULL || iter2==NULL) {
831        return 0; /* bad arguments */
832    }
833    if(iter1==iter2) {
834        return 0; /* identical iterators */
835    }
836
837    /* reset iterators to start? */
838    iter1->move(iter1, 0, UITER_START);
839    iter2->move(iter2, 0, UITER_START);
840
841    /* compare identical prefixes - they do not need to be fixed up */
842    for(;;) {
843        c1=iter1->next(iter1);
844        c2=iter2->next(iter2);
845        if(c1!=c2) {
846            break;
847        }
848        if(c1==-1) {
849            return 0;
850        }
851    }
852
853    /* if both values are in or above the surrogate range, fix them up */
854    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
855        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
856        if(
857            (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
858            (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
859        ) {
860            /* part of a surrogate pair, leave >=d800 */
861        } else {
862            /* BMP code point - may be surrogate code point - make <d800 */
863            c1-=0x2800;
864        }
865
866        if(
867            (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
868            (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
869        ) {
870            /* part of a surrogate pair, leave >=d800 */
871        } else {
872            /* BMP code point - may be surrogate code point - make <d800 */
873            c2-=0x2800;
874        }
875    }
876
877    /* now c1 and c2 are in the requested (code unit or code point) order */
878    return (int32_t)c1-(int32_t)c2;
879}
880
881#if 0
882/*
883 * u_strCompareIter() does not leave the iterators _on_ the different units.
884 * This is possible but would cost a few extra indirect function calls to back
885 * up if the last unit (c1 or c2 respectively) was >=0.
886 *
887 * Consistently leaving them _behind_ the different units is not an option
888 * because the current "unit" is the end of the string if that is reached,
889 * and in such a case the iterator does not move.
890 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
891 * of their strings. Calling previous() on each does not move them to where
892 * the comparison fails.
893 *
894 * So the simplest semantics is to not define where the iterators end up.
895 *
896 * The following fragment is part of what would need to be done for backing up.
897 */
898void fragment {
899        /* iff a surrogate is part of a surrogate pair, leave >=d800 */
900        if(c1<=0xdbff) {
901            if(!U16_IS_TRAIL(iter1->current(iter1))) {
902                /* lead surrogate code point - make <d800 */
903                c1-=0x2800;
904            }
905        } else if(c1<=0xdfff) {
906            int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
907            iter1->previous(iter1); /* ==c1 */
908            if(!U16_IS_LEAD(iter1->previous(iter1))) {
909                /* trail surrogate code point - make <d800 */
910                c1-=0x2800;
911            }
912            /* go back to behind where the difference is */
913            iter1->move(iter1, idx, UITER_ZERO);
914        } else /* 0xe000<=c1<=0xffff */ {
915            /* BMP code point - make <d800 */
916            c1-=0x2800;
917        }
918}
919#endif
920
921U_CAPI int32_t U_EXPORT2
922u_strCompare(const UChar *s1, int32_t length1,
923             const UChar *s2, int32_t length2,
924             UBool codePointOrder) {
925    /* argument checking */
926    if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
927        return 0;
928    }
929    return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
930}
931
932/* String compare in code point order - u_strcmp() compares in code unit order. */
933U_CAPI int32_t U_EXPORT2
934u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
935    return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
936}
937
938U_CAPI int32_t   U_EXPORT2
939u_strncmp(const UChar     *s1,
940     const UChar     *s2,
941     int32_t     n)
942{
943    if(n > 0) {
944        int32_t rc;
945        for(;;) {
946            rc = (int32_t)*s1 - (int32_t)*s2;
947            if(rc != 0 || *s1 == 0 || --n == 0) {
948                return rc;
949            }
950            ++s1;
951            ++s2;
952        }
953    } else {
954        return 0;
955    }
956}
957
958U_CAPI int32_t U_EXPORT2
959u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
960    return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
961}
962
963U_CAPI UChar* U_EXPORT2
964u_strcpy(UChar     *dst,
965    const UChar     *src)
966{
967    UChar *anchor = dst;            /* save a pointer to start of dst */
968
969    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
970    }
971
972    return anchor;
973}
974
975U_CAPI UChar*  U_EXPORT2
976u_strncpy(UChar     *dst,
977     const UChar     *src,
978     int32_t     n)
979{
980    UChar *anchor = dst;            /* save a pointer to start of dst */
981
982    /* copy string 2 over */
983    while(n > 0 && (*(dst++) = *(src++)) != 0) {
984        --n;
985    }
986
987    return anchor;
988}
989
990U_CAPI int32_t   U_EXPORT2
991u_strlen(const UChar *s)
992{
993#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
994    return (int32_t)uprv_wcslen(s);
995#else
996    const UChar *t = s;
997    while(*t != 0) {
998      ++t;
999    }
1000    return t - s;
1001#endif
1002}
1003
1004U_CAPI int32_t U_EXPORT2
1005u_countChar32(const UChar *s, int32_t length) {
1006    int32_t count;
1007
1008    if(s==NULL || length<-1) {
1009        return 0;
1010    }
1011
1012    count=0;
1013    if(length>=0) {
1014        while(length>0) {
1015            ++count;
1016            if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1017                s+=2;
1018                length-=2;
1019            } else {
1020                ++s;
1021                --length;
1022            }
1023        }
1024    } else /* length==-1 */ {
1025        UChar c;
1026
1027        for(;;) {
1028            if((c=*s++)==0) {
1029                break;
1030            }
1031            ++count;
1032
1033            /*
1034             * sufficient to look ahead one because of UTF-16;
1035             * safe to look ahead one because at worst that would be the terminating NUL
1036             */
1037            if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1038                ++s;
1039            }
1040        }
1041    }
1042    return count;
1043}
1044
1045U_CAPI UBool U_EXPORT2
1046u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1047
1048    if(number<0) {
1049        return TRUE;
1050    }
1051    if(s==NULL || length<-1) {
1052        return FALSE;
1053    }
1054
1055    if(length==-1) {
1056        /* s is NUL-terminated */
1057        UChar c;
1058
1059        /* count code points until they exceed */
1060        for(;;) {
1061            if((c=*s++)==0) {
1062                return FALSE;
1063            }
1064            if(number==0) {
1065                return TRUE;
1066            }
1067            if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1068                ++s;
1069            }
1070            --number;
1071        }
1072    } else {
1073        /* length>=0 known */
1074        const UChar *limit;
1075        int32_t maxSupplementary;
1076
1077        /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1078        if(((length+1)/2)>number) {
1079            return TRUE;
1080        }
1081
1082        /* check if s does not even contain enough UChars */
1083        maxSupplementary=length-number;
1084        if(maxSupplementary<=0) {
1085            return FALSE;
1086        }
1087        /* there are maxSupplementary=length-number more UChars than asked-for code points */
1088
1089        /*
1090         * count code points until they exceed and also check that there are
1091         * no more than maxSupplementary supplementary code points (UChar pairs)
1092         */
1093        limit=s+length;
1094        for(;;) {
1095            if(s==limit) {
1096                return FALSE;
1097            }
1098            if(number==0) {
1099                return TRUE;
1100            }
1101            if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1102                ++s;
1103                if(--maxSupplementary<=0) {
1104                    /* too many pairs - too few code points */
1105                    return FALSE;
1106                }
1107            }
1108            --number;
1109        }
1110    }
1111}
1112
1113U_CAPI UChar * U_EXPORT2
1114u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1115    if(count > 0) {
1116        uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR);
1117    }
1118    return dest;
1119}
1120
1121U_CAPI UChar * U_EXPORT2
1122u_memmove(UChar *dest, const UChar *src, int32_t count) {
1123    if(count > 0) {
1124        uprv_memmove(dest, src, count*U_SIZEOF_UCHAR);
1125    }
1126    return dest;
1127}
1128
1129U_CAPI UChar * U_EXPORT2
1130u_memset(UChar *dest, UChar c, int32_t count) {
1131    if(count > 0) {
1132        UChar *ptr = dest;
1133        UChar *limit = dest + count;
1134
1135        while (ptr < limit) {
1136            *(ptr++) = c;
1137        }
1138    }
1139    return dest;
1140}
1141
1142U_CAPI int32_t U_EXPORT2
1143u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1144    if(count > 0) {
1145        const UChar *limit = buf1 + count;
1146        int32_t result;
1147
1148        while (buf1 < limit) {
1149            result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1150            if (result != 0) {
1151                return result;
1152            }
1153            buf1++;
1154            buf2++;
1155        }
1156    }
1157    return 0;
1158}
1159
1160U_CAPI int32_t U_EXPORT2
1161u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
1162    return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
1163}
1164
1165/* u_unescape & support fns ------------------------------------------------- */
1166
1167/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1168static const UChar UNESCAPE_MAP[] = {
1169    /*"   0x22, 0x22 */
1170    /*'   0x27, 0x27 */
1171    /*?   0x3F, 0x3F */
1172    /*\   0x5C, 0x5C */
1173    /*a*/ 0x61, 0x07,
1174    /*b*/ 0x62, 0x08,
1175    /*e*/ 0x65, 0x1b,
1176    /*f*/ 0x66, 0x0c,
1177    /*n*/ 0x6E, 0x0a,
1178    /*r*/ 0x72, 0x0d,
1179    /*t*/ 0x74, 0x09,
1180    /*v*/ 0x76, 0x0b
1181};
1182enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) };
1183
1184/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1185static int8_t _digit8(UChar c) {
1186    if (c >= 0x0030 && c <= 0x0037) {
1187        return (int8_t)(c - 0x0030);
1188    }
1189    return -1;
1190}
1191
1192/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1193static int8_t _digit16(UChar c) {
1194    if (c >= 0x0030 && c <= 0x0039) {
1195        return (int8_t)(c - 0x0030);
1196    }
1197    if (c >= 0x0041 && c <= 0x0046) {
1198        return (int8_t)(c - (0x0041 - 10));
1199    }
1200    if (c >= 0x0061 && c <= 0x0066) {
1201        return (int8_t)(c - (0x0061 - 10));
1202    }
1203    return -1;
1204}
1205
1206/* Parse a single escape sequence.  Although this method deals in
1207 * UChars, it does not use C++ or UnicodeString.  This allows it to
1208 * be used from C contexts. */
1209U_CAPI UChar32 U_EXPORT2
1210u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1211             int32_t *offset,
1212             int32_t length,
1213             void *context) {
1214
1215    int32_t start = *offset;
1216    UChar c;
1217    UChar32 result = 0;
1218    int8_t n = 0;
1219    int8_t minDig = 0;
1220    int8_t maxDig = 0;
1221    int8_t bitsPerDigit = 4;
1222    int8_t dig;
1223    int32_t i;
1224    UBool braces = FALSE;
1225
1226    /* Check that offset is in range */
1227    if (*offset < 0 || *offset >= length) {
1228        goto err;
1229    }
1230
1231    /* Fetch first UChar after '\\' */
1232    c = charAt((*offset)++, context);
1233
1234    /* Convert hexadecimal and octal escapes */
1235    switch (c) {
1236    case 0x0075 /*'u'*/:
1237        minDig = maxDig = 4;
1238        break;
1239    case 0x0055 /*'U'*/:
1240        minDig = maxDig = 8;
1241        break;
1242    case 0x0078 /*'x'*/:
1243        minDig = 1;
1244        if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1245            ++(*offset);
1246            braces = TRUE;
1247            maxDig = 8;
1248        } else {
1249            maxDig = 2;
1250        }
1251        break;
1252    default:
1253        dig = _digit8(c);
1254        if (dig >= 0) {
1255            minDig = 1;
1256            maxDig = 3;
1257            n = 1; /* Already have first octal digit */
1258            bitsPerDigit = 3;
1259            result = dig;
1260        }
1261        break;
1262    }
1263    if (minDig != 0) {
1264        while (*offset < length && n < maxDig) {
1265            c = charAt(*offset, context);
1266            dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1267            if (dig < 0) {
1268                break;
1269            }
1270            result = (result << bitsPerDigit) | dig;
1271            ++(*offset);
1272            ++n;
1273        }
1274        if (n < minDig) {
1275            goto err;
1276        }
1277        if (braces) {
1278            if (c != 0x7D /*}*/) {
1279                goto err;
1280            }
1281            ++(*offset);
1282        }
1283        if (result < 0 || result >= 0x110000) {
1284            goto err;
1285        }
1286        /* If an escape sequence specifies a lead surrogate, see if
1287         * there is a trail surrogate after it, either as an escape or
1288         * as a literal.  If so, join them up into a supplementary.
1289         */
1290        if (*offset < length && U16_IS_LEAD(result)) {
1291            int32_t ahead = *offset + 1;
1292            c = charAt(*offset, context);
1293            if (c == 0x5C /*'\\'*/ && ahead < length) {
1294                c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1295            }
1296            if (U16_IS_TRAIL(c)) {
1297                *offset = ahead;
1298                result = U16_GET_SUPPLEMENTARY(result, c);
1299            }
1300        }
1301        return result;
1302    }
1303
1304    /* Convert C-style escapes in table */
1305    for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1306        if (c == UNESCAPE_MAP[i]) {
1307            return UNESCAPE_MAP[i+1];
1308        } else if (c < UNESCAPE_MAP[i]) {
1309            break;
1310        }
1311    }
1312
1313    /* Map \cX to control-X: X & 0x1F */
1314    if (c == 0x0063 /*'c'*/ && *offset < length) {
1315        c = charAt((*offset)++, context);
1316        if (U16_IS_LEAD(c) && *offset < length) {
1317            UChar c2 = charAt(*offset, context);
1318            if (U16_IS_TRAIL(c2)) {
1319                ++(*offset);
1320                c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
1321            }
1322        }
1323        return 0x1F & c;
1324    }
1325
1326    /* If no special forms are recognized, then consider
1327     * the backslash to generically escape the next character.
1328     * Deal with surrogate pairs. */
1329    if (U16_IS_LEAD(c) && *offset < length) {
1330        UChar c2 = charAt(*offset, context);
1331        if (U16_IS_TRAIL(c2)) {
1332            ++(*offset);
1333            return U16_GET_SUPPLEMENTARY(c, c2);
1334        }
1335    }
1336    return c;
1337
1338 err:
1339    /* Invalid escape sequence */
1340    *offset = start; /* Reset to initial value */
1341    return (UChar32)0xFFFFFFFF;
1342}
1343
1344/* u_unescapeAt() callback to return a UChar from a char* */
1345static UChar U_CALLCONV
1346_charPtr_charAt(int32_t offset, void *context) {
1347    UChar c16;
1348    /* It would be more efficient to access the invariant tables
1349     * directly but there is no API for that. */
1350    u_charsToUChars(((char*) context) + offset, &c16, 1);
1351    return c16;
1352}
1353
1354/* Append an escape-free segment of the text; used by u_unescape() */
1355static void _appendUChars(UChar *dest, int32_t destCapacity,
1356                          const char *src, int32_t srcLen) {
1357    if (destCapacity < 0) {
1358        destCapacity = 0;
1359    }
1360    if (srcLen > destCapacity) {
1361        srcLen = destCapacity;
1362    }
1363    u_charsToUChars(src, dest, srcLen);
1364}
1365
1366/* Do an invariant conversion of char* -> UChar*, with escape parsing */
1367U_CAPI int32_t U_EXPORT2
1368u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1369    const char *segment = src;
1370    int32_t i = 0;
1371    char c;
1372
1373    while ((c=*src) != 0) {
1374        /* '\\' intentionally written as compiler-specific
1375         * character constant to correspond to compiler-specific
1376         * char* constants. */
1377        if (c == '\\') {
1378            int32_t lenParsed = 0;
1379            UChar32 c32;
1380            if (src != segment) {
1381                if (dest != NULL) {
1382                    _appendUChars(dest + i, destCapacity - i,
1383                                  segment, (int32_t)(src - segment));
1384                }
1385                i += (int32_t)(src - segment);
1386            }
1387            ++src; /* advance past '\\' */
1388            c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1389            if (lenParsed == 0) {
1390                goto err;
1391            }
1392            src += lenParsed; /* advance past escape seq. */
1393            if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
1394                U16_APPEND_UNSAFE(dest, i, c32);
1395            } else {
1396                i += U16_LENGTH(c32);
1397            }
1398            segment = src;
1399        } else {
1400            ++src;
1401        }
1402    }
1403    if (src != segment) {
1404        if (dest != NULL) {
1405            _appendUChars(dest + i, destCapacity - i,
1406                          segment, (int32_t)(src - segment));
1407        }
1408        i += (int32_t)(src - segment);
1409    }
1410    if (dest != NULL && i < destCapacity) {
1411        dest[i] = 0;
1412    }
1413    return i;
1414
1415 err:
1416    if (dest != NULL && destCapacity > 0) {
1417        *dest = 0;
1418    }
1419    return 0;
1420}
1421
1422/* NUL-termination of strings ----------------------------------------------- */
1423
1424/**
1425 * NUL-terminate a string no matter what its type.
1426 * Set warning and error codes accordingly.
1427 */
1428#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
1429    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {                    \
1430        /* not a public function, so no complete argument checking */   \
1431                                                                        \
1432        if(length<0) {                                                  \
1433            /* assume that the caller handles this */                   \
1434        } else if(length<destCapacity) {                                \
1435            /* NUL-terminate the string, the NUL fits */                \
1436            dest[length]=0;                                             \
1437            /* unset the not-terminated warning but leave all others */ \
1438            if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {          \
1439                *pErrorCode=U_ZERO_ERROR;                               \
1440            }                                                           \
1441        } else if(length==destCapacity) {                               \
1442            /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1443            *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;                \
1444        } else /* length>destCapacity */ {                              \
1445            /* even the string itself did not fit - set an error code */ \
1446            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;                        \
1447        }                                                               \
1448    }
1449
1450U_CAPI int32_t U_EXPORT2
1451u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1452    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1453    return length;
1454}
1455
1456U_CAPI int32_t U_EXPORT2
1457u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1458    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1459    return length;
1460}
1461
1462U_CAPI int32_t U_EXPORT2
1463u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1464    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1465    return length;
1466}
1467
1468U_CAPI int32_t U_EXPORT2
1469u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1470    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1471    return length;
1472}
1473
1474// Compute the hash code for a string -------------------------------------- ***
1475
1476// Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1477// on UHashtable code.
1478
1479/*
1480  Compute the hash by iterating sparsely over about 32 (up to 63)
1481  characters spaced evenly through the string.  For each character,
1482  multiply the previous hash value by a prime number and add the new
1483  character in, like a linear congruential random number generator,
1484  producing a pseudorandom deterministic value well distributed over
1485  the output range. [LIU]
1486*/
1487
1488#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
1489    int32_t hash = 0;                         \
1490    const TYPE *p = (const TYPE*) STR;        \
1491    if (p != NULL) {                          \
1492        int32_t len = (int32_t)(STRLEN);      \
1493        int32_t inc = ((len - 32) / 32) + 1;  \
1494        const TYPE *limit = p + len;          \
1495        while (p<limit) {                     \
1496            hash = (hash * 37) + DEREF;       \
1497            p += inc;                         \
1498        }                                     \
1499    }                                         \
1500    return hash
1501
1502/* Used by UnicodeString to compute its hashcode - Not public API. */
1503U_CAPI int32_t U_EXPORT2
1504ustr_hashUCharsN(const UChar *str, int32_t length) {
1505    STRING_HASH(UChar, str, length, *p);
1506}
1507
1508U_CAPI int32_t U_EXPORT2
1509ustr_hashCharsN(const char *str, int32_t length) {
1510    STRING_HASH(uint8_t, str, length, *p);
1511}
1512
1513U_CAPI int32_t U_EXPORT2
1514ustr_hashICharsN(const char *str, int32_t length) {
1515    STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
1516}
1517