1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  unormcmp.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004sep13
14*   created by: Markus W. Scherer
15*
16*   unorm_compare() function moved here from unorm.cpp for better modularization.
17*   Depends on both normalization and case folding.
18*   Allows unorm.cpp to not depend on any character properties code.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_NORMALIZATION
24
25#include "unicode/unorm.h"
26#include "unicode/ustring.h"
27#include "cmemory.h"
28#include "normalizer2impl.h"
29#include "ucase.h"
30#include "uprops.h"
31#include "ustr_imp.h"
32
33U_NAMESPACE_USE
34
35#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
36
37/* compare canonically equivalent ------------------------------------------- */
38
39/*
40 * Compare two strings for canonical equivalence.
41 * Further options include case-insensitive comparison and
42 * code point order (as opposed to code unit order).
43 *
44 * In this function, canonical equivalence is optional as well.
45 * If canonical equivalence is tested, then both strings must fulfill
46 * the FCD check.
47 *
48 * Semantically, this is equivalent to
49 *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
50 * where code point order, NFD and foldCase are all optional.
51 *
52 * String comparisons almost always yield results before processing both strings
53 * completely.
54 * They are generally more efficient working incrementally instead of
55 * performing the sub-processing (strlen, normalization, case-folding)
56 * on the entire strings first.
57 *
58 * It is also unnecessary to not normalize identical characters.
59 *
60 * This function works in principle as follows:
61 *
62 * loop {
63 *   get one code unit c1 from s1 (-1 if end of source)
64 *   get one code unit c2 from s2 (-1 if end of source)
65 *
66 *   if(either string finished) {
67 *     return result;
68 *   }
69 *   if(c1==c2) {
70 *     continue;
71 *   }
72 *
73 *   // c1!=c2
74 *   try to decompose/case-fold c1/c2, and continue if one does;
75 *
76 *   // still c1!=c2 and neither decomposes/case-folds, return result
77 *   return c1-c2;
78 * }
79 *
80 * When a character decomposes, then the pointer for that source changes to
81 * the decomposition, pushing the previous pointer onto a stack.
82 * When the end of the decomposition is reached, then the code unit reader
83 * pops the previous source from the stack.
84 * (Same for case-folding.)
85 *
86 * This is complicated further by operating on variable-width UTF-16.
87 * The top part of the loop works on code units, while lookups for decomposition
88 * and case-folding need code points.
89 * Code points are assembled after the equality/end-of-source part.
90 * The source pointer is only advanced beyond all code units when the code point
91 * actually decomposes/case-folds.
92 *
93 * If we were on a trail surrogate unit when assembling a code point,
94 * and the code point decomposes/case-folds, then the decomposition/folding
95 * result must be compared with the part of the other string that corresponds to
96 * this string's lead surrogate.
97 * Since we only assemble a code point when hitting a trail unit when the
98 * preceding lead units were identical, we back up the other string by one unit
99 * in such a case.
100 *
101 * The optional code point order comparison at the end works with
102 * the same fix-up as the other code point order comparison functions.
103 * See ustring.c and the comment near the end of this function.
104 *
105 * Assumption: A decomposition or case-folding result string never contains
106 * a single surrogate. This is a safe assumption in the Unicode Standard.
107 * Therefore, we do not need to check for surrogate pairs across
108 * decomposition/case-folding boundaries.
109 *
110 * Further assumptions (see verifications tstnorm.cpp):
111 * The API function checks for FCD first, while the core function
112 * first case-folds and then decomposes. This requires that case-folding does not
113 * un-FCD any strings.
114 *
115 * The API function may also NFD the input and turn off decomposition.
116 * This requires that case-folding does not un-NFD strings either.
117 *
118 * TODO If any of the above two assumptions is violated,
119 * then this entire code must be re-thought.
120 * If this happens, then a simple solution is to case-fold both strings up front
121 * and to turn off UNORM_INPUT_IS_FCD.
122 * We already do this when not both strings are in FCD because makeFCD
123 * would be a partial NFD before the case folding, which does not work.
124 * Note that all of this is only a problem when case-folding _and_
125 * canonical equivalence come together.
126 * (Comments in unorm_compare() are more up to date than this TODO.)
127 */
128
129/* stack element for previous-level source/decomposition pointers */
130struct CmpEquivLevel {
131    const UChar *start, *s, *limit;
132};
133typedef struct CmpEquivLevel CmpEquivLevel;
134
135/**
136 * Internal option for unorm_cmpEquivFold() for decomposing.
137 * If not set, just do strcasecmp().
138 */
139#define _COMPARE_EQUIV 0x80000
140
141/* internal function */
142static int32_t
143unorm_cmpEquivFold(const UChar *s1, int32_t length1,
144                   const UChar *s2, int32_t length2,
145                   uint32_t options,
146                   UErrorCode *pErrorCode) {
147    const Normalizer2Impl *nfcImpl;
148    const UCaseProps *csp;
149
150    /* current-level start/limit - s1/s2 as current */
151    const UChar *start1, *start2, *limit1, *limit2;
152
153    /* decomposition and case folding variables */
154    const UChar *p;
155    int32_t length;
156
157    /* stacks of previous-level start/current/limit */
158    CmpEquivLevel stack1[2], stack2[2];
159
160    /* buffers for algorithmic decompositions */
161    UChar decomp1[4], decomp2[4];
162
163    /* case folding buffers, only use current-level start/limit */
164    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
165
166    /* track which is the current level per string */
167    int32_t level1, level2;
168
169    /* current code units, and code points for lookups */
170    UChar32 c1, c2, cp1, cp2;
171
172    /* no argument error checking because this itself is not an API */
173
174    /*
175     * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
176     * otherwise this function must behave exactly as uprv_strCompare()
177     * not checking for that here makes testing this function easier
178     */
179
180    /* normalization/properties data loaded? */
181    if((options&_COMPARE_EQUIV)!=0) {
182        nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
183    } else {
184        nfcImpl=NULL;
185    }
186    if((options&U_COMPARE_IGNORE_CASE)!=0) {
187        csp=ucase_getSingleton();
188    } else {
189        csp=NULL;
190    }
191    if(U_FAILURE(*pErrorCode)) {
192        return 0;
193    }
194
195    /* initialize */
196    start1=s1;
197    if(length1==-1) {
198        limit1=NULL;
199    } else {
200        limit1=s1+length1;
201    }
202
203    start2=s2;
204    if(length2==-1) {
205        limit2=NULL;
206    } else {
207        limit2=s2+length2;
208    }
209
210    level1=level2=0;
211    c1=c2=-1;
212
213    /* comparison loop */
214    for(;;) {
215        /*
216         * here a code unit value of -1 means "get another code unit"
217         * below it will mean "this source is finished"
218         */
219
220        if(c1<0) {
221            /* get next code unit from string 1, post-increment */
222            for(;;) {
223                if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
224                    if(level1==0) {
225                        c1=-1;
226                        break;
227                    }
228                } else {
229                    ++s1;
230                    break;
231                }
232
233                /* reached end of level buffer, pop one level */
234                do {
235                    --level1;
236                    start1=stack1[level1].start;    /*Not uninitialized*/
237                } while(start1==NULL);
238                s1=stack1[level1].s;                /*Not uninitialized*/
239                limit1=stack1[level1].limit;        /*Not uninitialized*/
240            }
241        }
242
243        if(c2<0) {
244            /* get next code unit from string 2, post-increment */
245            for(;;) {
246                if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
247                    if(level2==0) {
248                        c2=-1;
249                        break;
250                    }
251                } else {
252                    ++s2;
253                    break;
254                }
255
256                /* reached end of level buffer, pop one level */
257                do {
258                    --level2;
259                    start2=stack2[level2].start;    /*Not uninitialized*/
260                } while(start2==NULL);
261                s2=stack2[level2].s;                /*Not uninitialized*/
262                limit2=stack2[level2].limit;        /*Not uninitialized*/
263            }
264        }
265
266        /*
267         * compare c1 and c2
268         * either variable c1, c2 is -1 only if the corresponding string is finished
269         */
270        if(c1==c2) {
271            if(c1<0) {
272                return 0;   /* c1==c2==-1 indicating end of strings */
273            }
274            c1=c2=-1;       /* make us fetch new code units */
275            continue;
276        } else if(c1<0) {
277            return -1;      /* string 1 ends before string 2 */
278        } else if(c2<0) {
279            return 1;       /* string 2 ends before string 1 */
280        }
281        /* c1!=c2 && c1>=0 && c2>=0 */
282
283        /* get complete code points for c1, c2 for lookups if either is a surrogate */
284        cp1=c1;
285        if(U_IS_SURROGATE(c1)) {
286            UChar c;
287
288            if(U_IS_SURROGATE_LEAD(c1)) {
289                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
290                    /* advance ++s1; only below if cp1 decomposes/case-folds */
291                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
292                }
293            } else /* isTrail(c1) */ {
294                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
295                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
296                }
297            }
298        }
299
300        cp2=c2;
301        if(U_IS_SURROGATE(c2)) {
302            UChar c;
303
304            if(U_IS_SURROGATE_LEAD(c2)) {
305                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
306                    /* advance ++s2; only below if cp2 decomposes/case-folds */
307                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
308                }
309            } else /* isTrail(c2) */ {
310                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
311                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
312                }
313            }
314        }
315
316        /*
317         * go down one level for each string
318         * continue with the main loop as soon as there is a real change
319         */
320
321        if( level1==0 && (options&U_COMPARE_IGNORE_CASE) &&
322            (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
323        ) {
324            /* cp1 case-folds to the code point "length" or to p[length] */
325            if(U_IS_SURROGATE(c1)) {
326                if(U_IS_SURROGATE_LEAD(c1)) {
327                    /* advance beyond source surrogate pair if it case-folds */
328                    ++s1;
329                } else /* isTrail(c1) */ {
330                    /*
331                     * we got a supplementary code point when hitting its trail surrogate,
332                     * therefore the lead surrogate must have been the same as in the other string;
333                     * compare this decomposition with the lead surrogate in the other string
334                     * remember that this simulates bulk text replacement:
335                     * the decomposition would replace the entire code point
336                     */
337                    --s2;
338                    c2=*(s2-1);
339                }
340            }
341
342            /* push current level pointers */
343            stack1[0].start=start1;
344            stack1[0].s=s1;
345            stack1[0].limit=limit1;
346            ++level1;
347
348            /* copy the folding result to fold1[] */
349            if(length<=UCASE_MAX_STRING_LENGTH) {
350                u_memcpy(fold1, p, length);
351            } else {
352                int32_t i=0;
353                U16_APPEND_UNSAFE(fold1, i, length);
354                length=i;
355            }
356
357            /* set next level pointers to case folding */
358            start1=s1=fold1;
359            limit1=fold1+length;
360
361            /* get ready to read from decomposition, continue with loop */
362            c1=-1;
363            continue;
364        }
365
366        if( level2==0 && (options&U_COMPARE_IGNORE_CASE) &&
367            (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
368        ) {
369            /* cp2 case-folds to the code point "length" or to p[length] */
370            if(U_IS_SURROGATE(c2)) {
371                if(U_IS_SURROGATE_LEAD(c2)) {
372                    /* advance beyond source surrogate pair if it case-folds */
373                    ++s2;
374                } else /* isTrail(c2) */ {
375                    /*
376                     * we got a supplementary code point when hitting its trail surrogate,
377                     * therefore the lead surrogate must have been the same as in the other string;
378                     * compare this decomposition with the lead surrogate in the other string
379                     * remember that this simulates bulk text replacement:
380                     * the decomposition would replace the entire code point
381                     */
382                    --s1;
383                    c1=*(s1-1);
384                }
385            }
386
387            /* push current level pointers */
388            stack2[0].start=start2;
389            stack2[0].s=s2;
390            stack2[0].limit=limit2;
391            ++level2;
392
393            /* copy the folding result to fold2[] */
394            if(length<=UCASE_MAX_STRING_LENGTH) {
395                u_memcpy(fold2, p, length);
396            } else {
397                int32_t i=0;
398                U16_APPEND_UNSAFE(fold2, i, length);
399                length=i;
400            }
401
402            /* set next level pointers to case folding */
403            start2=s2=fold2;
404            limit2=fold2+length;
405
406            /* get ready to read from decomposition, continue with loop */
407            c2=-1;
408            continue;
409        }
410
411        if( level1<2 && (options&_COMPARE_EQUIV) &&
412            0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
413        ) {
414            /* cp1 decomposes into p[length] */
415            if(U_IS_SURROGATE(c1)) {
416                if(U_IS_SURROGATE_LEAD(c1)) {
417                    /* advance beyond source surrogate pair if it decomposes */
418                    ++s1;
419                } else /* isTrail(c1) */ {
420                    /*
421                     * we got a supplementary code point when hitting its trail surrogate,
422                     * therefore the lead surrogate must have been the same as in the other string;
423                     * compare this decomposition with the lead surrogate in the other string
424                     * remember that this simulates bulk text replacement:
425                     * the decomposition would replace the entire code point
426                     */
427                    --s2;
428                    c2=*(s2-1);
429                }
430            }
431
432            /* push current level pointers */
433            stack1[level1].start=start1;
434            stack1[level1].s=s1;
435            stack1[level1].limit=limit1;
436            ++level1;
437
438            /* set empty intermediate level if skipped */
439            if(level1<2) {
440                stack1[level1++].start=NULL;
441            }
442
443            /* set next level pointers to decomposition */
444            start1=s1=p;
445            limit1=p+length;
446
447            /* get ready to read from decomposition, continue with loop */
448            c1=-1;
449            continue;
450        }
451
452        if( level2<2 && (options&_COMPARE_EQUIV) &&
453            0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
454        ) {
455            /* cp2 decomposes into p[length] */
456            if(U_IS_SURROGATE(c2)) {
457                if(U_IS_SURROGATE_LEAD(c2)) {
458                    /* advance beyond source surrogate pair if it decomposes */
459                    ++s2;
460                } else /* isTrail(c2) */ {
461                    /*
462                     * we got a supplementary code point when hitting its trail surrogate,
463                     * therefore the lead surrogate must have been the same as in the other string;
464                     * compare this decomposition with the lead surrogate in the other string
465                     * remember that this simulates bulk text replacement:
466                     * the decomposition would replace the entire code point
467                     */
468                    --s1;
469                    c1=*(s1-1);
470                }
471            }
472
473            /* push current level pointers */
474            stack2[level2].start=start2;
475            stack2[level2].s=s2;
476            stack2[level2].limit=limit2;
477            ++level2;
478
479            /* set empty intermediate level if skipped */
480            if(level2<2) {
481                stack2[level2++].start=NULL;
482            }
483
484            /* set next level pointers to decomposition */
485            start2=s2=p;
486            limit2=p+length;
487
488            /* get ready to read from decomposition, continue with loop */
489            c2=-1;
490            continue;
491        }
492
493        /*
494         * no decomposition/case folding, max level for both sides:
495         * return difference result
496         *
497         * code point order comparison must not just return cp1-cp2
498         * because when single surrogates are present then the surrogate pairs
499         * that formed cp1 and cp2 may be from different string indexes
500         *
501         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
502         * c1=d800 cp1=10001 c2=dc00 cp2=10000
503         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
504         *
505         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
506         * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
507         * so we have slightly different pointer/start/limit comparisons here
508         */
509
510        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
511            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
512            if(
513                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
514                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
515            ) {
516                /* part of a surrogate pair, leave >=d800 */
517            } else {
518                /* BMP code point - may be surrogate code point - make <d800 */
519                c1-=0x2800;
520            }
521
522            if(
523                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
524                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
525            ) {
526                /* part of a surrogate pair, leave >=d800 */
527            } else {
528                /* BMP code point - may be surrogate code point - make <d800 */
529                c2-=0x2800;
530            }
531        }
532
533        return c1-c2;
534    }
535}
536
537static
538UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length,
539                UnicodeString &normalized, UErrorCode *pErrorCode) {
540    UnicodeString str(length<0, s, length);
541
542    // check if s fulfill the conditions
543    int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode);
544    if (U_FAILURE(*pErrorCode)) {
545        return FALSE;
546    }
547    /*
548     * ICU 2.4 had a further optimization:
549     * If both strings were not in FCD, then they were both NFD'ed,
550     * and the _COMPARE_EQUIV option was turned off.
551     * It is not entirely clear that this is valid with the current
552     * definition of the canonical caseless match.
553     * Therefore, ICU 2.6 removes that optimization.
554     */
555    if(spanQCYes<str.length()) {
556        UnicodeString unnormalized=str.tempSubString(spanQCYes);
557        normalized.setTo(FALSE, str.getBuffer(), spanQCYes);
558        n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode);
559        if (U_SUCCESS(*pErrorCode)) {
560            return TRUE;
561        }
562    }
563    return FALSE;
564}
565
566U_CAPI int32_t U_EXPORT2
567unorm_compare(const UChar *s1, int32_t length1,
568              const UChar *s2, int32_t length2,
569              uint32_t options,
570              UErrorCode *pErrorCode) {
571    /* argument checking */
572    if(U_FAILURE(*pErrorCode)) {
573        return 0;
574    }
575    if(s1==0 || length1<-1 || s2==0 || length2<-1) {
576        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
577        return 0;
578    }
579
580    UnicodeString fcd1, fcd2;
581    int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
582    options|=_COMPARE_EQUIV;
583
584    /*
585     * UAX #21 Case Mappings, as fixed for Unicode version 4
586     * (see Jitterbug 2021), defines a canonical caseless match as
587     *
588     * A string X is a canonical caseless match
589     * for a string Y if and only if
590     * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
591     *
592     * For better performance, we check for FCD (or let the caller tell us that
593     * both strings are in FCD) for the inner normalization.
594     * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
595     * case-folding preserves the FCD-ness of a string.
596     * The outer normalization is then only performed by unorm_cmpEquivFold()
597     * when there is a difference.
598     *
599     * Exception: When using the Turkic case-folding option, we do perform
600     * full NFD first. This is because in the Turkic case precomposed characters
601     * with 0049 capital I or 0069 small i fold differently whether they
602     * are first decomposed or not, so an FCD check - a check only for
603     * canonical order - is not sufficient.
604     */
605    if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) {
606        const Normalizer2 *n2;
607        if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
608            n2=Normalizer2Factory::getNFDInstance(*pErrorCode);
609        } else {
610            n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
611        }
612        if (U_FAILURE(*pErrorCode)) {
613            return 0;
614        }
615
616        if(normOptions&UNORM_UNICODE_3_2) {
617            const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
618            FilteredNormalizer2 fn2(*n2, *uni32);
619            if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) {
620                s1=fcd1.getBuffer();
621                length1=fcd1.length();
622            }
623            if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) {
624                s2=fcd2.getBuffer();
625                length2=fcd2.length();
626            }
627        } else {
628            if(_normalize(n2, s1, length1, fcd1, pErrorCode)) {
629                s1=fcd1.getBuffer();
630                length1=fcd1.length();
631            }
632            if(_normalize(n2, s2, length2, fcd2, pErrorCode)) {
633                s2=fcd2.getBuffer();
634                length2=fcd2.length();
635            }
636        }
637    }
638
639    if(U_SUCCESS(*pErrorCode)) {
640        return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
641    } else {
642        return 0;
643    }
644}
645
646#endif /* #if !UCONFIG_NO_NORMALIZATION */
647