1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucase.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004aug30
14*   created by: Markus W. Scherer
15*
16*   Low-level Unicode character/string case mapping code.
17*   Much code moved here (and modified) from uchar.c.
18*/
19
20#include "unicode/utypes.h"
21#include "unicode/unistr.h"
22#include "unicode/uset.h"
23#include "unicode/udata.h" /* UDataInfo */
24#include "unicode/utf16.h"
25#include "ucmndata.h" /* DataHeader */
26#include "udatamem.h"
27#include "umutex.h"
28#include "uassert.h"
29#include "cmemory.h"
30#include "utrie2.h"
31#include "ucase.h"
32#include "ucln_cmn.h"
33
34struct UCaseProps {
35    UDataMemory *mem;
36    const int32_t *indexes;
37    const uint16_t *exceptions;
38    const uint16_t *unfold;
39
40    UTrie2 trie;
41    uint8_t formatVersion[4];
42};
43
44/* ucase_props_data.h is machine-generated by gencase --csource */
45#define INCLUDED_FROM_UCASE_CPP
46#include "ucase_props_data.h"
47
48/* UCaseProps singleton ----------------------------------------------------- */
49
50U_CAPI const UCaseProps * U_EXPORT2
51ucase_getSingleton() {
52    return &ucase_props_singleton;
53}
54
55/* set of property starts for UnicodeSet ------------------------------------ */
56
57static UBool U_CALLCONV
58_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
59    /* add the start code point to the USet */
60    const USetAdder *sa=(const USetAdder *)context;
61    sa->add(sa->set, start);
62    return TRUE;
63}
64
65U_CFUNC void U_EXPORT2
66ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
67    if(U_FAILURE(*pErrorCode)) {
68        return;
69    }
70
71    /* add the start code point of each same-value range of the trie */
72    utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
73
74    /* add code points with hardcoded properties, plus the ones following them */
75
76    /* (none right now, see comment below) */
77
78    /*
79     * Omit code points with hardcoded specialcasing properties
80     * because we do not build property UnicodeSets for them right now.
81     */
82}
83
84/* data access primitives --------------------------------------------------- */
85
/* Address of the exceptions word that a properties word with the exception bit refers to. */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* Non-zero if the properties word carries the exception bit. */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)

/*
 * flagsOffset[b] is the number of 1 bits in the 8-bit value b.
 *
 * The 256 entries are generated instead of being spelled out:
 * UCASE_BITS2(n) yields the bit counts of the four 2-bit values, each
 * increased by n; UCASE_BITS4 and UCASE_BITS6 build the 4-bit and 6-bit
 * groups from that in the same way. The helper macros are removed again
 * right after the table.
 */
#define UCASE_BITS2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS4(n) UCASE_BITS2(n), UCASE_BITS2((n)+1), UCASE_BITS2((n)+1), UCASE_BITS2((n)+2)
#define UCASE_BITS6(n) UCASE_BITS4(n), UCASE_BITS4((n)+1), UCASE_BITS4((n)+1), UCASE_BITS4((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_BITS6(0), UCASE_BITS6(1), UCASE_BITS6(1), UCASE_BITS6(2)
};
#undef UCASE_BITS6
#undef UCASE_BITS4
#undef UCASE_BITS2
109
/* Non-zero if the exceptions word has the optional slot number idx. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of slots that precede slot idx: the count of flag bits below bit idx. */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * The expansion is wrapped in do { } while(0) so that the macro behaves like
 * one statement: it must be followed by a semicolon and is safe as the
 * unbraced body of an if/else. pExc16 and value are evaluated more than
 * once, so pass plain lvalues without side effects.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output; with UCASE_EXC_DOUBLE_SLOTS
 *              it is assembled from two uint16_t, high half first
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
131
132/* simple case mappings ----------------------------------------------------- */
133
134U_CAPI UChar32 U_EXPORT2
135ucase_tolower(const UCaseProps *csp, UChar32 c) {
136    uint16_t props=UTRIE2_GET16(&csp->trie, c);
137    if(!PROPS_HAS_EXCEPTION(props)) {
138        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
139            c+=UCASE_GET_DELTA(props);
140        }
141    } else {
142        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
143        uint16_t excWord=*pe++;
144        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
145            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
146        }
147    }
148    return c;
149}
150
151U_CAPI UChar32 U_EXPORT2
152ucase_toupper(const UCaseProps *csp, UChar32 c) {
153    uint16_t props=UTRIE2_GET16(&csp->trie, c);
154    if(!PROPS_HAS_EXCEPTION(props)) {
155        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
156            c+=UCASE_GET_DELTA(props);
157        }
158    } else {
159        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
160        uint16_t excWord=*pe++;
161        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
162            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
163        }
164    }
165    return c;
166}
167
168U_CAPI UChar32 U_EXPORT2
169ucase_totitle(const UCaseProps *csp, UChar32 c) {
170    uint16_t props=UTRIE2_GET16(&csp->trie, c);
171    if(!PROPS_HAS_EXCEPTION(props)) {
172        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
173            c+=UCASE_GET_DELTA(props);
174        }
175    } else {
176        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
177        uint16_t excWord=*pe++;
178        int32_t idx;
179        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
180            idx=UCASE_EXC_TITLE;
181        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
182            idx=UCASE_EXC_UPPER;
183        } else {
184            return c;
185        }
186        GET_SLOT_VALUE(excWord, idx, pe, c);
187    }
188    return c;
189}
190
191static const UChar iDot[2] = { 0x69, 0x307 };
192static const UChar jDot[2] = { 0x6a, 0x307 };
193static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
194static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
195static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
196static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
197
198
199U_CFUNC void U_EXPORT2
200ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
201    uint16_t props;
202
203    /*
204     * Hardcode the case closure of i and its relatives and ignore the
205     * data file data for these characters.
206     * The Turkic dotless i and dotted I with their case mapping conditions
207     * and case folding option make the related characters behave specially.
208     * This code matches their closure behavior to their case folding behavior.
209     */
210
211    switch(c) {
212    case 0x49:
213        /* regular i and I are in one equivalence class */
214        sa->add(sa->set, 0x69);
215        return;
216    case 0x69:
217        sa->add(sa->set, 0x49);
218        return;
219    case 0x130:
220        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
221        sa->addString(sa->set, iDot, 2);
222        return;
223    case 0x131:
224        /* dotless i is in a class by itself */
225        return;
226    default:
227        /* otherwise use the data file data */
228        break;
229    }
230
231    props=UTRIE2_GET16(&csp->trie, c);
232    if(!PROPS_HAS_EXCEPTION(props)) {
233        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
234            /* add the one simple case mapping, no matter what type it is */
235            int32_t delta=UCASE_GET_DELTA(props);
236            if(delta!=0) {
237                sa->add(sa->set, c+delta);
238            }
239        }
240    } else {
241        /*
242         * c has exceptions, so there may be multiple simple and/or
243         * full case mappings. Add them all.
244         */
245        const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
246        const UChar *closure;
247        uint16_t excWord=*pe++;
248        int32_t idx, closureLength, fullLength, length;
249
250        pe0=pe;
251
252        /* add all simple case mappings */
253        for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
254            if(HAS_SLOT(excWord, idx)) {
255                pe=pe0;
256                GET_SLOT_VALUE(excWord, idx, pe, c);
257                sa->add(sa->set, c);
258            }
259        }
260
261        /* get the closure string pointer & length */
262        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
263            pe=pe0;
264            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
265            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
266            closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
267        } else {
268            closureLength=0;
269            closure=NULL;
270        }
271
272        /* add the full case folding */
273        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
274            pe=pe0;
275            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
276
277            /* start of full case mapping strings */
278            ++pe;
279
280            fullLength&=0xffff; /* bits 16 and higher are reserved */
281
282            /* skip the lowercase result string */
283            pe+=fullLength&UCASE_FULL_LOWER;
284            fullLength>>=4;
285
286            /* add the full case folding string */
287            length=fullLength&0xf;
288            if(length!=0) {
289                sa->addString(sa->set, (const UChar *)pe, length);
290                pe+=length;
291            }
292
293            /* skip the uppercase and titlecase strings */
294            fullLength>>=4;
295            pe+=fullLength&0xf;
296            fullLength>>=4;
297            pe+=fullLength;
298
299            closure=(const UChar *)pe; /* behind full case mappings */
300        }
301
302        /* add each code point in the closure string */
303        for(idx=0; idx<closureLength;) {
304            U16_NEXT_UNSAFE(closure, idx, c);
305            sa->add(sa->set, c);
306        }
307    }
308}
309
310/*
311 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
312 * must be length>0 and max>0 and length<=max
313 */
314static inline int32_t
315strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
316    int32_t c1, c2;
317
318    max-=length; /* we require length<=max, so no need to decrement max in the loop */
319    do {
320        c1=*s++;
321        c2=*t++;
322        if(c2==0) {
323            return 1; /* reached the end of t but not of s */
324        }
325        c1-=c2;
326        if(c1!=0) {
327            return c1; /* return difference result */
328        }
329    } while(--length>0);
330    /* ends with length==0 */
331
332    if(max==0 || *t==0) {
333        return 0; /* equal to length of both strings */
334    } else {
335        return -max; /* return lengh difference */
336    }
337}
338
339U_CFUNC UBool U_EXPORT2
340ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
341    int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
342
343    if(csp->unfold==NULL || s==NULL) {
344        return FALSE; /* no reverse case folding data, or no string */
345    }
346    if(length<=1) {
347        /* the string is too short to find any match */
348        /*
349         * more precise would be:
350         * if(!u_strHasMoreChar32Than(s, length, 1))
351         * but this does not make much practical difference because
352         * a single supplementary code point would just not be found
353         */
354        return FALSE;
355    }
356
357    const uint16_t *unfold=csp->unfold;
358    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
359    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
360    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
361    unfold+=unfoldRowWidth;
362
363    if(length>unfoldStringWidth) {
364        /* the string is too long to find any match */
365        return FALSE;
366    }
367
368    /* do a binary search for the string */
369    start=0;
370    limit=unfoldRows;
371    while(start<limit) {
372        i=(start+limit)/2;
373        const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
374        result=strcmpMax(s, length, p, unfoldStringWidth);
375
376        if(result==0) {
377            /* found the string: add each code point, and its case closure */
378            UChar32 c;
379
380            for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
381                U16_NEXT_UNSAFE(p, i, c);
382                sa->add(sa->set, c);
383                ucase_addCaseClosure(csp, c, sa);
384            }
385            return TRUE;
386        } else if(result<0) {
387            limit=i;
388        } else /* result>0 */ {
389            start=i+1;
390        }
391    }
392
393    return FALSE; /* string not found */
394}
395
396U_NAMESPACE_BEGIN
397
398FullCaseFoldingIterator::FullCaseFoldingIterator()
399        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
400          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
401          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
402          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
403          currentRow(0),
404          rowCpIndex(unfoldStringWidth) {
405    unfold+=unfoldRowWidth;
406}
407
408UChar32
409FullCaseFoldingIterator::next(UnicodeString &full) {
410    // Advance past the last-delivered code point.
411    const UChar *p=unfold+(currentRow*unfoldRowWidth);
412    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
413        ++currentRow;
414        p+=unfoldRowWidth;
415        rowCpIndex=unfoldStringWidth;
416    }
417    if(currentRow>=unfoldRows) { return U_SENTINEL; }
418    // Set "full" to the NUL-terminated string in the first unfold column.
419    int32_t length=unfoldStringWidth;
420    while(length>0 && p[length-1]==0) { --length; }
421    full.setTo(FALSE, p, length);
422    // Return the code point.
423    UChar32 c;
424    U16_NEXT_UNSAFE(p, rowCpIndex, c);
425    return c;
426}
427
428U_NAMESPACE_END
429
430/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
431U_CAPI int32_t U_EXPORT2
432ucase_getType(const UCaseProps *csp, UChar32 c) {
433    uint16_t props=UTRIE2_GET16(&csp->trie, c);
434    return UCASE_GET_TYPE(props);
435}
436
437/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
438U_CAPI int32_t U_EXPORT2
439ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
440    uint16_t props=UTRIE2_GET16(&csp->trie, c);
441    return UCASE_GET_TYPE_AND_IGNORABLE(props);
442}
443
444/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
445static inline int32_t
446getDotType(const UCaseProps *csp, UChar32 c) {
447    uint16_t props=UTRIE2_GET16(&csp->trie, c);
448    if(!PROPS_HAS_EXCEPTION(props)) {
449        return props&UCASE_DOT_MASK;
450    } else {
451        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
452        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
453    }
454}
455
456U_CAPI UBool U_EXPORT2
457ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
458    return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
459}
460
461U_CAPI UBool U_EXPORT2
462ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
463    uint16_t props=UTRIE2_GET16(&csp->trie, c);
464    return (UBool)((props&UCASE_SENSITIVE)!=0);
465}
466
467/* string casing ------------------------------------------------------------ */
468
469/*
470 * These internal functions form the core of string case mappings.
471 * They map single code points to result code points or strings and take
472 * all necessary conditions (context, locale ID, options) into account.
473 *
474 * They do not iterate over the source or write to the destination
475 * so that the same functions are useful for non-standard string storage,
476 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
477 * For the same reason, the "surrounding text" context is passed in as a
478 * UCaseContextIterator which does not make any assumptions about
479 * the underlying storage.
480 *
481 * This section contains helper functions that check for conditions
482 * in the input text surrounding the current code point
483 * according to SpecialCasing.txt.
484 *
485 * Each helper function gets the index
486 * - after the current code point if it looks at following text
487 * - before the current code point if it looks at preceding text
488 *
489 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
490 *
491 * Final_Sigma
492 *   C is preceded by a sequence consisting of
493 *     a cased letter and a case-ignorable sequence,
494 *   and C is not followed by a sequence consisting of
495 *     an ignorable sequence and then a cased letter.
496 *
497 * More_Above
498 *   C is followed by one or more characters of combining class 230 (ABOVE)
499 *   in the combining character sequence.
500 *
501 * After_Soft_Dotted
502 *   The last preceding character with combining class of zero before C
503 *   was Soft_Dotted,
504 *   and there is no intervening combining character class 230 (ABOVE).
505 *
506 * Before_Dot
507 *   C is followed by combining dot above (U+0307).
508 *   Any sequence of characters with a combining class that is neither 0 nor 230
509 *   may intervene between the current character and the combining dot above.
510 *
511 * The erratum from 2002-10-31 adds the condition
512 *
513 * After_I
514 *   The last preceding base character was an uppercase I, and there is no
515 *   intervening combining character class 230 (ABOVE).
516 *
517 *   (See Jitterbug 2344 and the comments on After_I below.)
518 *
519 * Helper definitions in Unicode 3.2 UAX 21:
520 *
521 * D1. A character C is defined to be cased
522 *     if it meets any of the following criteria:
523 *
524 *   - The general category of C is Titlecase Letter (Lt)
525 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
526 *   - Given D = NFD(C), then it is not the case that:
527 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
529 *      for Unicode 3.2. Ignored.)
530 *
531 * D2. A character C is defined to be case-ignorable
532 *     if it meets either of the following criteria:
533 *
534 *   - The general category of C is
535 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
536 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
537 *   - C is one of the following characters
538 *     U+0027 APOSTROPHE
539 *     U+00AD SOFT HYPHEN (SHY)
540 *     U+2019 RIGHT SINGLE QUOTATION MARK
541 *            (the preferred character for apostrophe)
542 *
543 * D3. A case-ignorable sequence is a sequence of
544 *     zero or more case-ignorable characters.
545 */
546
547#define is_a(c) ((c)=='a' || (c)=='A')
548#define is_d(c) ((c)=='d' || (c)=='D')
549#define is_e(c) ((c)=='e' || (c)=='E')
550#define is_i(c) ((c)=='i' || (c)=='I')
551#define is_l(c) ((c)=='l' || (c)=='L')
552#define is_n(c) ((c)=='n' || (c)=='N')
553#define is_r(c) ((c)=='r' || (c)=='R')
554#define is_t(c) ((c)=='t' || (c)=='T')
555#define is_u(c) ((c)=='u' || (c)=='U')
556#define is_z(c) ((c)=='z' || (c)=='Z')
557
558/* separator? */
559#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
560
561/**
562 * Requires non-NULL locale ID but otherwise does the equivalent of
563 * checking for language codes as if uloc_getLanguage() were called:
564 * Accepts both 2- and 3-letter codes and accepts case variants.
565 */
566U_CFUNC int32_t
567ucase_getCaseLocale(const char *locale, int32_t *locCache) {
568    int32_t result;
569    char c;
570
571    if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
572        return result;
573    }
574
575    result=UCASE_LOC_ROOT;
576
577    /*
578     * This function used to use uloc_getLanguage(), but the current code
579     * removes the dependency of this low-level code on uloc implementation code
580     * and is faster because not the whole locale ID has to be
581     * examined and copied/transformed.
582     *
583     * Because this code does not want to depend on uloc, the caller must
584     * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
585     */
586    c=*locale++;
587    if(is_t(c)) {
588        /* tr or tur? */
589        c=*locale++;
590        if(is_u(c)) {
591            c=*locale++;
592        }
593        if(is_r(c)) {
594            c=*locale;
595            if(is_sep(c)) {
596                result=UCASE_LOC_TURKISH;
597            }
598        }
599    } else if(is_a(c)) {
600        /* az or aze? */
601        c=*locale++;
602        if(is_z(c)) {
603            c=*locale++;
604            if(is_e(c)) {
605                c=*locale;
606            }
607            if(is_sep(c)) {
608                result=UCASE_LOC_TURKISH;
609            }
610        }
611    } else if(is_l(c)) {
612        /* lt or lit? */
613        c=*locale++;
614        if(is_i(c)) {
615            c=*locale++;
616        }
617        if(is_t(c)) {
618            c=*locale;
619            if(is_sep(c)) {
620                result=UCASE_LOC_LITHUANIAN;
621            }
622        }
623    } else if(is_n(c)) {
624        /* nl or nld? */
625        c=*locale++;
626        if(is_l(c)) {
627            c=*locale++;
628            if(is_d(c)) {
629                c=*locale;
630            }
631            if(is_sep(c)) {
632                result=UCASE_LOC_DUTCH;
633            }
634        }
635    }
636
637    if(locCache!=NULL) {
638        *locCache=result;
639    }
640    return result;
641}
642
643/*
644 * Is followed by
645 *   {case-ignorable}* cased
646 * ?
647 * (dir determines looking forward/backward)
648 * If a character is case-ignorable, it is skipped regardless of whether
649 * it is also cased or not.
650 */
651static UBool
652isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
653    UChar32 c;
654
655    if(iter==NULL) {
656        return FALSE;
657    }
658
659    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
660        int32_t type=ucase_getTypeOrIgnorable(csp, c);
661        if(type&4) {
662            /* case-ignorable, continue with the loop */
663        } else if(type!=UCASE_NONE) {
664            return TRUE; /* followed by cased letter */
665        } else {
666            return FALSE; /* uncased and not case-ignorable */
667        }
668    }
669
670    return FALSE; /* not followed by cased letter */
671}
672
673/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
674static UBool
675isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
676    UChar32 c;
677    int32_t dotType;
678    int8_t dir;
679
680    if(iter==NULL) {
681        return FALSE;
682    }
683
684    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
685        dotType=getDotType(csp, c);
686        if(dotType==UCASE_SOFT_DOTTED) {
687            return TRUE; /* preceded by TYPE_i */
688        } else if(dotType!=UCASE_OTHER_ACCENT) {
689            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
690        }
691    }
692
693    return FALSE; /* not preceded by TYPE_i */
694}
695
696/*
697 * See Jitterbug 2344:
698 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
699 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
700 * we made those releases compatible with Unicode 3.2 which had not fixed
701 * a related bug in SpecialCasing.txt.
702 *
703 * From the Jitterbug 2344 text:
704 * ... this bug is listed as a Unicode erratum
705 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
706 * <quote>
707 * There are two errors in SpecialCasing.txt.
708 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
709 * 2. An incorrect context definition. Correct as follows:
710 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
711 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
712 * ---
713 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
714 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
715 * where the context After_I is defined as:
716 * The last preceding base character was an uppercase I, and there is no
717 * intervening combining character class 230 (ABOVE).
718 * </quote>
719 *
720 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
721 *
722 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
723 * # This matches the behavior of the canonically equivalent I-dot_above
724 *
725 * See also the description in this place in older versions of uchar.c (revision 1.100).
726 *
727 * Markus W. Scherer 2003-feb-15
728 */
729
730/* Is preceded by base character 'I' with no intervening cc=230 ? */
731static UBool
732isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
733    UChar32 c;
734    int32_t dotType;
735    int8_t dir;
736
737    if(iter==NULL) {
738        return FALSE;
739    }
740
741    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
742        if(c==0x49) {
743            return TRUE; /* preceded by I */
744        }
745        dotType=getDotType(csp, c);
746        if(dotType!=UCASE_OTHER_ACCENT) {
747            return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
748        }
749    }
750
751    return FALSE; /* not preceded by I */
752}
753
754/* Is followed by one or more cc==230 ? */
755static UBool
756isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
757    UChar32 c;
758    int32_t dotType;
759    int8_t dir;
760
761    if(iter==NULL) {
762        return FALSE;
763    }
764
765    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
766        dotType=getDotType(csp, c);
767        if(dotType==UCASE_ABOVE) {
768            return TRUE; /* at least one cc==230 following */
769        } else if(dotType!=UCASE_OTHER_ACCENT) {
770            return FALSE; /* next base character, no more cc==230 following */
771        }
772    }
773
774    return FALSE; /* no more cc==230 following */
775}
776
777/* Is followed by a dot above (without cc==230 in between) ? */
778static UBool
779isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
780    UChar32 c;
781    int32_t dotType;
782    int8_t dir;
783
784    if(iter==NULL) {
785        return FALSE;
786    }
787
788    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
789        if(c==0x307) {
790            return TRUE;
791        }
792        dotType=getDotType(csp, c);
793        if(dotType!=UCASE_OTHER_ACCENT) {
794            return FALSE; /* next base character or cc==230 in between */
795        }
796    }
797
798    return FALSE; /* no dot above following */
799}
800
801U_CAPI int32_t U_EXPORT2
802ucase_toFullLower(const UCaseProps *csp, UChar32 c,
803                  UCaseContextIterator *iter, void *context,
804                  const UChar **pString,
805                  const char *locale, int32_t *locCache)
806{
807    UChar32 result=c;
808    uint16_t props=UTRIE2_GET16(&csp->trie, c);
809    if(!PROPS_HAS_EXCEPTION(props)) {
810        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
811            result=c+UCASE_GET_DELTA(props);
812        }
813    } else {
814        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
815        uint16_t excWord=*pe++;
816        int32_t full;
817
818        pe2=pe;
819
820        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
821            /* use hardcoded conditions and mappings */
822            int32_t loc=ucase_getCaseLocale(locale, locCache);
823
824            /*
825             * Test for conditional mappings first
826             *   (otherwise the unconditional default mappings are always taken),
827             * then test for characters that have unconditional mappings in SpecialCasing.txt,
828             * then get the UnicodeData.txt mappings.
829             */
830            if( loc==UCASE_LOC_LITHUANIAN &&
831                    /* base characters, find accents above */
832                    (((c==0x49 || c==0x4a || c==0x12e) &&
833                        isFollowedByMoreAbove(csp, iter, context)) ||
834                    /* precomposed with accent above, no need to find one */
835                    (c==0xcc || c==0xcd || c==0x128))
836            ) {
837                /*
838                    # Lithuanian
839
840                    # Lithuanian retains the dot in a lowercase i when followed by accents.
841
842                    # Introduce an explicit dot above when lowercasing capital I's and J's
843                    # whenever there are more accents above.
844                    # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
845
846                    0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
847                    004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
848                    012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
849                    00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
850                    00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
851                    0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
852                 */
853                switch(c) {
854                case 0x49:  /* LATIN CAPITAL LETTER I */
855                    *pString=iDot;
856                    return 2;
857                case 0x4a:  /* LATIN CAPITAL LETTER J */
858                    *pString=jDot;
859                    return 2;
860                case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
861                    *pString=iOgonekDot;
862                    return 2;
863                case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
864                    *pString=iDotGrave;
865                    return 3;
866                case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
867                    *pString=iDotAcute;
868                    return 3;
869                case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
870                    *pString=iDotTilde;
871                    return 3;
872                default:
873                    return 0; /* will not occur */
874                }
875            /* # Turkish and Azeri */
876            } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
877                /*
878                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
879                    # The following rules handle those cases.
880
881                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
882                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
883                 */
884                return 0x69;
885            } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
886                /*
887                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
888                    # This matches the behavior of the canonically equivalent I-dot_above
889
890                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
891                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
892                 */
893                return 0; /* remove the dot (continue without output) */
894            } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
895                /*
896                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
897
898                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
899                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
900                 */
901                return 0x131;
902            } else if(c==0x130) {
903                /*
904                    # Preserve canonical equivalence for I with dot. Turkic is handled below.
905
906                    0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
907                 */
908                *pString=iDot;
909                return 2;
910            } else if(  c==0x3a3 &&
911                        !isFollowedByCasedLetter(csp, iter, context, 1) &&
912                        isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
913            ) {
914                /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
915                /*
916                    # Special case for final form of sigma
917
918                    03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
919                 */
920                return 0x3c2; /* greek small final sigma */
921            } else {
922                /* no known conditional special case mapping, use a normal mapping */
923            }
924        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
925            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
926            full&=UCASE_FULL_LOWER;
927            if(full!=0) {
928                /* set the output pointer to the lowercase mapping */
929                *pString=reinterpret_cast<const UChar *>(pe+1);
930
931                /* return the string length */
932                return full;
933            }
934        }
935
936        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
937            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
938        }
939    }
940
941    return (result==c) ? ~result : result;
942}
943
944/* internal */
945static int32_t
946toUpperOrTitle(const UCaseProps *csp, UChar32 c,
947               UCaseContextIterator *iter, void *context,
948               const UChar **pString,
949               const char *locale, int32_t *locCache,
950               UBool upperNotTitle) {
951    UChar32 result=c;
952    uint16_t props=UTRIE2_GET16(&csp->trie, c);
953    if(!PROPS_HAS_EXCEPTION(props)) {
954        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
955            result=c+UCASE_GET_DELTA(props);
956        }
957    } else {
958        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
959        uint16_t excWord=*pe++;
960        int32_t full, idx;
961
962        pe2=pe;
963
964        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
965            /* use hardcoded conditions and mappings */
966            int32_t loc=ucase_getCaseLocale(locale, locCache);
967
968            if(loc==UCASE_LOC_TURKISH && c==0x69) {
969                /*
970                    # Turkish and Azeri
971
972                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
973                    # The following rules handle those cases.
974
975                    # When uppercasing, i turns into a dotted capital I
976
977                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
978                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
979                */
980                return 0x130;
981            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
982                /*
983                    # Lithuanian
984
985                    # Lithuanian retains the dot in a lowercase i when followed by accents.
986
987                    # Remove DOT ABOVE after "i" with upper or titlecase
988
989                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
990                 */
991                return 0; /* remove the dot (continue without output) */
992            } else {
993                /* no known conditional special case mapping, use a normal mapping */
994            }
995        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
996            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
997
998            /* start of full case mapping strings */
999            ++pe;
1000
1001            /* skip the lowercase and case-folding result strings */
1002            pe+=full&UCASE_FULL_LOWER;
1003            full>>=4;
1004            pe+=full&0xf;
1005            full>>=4;
1006
1007            if(upperNotTitle) {
1008                full&=0xf;
1009            } else {
1010                /* skip the uppercase result string */
1011                pe+=full&0xf;
1012                full=(full>>4)&0xf;
1013            }
1014
1015            if(full!=0) {
1016                /* set the output pointer to the result string */
1017                *pString=reinterpret_cast<const UChar *>(pe);
1018
1019                /* return the string length */
1020                return full;
1021            }
1022        }
1023
1024        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1025            idx=UCASE_EXC_TITLE;
1026        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1027            /* here, titlecase is same as uppercase */
1028            idx=UCASE_EXC_UPPER;
1029        } else {
1030            return ~c;
1031        }
1032        GET_SLOT_VALUE(excWord, idx, pe2, result);
1033    }
1034
1035    return (result==c) ? ~result : result;
1036}
1037
1038U_CAPI int32_t U_EXPORT2
1039ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1040                  UCaseContextIterator *iter, void *context,
1041                  const UChar **pString,
1042                  const char *locale, int32_t *locCache) {
1043    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1044}
1045
1046U_CAPI int32_t U_EXPORT2
1047ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1048                  UCaseContextIterator *iter, void *context,
1049                  const UChar **pString,
1050                  const char *locale, int32_t *locCache) {
1051    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1052}
1053
1054/* case folding ------------------------------------------------------------- */
1055
1056/*
1057 * Case folding is similar to lowercasing.
1058 * The result may be a simple mapping, i.e., a single code point, or
1059 * a full mapping, i.e., a string.
1060 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1061 * then only the lowercase mapping is stored.
1062 *
1063 * Some special cases are hardcoded because their conditions cannot be
1064 * parsed and processed from CaseFolding.txt.
1065 *
1066 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1067
1068# C: common case folding, common mappings shared by both simple and full mappings.
1069# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1070# S: simple case folding, mappings to single characters where different from F.
1071# T: special case for uppercase I and dotted uppercase I
1072#    - For non-Turkic languages, this mapping is normally not used.
1073#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1074#
1075# Usage:
1076#  A. To do a simple case folding, use the mappings with status C + S.
1077#  B. To do a full case folding, use the mappings with status C + F.
1078#
1079#    The mappings with status T can be used or omitted depending on the desired case-folding
1080#    behavior. (The default option is to exclude them.)
1081
1082 * Unicode 3.2 has 'T' mappings as follows:
1083
10840049; T; 0131; # LATIN CAPITAL LETTER I
10850130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1086
1087 * while the default mappings for these code points are:
1088
10890049; C; 0069; # LATIN CAPITAL LETTER I
10900130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1091
1092 * U+0130 has no simple case folding (simple-case-folds to itself).
1093 */
1094
1095/* return the simple case folding mapping for c */
1096U_CAPI UChar32 U_EXPORT2
1097ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1098    uint16_t props=UTRIE2_GET16(&csp->trie, c);
1099    if(!PROPS_HAS_EXCEPTION(props)) {
1100        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1101            c+=UCASE_GET_DELTA(props);
1102        }
1103    } else {
1104        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1105        uint16_t excWord=*pe++;
1106        int32_t idx;
1107        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1108            /* special case folding mappings, hardcoded */
1109            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1110                /* default mappings */
1111                if(c==0x49) {
1112                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1113                    return 0x69;
1114                } else if(c==0x130) {
1115                    /* no simple case folding for U+0130 */
1116                    return c;
1117                }
1118            } else {
1119                /* Turkic mappings */
1120                if(c==0x49) {
1121                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1122                    return 0x131;
1123                } else if(c==0x130) {
1124                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1125                    return 0x69;
1126                }
1127            }
1128        }
1129        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1130            idx=UCASE_EXC_FOLD;
1131        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1132            idx=UCASE_EXC_LOWER;
1133        } else {
1134            return c;
1135        }
1136        GET_SLOT_VALUE(excWord, idx, pe, c);
1137    }
1138    return c;
1139}
1140
1141/*
1142 * Issue for canonical caseless match (UAX #21):
1143 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1144 * canonical equivalence, unlike default-option casefolding.
1145 * For example, I-grave and I + grave fold to strings that are not canonically
1146 * equivalent.
1147 * For more details, see the comment in unorm_compare() in unorm.cpp
1148 * and the intermediate prototype changes for Jitterbug 2021.
1149 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1150 *
1151 * This did not get fixed because it appears that it is not possible to fix
1152 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1153 * together in a way that they still fold to common result strings.
1154 */
1155
1156U_CAPI int32_t U_EXPORT2
1157ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1158                    const UChar **pString,
1159                    uint32_t options)
1160{
1161    UChar32 result=c;
1162    uint16_t props=UTRIE2_GET16(&csp->trie, c);
1163    if(!PROPS_HAS_EXCEPTION(props)) {
1164        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1165            result=c+UCASE_GET_DELTA(props);
1166        }
1167    } else {
1168        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1169        uint16_t excWord=*pe++;
1170        int32_t full, idx;
1171
1172        pe2=pe;
1173
1174        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1175            /* use hardcoded conditions and mappings */
1176            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1177                /* default mappings */
1178                if(c==0x49) {
1179                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1180                    return 0x69;
1181                } else if(c==0x130) {
1182                    /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1183                    *pString=iDot;
1184                    return 2;
1185                }
1186            } else {
1187                /* Turkic mappings */
1188                if(c==0x49) {
1189                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1190                    return 0x131;
1191                } else if(c==0x130) {
1192                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1193                    return 0x69;
1194                }
1195            }
1196        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1197            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1198
1199            /* start of full case mapping strings */
1200            ++pe;
1201
1202            /* skip the lowercase result string */
1203            pe+=full&UCASE_FULL_LOWER;
1204            full=(full>>4)&0xf;
1205
1206            if(full!=0) {
1207                /* set the output pointer to the result string */
1208                *pString=reinterpret_cast<const UChar *>(pe);
1209
1210                /* return the string length */
1211                return full;
1212            }
1213        }
1214
1215        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1216            idx=UCASE_EXC_FOLD;
1217        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1218            idx=UCASE_EXC_LOWER;
1219        } else {
1220            return ~c;
1221        }
1222        GET_SLOT_VALUE(excWord, idx, pe2, result);
1223    }
1224
1225    return (result==c) ? ~result : result;
1226}
1227
1228/* case mapping properties API ---------------------------------------------- */
1229
/*
 * Pointer to the built-in case properties singleton.
 * The expansion is parenthesized so that postfix operators applied to the
 * macro result (->, []) bind to the pointer and not to the object name.
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
1231
1232/* public API (see uchar.h) */
1233
1234U_CAPI UBool U_EXPORT2
1235u_isULowercase(UChar32 c) {
1236    return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1237}
1238
1239U_CAPI UBool U_EXPORT2
1240u_isUUppercase(UChar32 c) {
1241    return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1242}
1243
1244/* Transforms the Unicode character to its lower case equivalent.*/
1245U_CAPI UChar32 U_EXPORT2
1246u_tolower(UChar32 c) {
1247    return ucase_tolower(GET_CASE_PROPS(), c);
1248}
1249
1250/* Transforms the Unicode character to its upper case equivalent.*/
1251U_CAPI UChar32 U_EXPORT2
1252u_toupper(UChar32 c) {
1253    return ucase_toupper(GET_CASE_PROPS(), c);
1254}
1255
1256/* Transforms the Unicode character to its title case equivalent.*/
1257U_CAPI UChar32 U_EXPORT2
1258u_totitle(UChar32 c) {
1259    return ucase_totitle(GET_CASE_PROPS(), c);
1260}
1261
1262/* return the simple case folding mapping for c */
1263U_CAPI UChar32 U_EXPORT2
1264u_foldCase(UChar32 c, uint32_t options) {
1265    return ucase_fold(GET_CASE_PROPS(), c, options);
1266}
1267
1268U_CFUNC int32_t U_EXPORT2
1269ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1270    /* case mapping properties */
1271    const UChar *resultString;
1272    int32_t locCache;
1273    const UCaseProps *csp=GET_CASE_PROPS();
1274    if(csp==NULL) {
1275        return FALSE;
1276    }
1277    switch(which) {
1278    case UCHAR_LOWERCASE:
1279        return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1280    case UCHAR_UPPERCASE:
1281        return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1282    case UCHAR_SOFT_DOTTED:
1283        return ucase_isSoftDotted(csp, c);
1284    case UCHAR_CASE_SENSITIVE:
1285        return ucase_isCaseSensitive(csp, c);
1286    case UCHAR_CASED:
1287        return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1288    case UCHAR_CASE_IGNORABLE:
1289        return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1290    /*
1291     * Note: The following Changes_When_Xyz are defined as testing whether
1292     * the NFD form of the input changes when Xyz-case-mapped.
1293     * However, this simpler implementation of these properties,
1294     * ignoring NFD, passes the tests.
1295     * The implementation needs to be changed if the tests start failing.
1296     * When that happens, optimizations should be used to work with the
1297     * per-single-code point ucase_toFullXyz() functions unless
1298     * the NFD form has more than one code point,
1299     * and the property starts set needs to be the union of the
1300     * start sets for normalization and case mappings.
1301     */
1302    case UCHAR_CHANGES_WHEN_LOWERCASED:
1303        locCache=UCASE_LOC_ROOT;
1304        return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1305    case UCHAR_CHANGES_WHEN_UPPERCASED:
1306        locCache=UCASE_LOC_ROOT;
1307        return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1308    case UCHAR_CHANGES_WHEN_TITLECASED:
1309        locCache=UCASE_LOC_ROOT;
1310        return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1311    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1312    case UCHAR_CHANGES_WHEN_CASEMAPPED:
1313        locCache=UCASE_LOC_ROOT;
1314        return (UBool)(
1315            ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1316            ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1317            ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1318    default:
1319        return FALSE;
1320    }
1321}
1322