1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucase.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004aug30
14*   created by: Markus W. Scherer
15*
16*   Low-level Unicode character/string case mapping code.
17*   Much code moved here (and modified) from uchar.c.
18*/
19
20#include "unicode/utypes.h"
21#include "unicode/uset.h"
22#include "unicode/udata.h" /* UDataInfo */
23#include "ucmndata.h" /* DataHeader */
24#include "udatamem.h"
25#include "umutex.h"
26#include "uassert.h"
27#include "cmemory.h"
28#include "utrie2.h"
29#include "ucase.h"
30#include "ucln_cmn.h"
31
/*
 * Bundle of pointers and tables that the case-mapping functions in this file
 * read from.
 * NOTE(review): the machine-generated ucase_props_data.c presumably
 * initializes this struct positionally, so the member order should be
 * treated as fixed -- confirm before reordering.
 */
struct UCaseProps {
    UDataMemory *mem;           /* not read by the code in this part of the file; presumably the backing data memory -- TODO confirm */
    const int32_t *indexes;     /* not read by the code in this part of the file; presumably the data header indexes -- TODO confirm */
    const uint16_t *exceptions; /* base of the exception words; GET_EXCEPTIONS() adds an offset taken from the props word */
    const UChar *unfold;        /* reverse case folding table searched by ucase_addStringCaseClosure(); may be NULL */

    UTrie2 trie;                /* maps a code point to its 16-bit properties word (read via UTRIE2_GET16) */
    uint8_t formatVersion[4];   /* not read by the code in this part of the file; presumably the data format version -- TODO confirm */
};
41
42/* ucase_props_data.c is machine-generated by gencase --csource */
43#include "ucase_props_data.c"
44
45/* UCaseProps singleton ----------------------------------------------------- */
46
47U_CAPI const UCaseProps * U_EXPORT2
48ucase_getSingleton() {
49    return &ucase_props_singleton;
50}
51
52/* set of property starts for UnicodeSet ------------------------------------ */
53
54static UBool U_CALLCONV
55_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
56    /* add the start code point to the USet */
57    const USetAdder *sa=(const USetAdder *)context;
58    sa->add(sa->set, start);
59    return TRUE;
60}
61
62U_CFUNC void U_EXPORT2
63ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
64    if(U_FAILURE(*pErrorCode)) {
65        return;
66    }
67
68    /* add the start code point of each same-value range of the trie */
69    utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
70
71    /* add code points with hardcoded properties, plus the ones following them */
72
73    /* (none right now, see comment below) */
74
75    /*
76     * Omit code points with hardcoded specialcasing properties
77     * because we do not build property UnicodeSets for them right now.
78     */
79}
80
81/* data access primitives --------------------------------------------------- */
82
/* pointer to the first exceptions word that belongs to a props word with the exception bit set */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* non-zero if the props word refers to an exceptions entry */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)

/*
 * Number of set bits in an 8-bit integer value.
 * Indexed with the flag bits below a slot's own bit, this yields the number
 * of slots that are stored in front of that slot.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/* non-zero if the optional slot idx is present according to the flag bits */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* number of present slots with an index lower than idx */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * The expansion is a single statement (do { ... } while(0)) so that the macro
 * can be used safely as the body of an un-braced if/else; it must be followed
 * by a semicolon.
 * excWord and idx are evaluated more than once; pass expressions without
 * side effects.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output; a single uint16_t if
 *              UCASE_EXC_DOUBLE_SLOTS is not set in excWord, otherwise two
 *              uint16_t combined with the first one in the upper 16 bits
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
128
129/* simple case mappings ----------------------------------------------------- */
130
131U_CAPI UChar32 U_EXPORT2
132ucase_tolower(const UCaseProps *csp, UChar32 c) {
133    uint16_t props=UTRIE2_GET16(&csp->trie, c);
134    if(!PROPS_HAS_EXCEPTION(props)) {
135        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
136            c+=UCASE_GET_DELTA(props);
137        }
138    } else {
139        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
140        uint16_t excWord=*pe++;
141        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143        }
144    }
145    return c;
146}
147
148U_CAPI UChar32 U_EXPORT2
149ucase_toupper(const UCaseProps *csp, UChar32 c) {
150    uint16_t props=UTRIE2_GET16(&csp->trie, c);
151    if(!PROPS_HAS_EXCEPTION(props)) {
152        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153            c+=UCASE_GET_DELTA(props);
154        }
155    } else {
156        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
157        uint16_t excWord=*pe++;
158        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
159            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
160        }
161    }
162    return c;
163}
164
165U_CAPI UChar32 U_EXPORT2
166ucase_totitle(const UCaseProps *csp, UChar32 c) {
167    uint16_t props=UTRIE2_GET16(&csp->trie, c);
168    if(!PROPS_HAS_EXCEPTION(props)) {
169        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
170            c+=UCASE_GET_DELTA(props);
171        }
172    } else {
173        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
174        uint16_t excWord=*pe++;
175        int32_t idx;
176        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
177            idx=UCASE_EXC_TITLE;
178        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
179            idx=UCASE_EXC_UPPER;
180        } else {
181            return c;
182        }
183        GET_SLOT_VALUE(excWord, idx, pe, c);
184    }
185    return c;
186}
187
188static const UChar iDot[2] = { 0x69, 0x307 };
189static const UChar jDot[2] = { 0x6a, 0x307 };
190static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
191static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
192static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
193static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
194
195
196U_CFUNC void U_EXPORT2
197ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
198    uint16_t props;
199
200    /*
201     * Hardcode the case closure of i and its relatives and ignore the
202     * data file data for these characters.
203     * The Turkic dotless i and dotted I with their case mapping conditions
204     * and case folding option make the related characters behave specially.
205     * This code matches their closure behavior to their case folding behavior.
206     */
207
208    switch(c) {
209    case 0x49:
210        /* regular i and I are in one equivalence class */
211        sa->add(sa->set, 0x69);
212        return;
213    case 0x69:
214        sa->add(sa->set, 0x49);
215        return;
216    case 0x130:
217        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
218        sa->addString(sa->set, iDot, 2);
219        return;
220    case 0x131:
221        /* dotless i is in a class by itself */
222        return;
223    default:
224        /* otherwise use the data file data */
225        break;
226    }
227
228    props=UTRIE2_GET16(&csp->trie, c);
229    if(!PROPS_HAS_EXCEPTION(props)) {
230        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
231            /* add the one simple case mapping, no matter what type it is */
232            int32_t delta=UCASE_GET_DELTA(props);
233            if(delta!=0) {
234                sa->add(sa->set, c+delta);
235            }
236        }
237    } else {
238        /*
239         * c has exceptions, so there may be multiple simple and/or
240         * full case mappings. Add them all.
241         */
242        const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
243        const UChar *closure;
244        uint16_t excWord=*pe++;
245        int32_t idx, closureLength, fullLength, length;
246
247        pe0=pe;
248
249        /* add all simple case mappings */
250        for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
251            if(HAS_SLOT(excWord, idx)) {
252                pe=pe0;
253                GET_SLOT_VALUE(excWord, idx, pe, c);
254                sa->add(sa->set, c);
255            }
256        }
257
258        /* get the closure string pointer & length */
259        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
260            pe=pe0;
261            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
262            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
263            closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
264        } else {
265            closureLength=0;
266            closure=NULL;
267        }
268
269        /* add the full case folding */
270        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
271            pe=pe0;
272            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
273
274            /* start of full case mapping strings */
275            ++pe;
276
277            fullLength&=0xffff; /* bits 16 and higher are reserved */
278
279            /* skip the lowercase result string */
280            pe+=fullLength&UCASE_FULL_LOWER;
281            fullLength>>=4;
282
283            /* add the full case folding string */
284            length=fullLength&0xf;
285            if(length!=0) {
286                sa->addString(sa->set, (const UChar *)pe, length);
287                pe+=length;
288            }
289
290            /* skip the uppercase and titlecase strings */
291            fullLength>>=4;
292            pe+=fullLength&0xf;
293            fullLength>>=4;
294            pe+=fullLength;
295
296            closure=(const UChar *)pe; /* behind full case mappings */
297        }
298
299        /* add each code point in the closure string */
300        for(idx=0; idx<closureLength;) {
301            U16_NEXT_UNSAFE(closure, idx, c);
302            sa->add(sa->set, c);
303        }
304    }
305}
306
307/*
308 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
309 * must be length>0 and max>0 and length<=max
310 */
311static U_INLINE int32_t
312strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
313    int32_t c1, c2;
314
315    max-=length; /* we require length<=max, so no need to decrement max in the loop */
316    do {
317        c1=*s++;
318        c2=*t++;
319        if(c2==0) {
320            return 1; /* reached the end of t but not of s */
321        }
322        c1-=c2;
323        if(c1!=0) {
324            return c1; /* return difference result */
325        }
326    } while(--length>0);
327    /* ends with length==0 */
328
329    if(max==0 || *t==0) {
330        return 0; /* equal to length of both strings */
331    } else {
332        return -max; /* return lengh difference */
333    }
334}
335
336U_CFUNC UBool U_EXPORT2
337ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
338    const UChar *unfold, *p;
339    int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
340
341    if(csp->unfold==NULL || s==NULL) {
342        return FALSE; /* no reverse case folding data, or no string */
343    }
344    if(length<=1) {
345        /* the string is too short to find any match */
346        /*
347         * more precise would be:
348         * if(!u_strHasMoreChar32Than(s, length, 1))
349         * but this does not make much practical difference because
350         * a single supplementary code point would just not be found
351         */
352        return FALSE;
353    }
354
355    unfold=csp->unfold;
356    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
357    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
358    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
359    unfold+=unfoldRowWidth;
360
361    if(length>unfoldStringWidth) {
362        /* the string is too long to find any match */
363        return FALSE;
364    }
365
366    /* do a binary search for the string */
367    start=0;
368    limit=unfoldRows;
369    while(start<limit) {
370        i=(start+limit)/2;
371        p=unfold+(i*unfoldRowWidth);
372        result=strcmpMax(s, length, p, unfoldStringWidth);
373
374        if(result==0) {
375            /* found the string: add each code point, and its case closure */
376            UChar32 c;
377
378            for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
379                U16_NEXT_UNSAFE(p, i, c);
380                sa->add(sa->set, c);
381                ucase_addCaseClosure(csp, c, sa);
382            }
383            return TRUE;
384        } else if(result<0) {
385            limit=i;
386        } else /* result>0 */ {
387            start=i+1;
388        }
389    }
390
391    return FALSE; /* string not found */
392}
393
394/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
395U_CAPI int32_t U_EXPORT2
396ucase_getType(const UCaseProps *csp, UChar32 c) {
397    uint16_t props=UTRIE2_GET16(&csp->trie, c);
398    return UCASE_GET_TYPE(props);
399}
400
401/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
402U_CAPI int32_t U_EXPORT2
403ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
404    uint16_t props=UTRIE2_GET16(&csp->trie, c);
405    int32_t type=UCASE_GET_TYPE(props);
406    if(props&UCASE_EXCEPTION) {
407        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
408        if(*pe&UCASE_EXC_CASE_IGNORABLE) {
409            type|=4;
410        }
411    } else if(type==UCASE_NONE && (props&UCASE_CASE_IGNORABLE)) {
412        type|=4;
413    }
414    return type;
415}
416
417/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
418static U_INLINE int32_t
419getDotType(const UCaseProps *csp, UChar32 c) {
420    uint16_t props=UTRIE2_GET16(&csp->trie, c);
421    if(!PROPS_HAS_EXCEPTION(props)) {
422        return props&UCASE_DOT_MASK;
423    } else {
424        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
425        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
426    }
427}
428
429U_CAPI UBool U_EXPORT2
430ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
431    return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
432}
433
434U_CAPI UBool U_EXPORT2
435ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
436    uint16_t props=UTRIE2_GET16(&csp->trie, c);
437    return (UBool)((props&UCASE_SENSITIVE)!=0);
438}
439
440/* string casing ------------------------------------------------------------ */
441
442/*
443 * These internal functions form the core of string case mappings.
444 * They map single code points to result code points or strings and take
445 * all necessary conditions (context, locale ID, options) into account.
446 *
447 * They do not iterate over the source or write to the destination
448 * so that the same functions are useful for non-standard string storage,
449 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
450 * For the same reason, the "surrounding text" context is passed in as a
451 * UCaseContextIterator which does not make any assumptions about
452 * the underlying storage.
453 *
454 * This section contains helper functions that check for conditions
455 * in the input text surrounding the current code point
456 * according to SpecialCasing.txt.
457 *
458 * Each helper function gets the index
459 * - after the current code point if it looks at following text
460 * - before the current code point if it looks at preceding text
461 *
462 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
463 *
464 * Final_Sigma
465 *   C is preceded by a sequence consisting of
466 *     a cased letter and a case-ignorable sequence,
467 *   and C is not followed by a sequence consisting of
468 *     an ignorable sequence and then a cased letter.
469 *
470 * More_Above
471 *   C is followed by one or more characters of combining class 230 (ABOVE)
472 *   in the combining character sequence.
473 *
474 * After_Soft_Dotted
475 *   The last preceding character with combining class of zero before C
476 *   was Soft_Dotted,
477 *   and there is no intervening combining character class 230 (ABOVE).
478 *
479 * Before_Dot
480 *   C is followed by combining dot above (U+0307).
481 *   Any sequence of characters with a combining class that is neither 0 nor 230
482 *   may intervene between the current character and the combining dot above.
483 *
484 * The erratum from 2002-10-31 adds the condition
485 *
486 * After_I
487 *   The last preceding base character was an uppercase I, and there is no
488 *   intervening combining character class 230 (ABOVE).
489 *
490 *   (See Jitterbug 2344 and the comments on After_I below.)
491 *
492 * Helper definitions in Unicode 3.2 UAX 21:
493 *
494 * D1. A character C is defined to be cased
495 *     if it meets any of the following criteria:
496 *
497 *   - The general category of C is Titlecase Letter (Lt)
498 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
499 *   - Given D = NFD(C), then it is not the case that:
500 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
502 *      for Unicode 3.2. Ignored.)
503 *
504 * D2. A character C is defined to be case-ignorable
505 *     if it meets either of the following criteria:
506 *
507 *   - The general category of C is
508 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
509 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
510 *   - C is one of the following characters
511 *     U+0027 APOSTROPHE
512 *     U+00AD SOFT HYPHEN (SHY)
513 *     U+2019 RIGHT SINGLE QUOTATION MARK
514 *            (the preferred character for apostrophe)
515 *
516 * D3. A case-ignorable sequence is a sequence of
517 *     zero or more case-ignorable characters.
518 */
519
520#define is_a(c) ((c)=='a' || (c)=='A')
521#define is_d(c) ((c)=='d' || (c)=='D')
522#define is_e(c) ((c)=='e' || (c)=='E')
523#define is_i(c) ((c)=='i' || (c)=='I')
524#define is_l(c) ((c)=='l' || (c)=='L')
525#define is_n(c) ((c)=='n' || (c)=='N')
526#define is_r(c) ((c)=='r' || (c)=='R')
527#define is_t(c) ((c)=='t' || (c)=='T')
528#define is_u(c) ((c)=='u' || (c)=='U')
529#define is_z(c) ((c)=='z' || (c)=='Z')
530
531/* separator? */
532#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
533
534/**
535 * Requires non-NULL locale ID but otherwise does the equivalent of
536 * checking for language codes as if uloc_getLanguage() were called:
537 * Accepts both 2- and 3-letter codes and accepts case variants.
538 */
539U_CFUNC int32_t
540ucase_getCaseLocale(const char *locale, int32_t *locCache) {
541    int32_t result;
542    char c;
543
544    if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
545        return result;
546    }
547
548    result=UCASE_LOC_ROOT;
549
550    /*
551     * This function used to use uloc_getLanguage(), but the current code
552     * removes the dependency of this low-level code on uloc implementation code
553     * and is faster because not the whole locale ID has to be
554     * examined and copied/transformed.
555     *
556     * Because this code does not want to depend on uloc, the caller must
557     * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
558     */
559    c=*locale++;
560    if(is_t(c)) {
561        /* tr or tur? */
562        c=*locale++;
563        if(is_u(c)) {
564            c=*locale++;
565        }
566        if(is_r(c)) {
567            c=*locale;
568            if(is_sep(c)) {
569                result=UCASE_LOC_TURKISH;
570            }
571        }
572    } else if(is_a(c)) {
573        /* az or aze? */
574        c=*locale++;
575        if(is_z(c)) {
576            c=*locale++;
577            if(is_e(c)) {
578                c=*locale;
579            }
580            if(is_sep(c)) {
581                result=UCASE_LOC_TURKISH;
582            }
583        }
584    } else if(is_l(c)) {
585        /* lt or lit? */
586        c=*locale++;
587        if(is_i(c)) {
588            c=*locale++;
589        }
590        if(is_t(c)) {
591            c=*locale;
592            if(is_sep(c)) {
593                result=UCASE_LOC_LITHUANIAN;
594            }
595        }
596    } else if(is_n(c)) {
597        /* nl or nld? */
598        c=*locale++;
599        if(is_l(c)) {
600            c=*locale++;
601            if(is_d(c)) {
602                c=*locale;
603            }
604            if(is_sep(c)) {
605                result=UCASE_LOC_DUTCH;
606            }
607        }
608    }
609
610    if(locCache!=NULL) {
611        *locCache=result;
612    }
613    return result;
614}
615
616/*
617 * Is followed by
618 *   {case-ignorable}* cased
619 * ?
620 * (dir determines looking forward/backward)
621 * If a character is case-ignorable, it is skipped regardless of whether
622 * it is also cased or not.
623 */
624static UBool
625isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
626    UChar32 c;
627
628    if(iter==NULL) {
629        return FALSE;
630    }
631
632    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
633        int32_t type=ucase_getTypeOrIgnorable(csp, c);
634        if(type&4) {
635            /* case-ignorable, continue with the loop */
636        } else if(type!=UCASE_NONE) {
637            return TRUE; /* followed by cased letter */
638        } else {
639            return FALSE; /* uncased and not case-ignorable */
640        }
641    }
642
643    return FALSE; /* not followed by cased letter */
644}
645
646/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
647static UBool
648isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
649    UChar32 c;
650    int32_t dotType;
651    int8_t dir;
652
653    if(iter==NULL) {
654        return FALSE;
655    }
656
657    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
658        dotType=getDotType(csp, c);
659        if(dotType==UCASE_SOFT_DOTTED) {
660            return TRUE; /* preceded by TYPE_i */
661        } else if(dotType!=UCASE_OTHER_ACCENT) {
662            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
663        }
664    }
665
666    return FALSE; /* not preceded by TYPE_i */
667}
668
669/*
670 * See Jitterbug 2344:
671 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
672 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
673 * we made those releases compatible with Unicode 3.2 which had not fixed
674 * a related bug in SpecialCasing.txt.
675 *
676 * From the Jitterbug 2344 text:
677 * ... this bug is listed as a Unicode erratum
678 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
679 * <quote>
680 * There are two errors in SpecialCasing.txt.
681 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
682 * 2. An incorrect context definition. Correct as follows:
683 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
684 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
685 * ---
686 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
687 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
688 * where the context After_I is defined as:
689 * The last preceding base character was an uppercase I, and there is no
690 * intervening combining character class 230 (ABOVE).
691 * </quote>
692 *
693 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
694 *
695 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
696 * # This matches the behavior of the canonically equivalent I-dot_above
697 *
698 * See also the description in this place in older versions of uchar.c (revision 1.100).
699 *
700 * Markus W. Scherer 2003-feb-15
701 */
702
703/* Is preceded by base character 'I' with no intervening cc=230 ? */
704static UBool
705isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
706    UChar32 c;
707    int32_t dotType;
708    int8_t dir;
709
710    if(iter==NULL) {
711        return FALSE;
712    }
713
714    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
715        if(c==0x49) {
716            return TRUE; /* preceded by I */
717        }
718        dotType=getDotType(csp, c);
719        if(dotType!=UCASE_OTHER_ACCENT) {
720            return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
721        }
722    }
723
724    return FALSE; /* not preceded by I */
725}
726
727/* Is followed by one or more cc==230 ? */
728static UBool
729isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
730    UChar32 c;
731    int32_t dotType;
732    int8_t dir;
733
734    if(iter==NULL) {
735        return FALSE;
736    }
737
738    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
739        dotType=getDotType(csp, c);
740        if(dotType==UCASE_ABOVE) {
741            return TRUE; /* at least one cc==230 following */
742        } else if(dotType!=UCASE_OTHER_ACCENT) {
743            return FALSE; /* next base character, no more cc==230 following */
744        }
745    }
746
747    return FALSE; /* no more cc==230 following */
748}
749
750/* Is followed by a dot above (without cc==230 in between) ? */
751static UBool
752isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
753    UChar32 c;
754    int32_t dotType;
755    int8_t dir;
756
757    if(iter==NULL) {
758        return FALSE;
759    }
760
761    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
762        if(c==0x307) {
763            return TRUE;
764        }
765        dotType=getDotType(csp, c);
766        if(dotType!=UCASE_OTHER_ACCENT) {
767            return FALSE; /* next base character or cc==230 in between */
768        }
769    }
770
771    return FALSE; /* no dot above following */
772}
773
774U_CAPI int32_t U_EXPORT2
775ucase_toFullLower(const UCaseProps *csp, UChar32 c,
776                  UCaseContextIterator *iter, void *context,
777                  const UChar **pString,
778                  const char *locale, int32_t *locCache)
779{
780    UChar32 result=c;
781    uint16_t props=UTRIE2_GET16(&csp->trie, c);
782    if(!PROPS_HAS_EXCEPTION(props)) {
783        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
784            result=c+UCASE_GET_DELTA(props);
785        }
786    } else {
787        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
788        uint16_t excWord=*pe++;
789        int32_t full;
790
791        pe2=pe;
792
793        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
794            /* use hardcoded conditions and mappings */
795            int32_t loc=ucase_getCaseLocale(locale, locCache);
796
797            /*
798             * Test for conditional mappings first
799             *   (otherwise the unconditional default mappings are always taken),
800             * then test for characters that have unconditional mappings in SpecialCasing.txt,
801             * then get the UnicodeData.txt mappings.
802             */
803            if( loc==UCASE_LOC_LITHUANIAN &&
804                    /* base characters, find accents above */
805                    (((c==0x49 || c==0x4a || c==0x12e) &&
806                        isFollowedByMoreAbove(csp, iter, context)) ||
807                    /* precomposed with accent above, no need to find one */
808                    (c==0xcc || c==0xcd || c==0x128))
809            ) {
810                /*
811                    # Lithuanian
812
813                    # Lithuanian retains the dot in a lowercase i when followed by accents.
814
815                    # Introduce an explicit dot above when lowercasing capital I's and J's
816                    # whenever there are more accents above.
817                    # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
818
819                    0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
820                    004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
821                    012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
822                    00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
823                    00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
824                    0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
825                 */
826                switch(c) {
827                case 0x49:  /* LATIN CAPITAL LETTER I */
828                    *pString=iDot;
829                    return 2;
830                case 0x4a:  /* LATIN CAPITAL LETTER J */
831                    *pString=jDot;
832                    return 2;
833                case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
834                    *pString=iOgonekDot;
835                    return 2;
836                case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
837                    *pString=iDotGrave;
838                    return 3;
839                case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
840                    *pString=iDotAcute;
841                    return 3;
842                case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
843                    *pString=iDotTilde;
844                    return 3;
845                default:
846                    return 0; /* will not occur */
847                }
848            /* # Turkish and Azeri */
849            } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
850                /*
851                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
852                    # The following rules handle those cases.
853
854                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
855                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
856                 */
857                return 0x69;
858            } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
859                /*
860                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
861                    # This matches the behavior of the canonically equivalent I-dot_above
862
863                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
864                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
865                 */
866                return 0; /* remove the dot (continue without output) */
867            } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
868                /*
869                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
870
871                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
872                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
873                 */
874                return 0x131;
875            } else if(c==0x130) {
876                /*
877                    # Preserve canonical equivalence for I with dot. Turkic is handled below.
878
879                    0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
880                 */
881                *pString=iDot;
882                return 2;
883            } else if(  c==0x3a3 &&
884                        !isFollowedByCasedLetter(csp, iter, context, 1) &&
885                        isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
886            ) {
887                /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
888                /*
889                    # Special case for final form of sigma
890
891                    03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
892                 */
893                return 0x3c2; /* greek small final sigma */
894            } else {
895                /* no known conditional special case mapping, use a normal mapping */
896            }
897        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
898            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
899            full&=UCASE_FULL_LOWER;
900            if(full!=0) {
901                /* set the output pointer to the lowercase mapping */
902                *pString=pe+1;
903
904                /* return the string length */
905                return full;
906            }
907        }
908
909        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
910            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
911        }
912    }
913
914    return (result==c) ? ~result : result;
915}
916
917/* internal */
918static int32_t
919toUpperOrTitle(const UCaseProps *csp, UChar32 c,
920               UCaseContextIterator *iter, void *context,
921               const UChar **pString,
922               const char *locale, int32_t *locCache,
923               UBool upperNotTitle) {
924    UChar32 result=c;
925    uint16_t props=UTRIE2_GET16(&csp->trie, c);
926    if(!PROPS_HAS_EXCEPTION(props)) {
927        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
928            result=c+UCASE_GET_DELTA(props);
929        }
930    } else {
931        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
932        uint16_t excWord=*pe++;
933        int32_t full, idx;
934
935        pe2=pe;
936
937        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
938            /* use hardcoded conditions and mappings */
939            int32_t loc=ucase_getCaseLocale(locale, locCache);
940
941            if(loc==UCASE_LOC_TURKISH && c==0x69) {
942                /*
943                    # Turkish and Azeri
944
945                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
946                    # The following rules handle those cases.
947
948                    # When uppercasing, i turns into a dotted capital I
949
950                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
951                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
952                */
953                return 0x130;
954            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
955                /*
956                    # Lithuanian
957
958                    # Lithuanian retains the dot in a lowercase i when followed by accents.
959
960                    # Remove DOT ABOVE after "i" with upper or titlecase
961
962                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
963                 */
964                return 0; /* remove the dot (continue without output) */
965            } else {
966                /* no known conditional special case mapping, use a normal mapping */
967            }
968        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
969            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
970
971            /* start of full case mapping strings */
972            ++pe;
973
974            /* skip the lowercase and case-folding result strings */
975            pe+=full&UCASE_FULL_LOWER;
976            full>>=4;
977            pe+=full&0xf;
978            full>>=4;
979
980            if(upperNotTitle) {
981                full&=0xf;
982            } else {
983                /* skip the uppercase result string */
984                pe+=full&0xf;
985                full=(full>>4)&0xf;
986            }
987
988            if(full!=0) {
989                /* set the output pointer to the result string */
990                *pString=pe;
991
992                /* return the string length */
993                return full;
994            }
995        }
996
997        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
998            idx=UCASE_EXC_TITLE;
999        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1000            /* here, titlecase is same as uppercase */
1001            idx=UCASE_EXC_UPPER;
1002        } else {
1003            return ~c;
1004        }
1005        GET_SLOT_VALUE(excWord, idx, pe2, result);
1006    }
1007
1008    return (result==c) ? ~result : result;
1009}
1010
1011U_CAPI int32_t U_EXPORT2
1012ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1013                  UCaseContextIterator *iter, void *context,
1014                  const UChar **pString,
1015                  const char *locale, int32_t *locCache) {
1016    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1017}
1018
1019U_CAPI int32_t U_EXPORT2
1020ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1021                  UCaseContextIterator *iter, void *context,
1022                  const UChar **pString,
1023                  const char *locale, int32_t *locCache) {
1024    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1025}
1026
1027/* case folding ------------------------------------------------------------- */
1028
1029/*
1030 * Case folding is similar to lowercasing.
1031 * The result may be a simple mapping, i.e., a single code point, or
1032 * a full mapping, i.e., a string.
1033 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1034 * then only the lowercase mapping is stored.
1035 *
1036 * Some special cases are hardcoded because their conditions cannot be
1037 * parsed and processed from CaseFolding.txt.
1038 *
1039 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1040
1041# C: common case folding, common mappings shared by both simple and full mappings.
1042# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1043# S: simple case folding, mappings to single characters where different from F.
1044# T: special case for uppercase I and dotted uppercase I
1045#    - For non-Turkic languages, this mapping is normally not used.
1046#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1047#
1048# Usage:
1049#  A. To do a simple case folding, use the mappings with status C + S.
1050#  B. To do a full case folding, use the mappings with status C + F.
1051#
1052#    The mappings with status T can be used or omitted depending on the desired case-folding
1053#    behavior. (The default option is to exclude them.)
1054
1055 * Unicode 3.2 has 'T' mappings as follows:
1056
10570049; T; 0131; # LATIN CAPITAL LETTER I
10580130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1059
1060 * while the default mappings for these code points are:
1061
10620049; C; 0069; # LATIN CAPITAL LETTER I
10630130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1064
1065 * U+0130 has no simple case folding (simple-case-folds to itself).
1066 */
1067
1068/* return the simple case folding mapping for c */
1069U_CAPI UChar32 U_EXPORT2
1070ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1071    uint16_t props=UTRIE2_GET16(&csp->trie, c);
1072    if(!PROPS_HAS_EXCEPTION(props)) {
1073        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1074            c+=UCASE_GET_DELTA(props);
1075        }
1076    } else {
1077        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1078        uint16_t excWord=*pe++;
1079        int32_t idx;
1080        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1081            /* special case folding mappings, hardcoded */
1082            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1083                /* default mappings */
1084                if(c==0x49) {
1085                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1086                    return 0x69;
1087                } else if(c==0x130) {
1088                    /* no simple case folding for U+0130 */
1089                    return c;
1090                }
1091            } else {
1092                /* Turkic mappings */
1093                if(c==0x49) {
1094                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1095                    return 0x131;
1096                } else if(c==0x130) {
1097                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1098                    return 0x69;
1099                }
1100            }
1101        }
1102        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1103            idx=UCASE_EXC_FOLD;
1104        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1105            idx=UCASE_EXC_LOWER;
1106        } else {
1107            return c;
1108        }
1109        GET_SLOT_VALUE(excWord, idx, pe, c);
1110    }
1111    return c;
1112}
1113
1114/*
1115 * Issue for canonical caseless match (UAX #21):
1116 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1117 * canonical equivalence, unlike default-option casefolding.
1118 * For example, I-grave and I + grave fold to strings that are not canonically
1119 * equivalent.
1120 * For more details, see the comment in unorm_compare() in unorm.cpp
1121 * and the intermediate prototype changes for Jitterbug 2021.
1122 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1123 *
1124 * This did not get fixed because it appears that it is not possible to fix
1125 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1126 * together in a way that they still fold to common result strings.
1127 */
1128
1129U_CAPI int32_t U_EXPORT2
1130ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1131                    const UChar **pString,
1132                    uint32_t options)
1133{
1134    UChar32 result=c;
1135    uint16_t props=UTRIE2_GET16(&csp->trie, c);
1136    if(!PROPS_HAS_EXCEPTION(props)) {
1137        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1138            result=c+UCASE_GET_DELTA(props);
1139        }
1140    } else {
1141        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1142        uint16_t excWord=*pe++;
1143        int32_t full, idx;
1144
1145        pe2=pe;
1146
1147        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1148            /* use hardcoded conditions and mappings */
1149            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1150                /* default mappings */
1151                if(c==0x49) {
1152                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1153                    return 0x69;
1154                } else if(c==0x130) {
1155                    /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1156                    *pString=iDot;
1157                    return 2;
1158                }
1159            } else {
1160                /* Turkic mappings */
1161                if(c==0x49) {
1162                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1163                    return 0x131;
1164                } else if(c==0x130) {
1165                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1166                    return 0x69;
1167                }
1168            }
1169        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1170            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1171
1172            /* start of full case mapping strings */
1173            ++pe;
1174
1175            /* skip the lowercase result string */
1176            pe+=full&UCASE_FULL_LOWER;
1177            full=(full>>4)&0xf;
1178
1179            if(full!=0) {
1180                /* set the output pointer to the result string */
1181                *pString=pe;
1182
1183                /* return the string length */
1184                return full;
1185            }
1186        }
1187
1188        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1189            idx=UCASE_EXC_FOLD;
1190        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1191            idx=UCASE_EXC_LOWER;
1192        } else {
1193            return ~c;
1194        }
1195        GET_SLOT_VALUE(excWord, idx, pe2, result);
1196    }
1197
1198    return (result==c) ? ~result : result;
1199}
1200
1201/* case mapping properties API ---------------------------------------------- */
1202
/*
 * Yields a pointer to the built-in case properties singleton.
 * The replacement list is parenthesized so that the expansion always
 * behaves as a single operand, whatever expression it is used in.
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
1204
1205/* public API (see uchar.h) */
1206
1207U_CAPI UBool U_EXPORT2
1208u_isULowercase(UChar32 c) {
1209    return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1210}
1211
1212U_CAPI UBool U_EXPORT2
1213u_isUUppercase(UChar32 c) {
1214    return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1215}
1216
1217/* Transforms the Unicode character to its lower case equivalent.*/
1218U_CAPI UChar32 U_EXPORT2
1219u_tolower(UChar32 c) {
1220    return ucase_tolower(GET_CASE_PROPS(), c);
1221}
1222
1223/* Transforms the Unicode character to its upper case equivalent.*/
1224U_CAPI UChar32 U_EXPORT2
1225u_toupper(UChar32 c) {
1226    return ucase_toupper(GET_CASE_PROPS(), c);
1227}
1228
1229/* Transforms the Unicode character to its title case equivalent.*/
1230U_CAPI UChar32 U_EXPORT2
1231u_totitle(UChar32 c) {
1232    return ucase_totitle(GET_CASE_PROPS(), c);
1233}
1234
1235/* return the simple case folding mapping for c */
1236U_CAPI UChar32 U_EXPORT2
1237u_foldCase(UChar32 c, uint32_t options) {
1238    return ucase_fold(GET_CASE_PROPS(), c, options);
1239}
1240
1241U_CFUNC int32_t U_EXPORT2
1242ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1243    /* case mapping properties */
1244    const UChar *resultString;
1245    int32_t locCache;
1246    const UCaseProps *csp=GET_CASE_PROPS();
1247    if(csp==NULL) {
1248        return FALSE;
1249    }
1250    switch(which) {
1251    case UCHAR_LOWERCASE:
1252        return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1253    case UCHAR_UPPERCASE:
1254        return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1255    case UCHAR_SOFT_DOTTED:
1256        return ucase_isSoftDotted(csp, c);
1257    case UCHAR_CASE_SENSITIVE:
1258        return ucase_isCaseSensitive(csp, c);
1259    case UCHAR_CASED:
1260        return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1261    case UCHAR_CASE_IGNORABLE:
1262        return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1263    /*
1264     * Note: The following Changes_When_Xyz are defined as testing whether
1265     * the NFD form of the input changes when Xyz-case-mapped.
1266     * However, this simpler implementation of these properties,
1267     * ignoring NFD, passes the tests.
1268     * The implementation needs to be changed if the tests start failing.
1269     * When that happens, optimizations should be used to work with the
1270     * per-single-code point ucase_toFullXyz() functions unless
1271     * the NFD form has more than one code point,
1272     * and the property starts set needs to be the union of the
1273     * start sets for normalization and case mappings.
1274     */
1275    case UCHAR_CHANGES_WHEN_LOWERCASED:
1276        locCache=UCASE_LOC_ROOT;
1277        return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1278    case UCHAR_CHANGES_WHEN_UPPERCASED:
1279        locCache=UCASE_LOC_ROOT;
1280        return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1281    case UCHAR_CHANGES_WHEN_TITLECASED:
1282        locCache=UCASE_LOC_ROOT;
1283        return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1284    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1285    case UCHAR_CHANGES_WHEN_CASEMAPPED:
1286        locCache=UCASE_LOC_ROOT;
1287        return (UBool)(
1288            ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1289            ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1290            ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1291    default:
1292        return FALSE;
1293    }
1294}
1295