1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2014, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucase.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004aug30
14*   created by: Markus W. Scherer
15*
16*   Low-level Unicode character/string case mapping code.
17*   Much code moved here (and modified) from uchar.c.
18*/
19
20#include "unicode/utypes.h"
21#include "unicode/unistr.h"
22#include "unicode/uset.h"
23#include "unicode/udata.h" /* UDataInfo */
24#include "unicode/utf16.h"
25#include "ucmndata.h" /* DataHeader */
26#include "udatamem.h"
27#include "umutex.h"
28#include "uassert.h"
29#include "cmemory.h"
30#include "utrie2.h"
31#include "ucase.h"
32
33struct UCaseProps {
34    UDataMemory *mem;
35    const int32_t *indexes;
36    const uint16_t *exceptions;
37    const uint16_t *unfold;
38
39    UTrie2 trie;
40    uint8_t formatVersion[4];
41};
42
43/* ucase_props_data.h is machine-generated by gencase --csource */
44#define INCLUDED_FROM_UCASE_CPP
45#include "ucase_props_data.h"
46
47/* UCaseProps singleton ----------------------------------------------------- */
48
49U_CAPI const UCaseProps * U_EXPORT2
50ucase_getSingleton() {
51    return &ucase_props_singleton;
52}
53
54/* set of property starts for UnicodeSet ------------------------------------ */
55
56static UBool U_CALLCONV
57_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
58    /* add the start code point to the USet */
59    const USetAdder *sa=(const USetAdder *)context;
60    sa->add(sa->set, start);
61    return TRUE;
62}
63
64U_CFUNC void U_EXPORT2
65ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
66    if(U_FAILURE(*pErrorCode)) {
67        return;
68    }
69
70    /* add the start code point of each same-value range of the trie */
71    utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
72
73    /* add code points with hardcoded properties, plus the ones following them */
74
75    /* (none right now, see comment below) */
76
77    /*
78     * Omit code points with hardcoded specialcasing properties
79     * because we do not build property UnicodeSets for them right now.
80     */
81}
82
83/* data access primitives --------------------------------------------------- */
84
/* pointer to the first exception word for a properties word that has exceptions */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])

/* non-zero if the properties word refers to exception data */
#define PROPS_HAS_EXCEPTION(props) (UCASE_EXCEPTION&(props))

/*
 * flagsOffset[b] is the number of set bits in the 8-bit value b.
 * The table is generated: each helper level covers two more low bits,
 * and the argument is the number of bits already set in the higher positions.
 */
#define UCASE_BITS_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS_4(n) UCASE_BITS_2(n), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+2)
#define UCASE_BITS_6(n) UCASE_BITS_4(n), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_BITS_6(0), UCASE_BITS_6(1), UCASE_BITS_6(1), UCASE_BITS_6(2)
};
#undef UCASE_BITS_6
#undef UCASE_BITS_4
#undef UCASE_BITS_2

/* non-zero if the flags have the bit for slot idx set */
#define HAS_SLOT(flags, idx) ((1<<(idx))&(flags))
/* number of slots that are present below slot idx (set flag bits below bit idx) */
#define SLOT_OFFSET(flags, idx) (flagsOffset[((1<<(idx))-1)&(flags)])
111
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Expands to a single statement (do { ... } while(0)), so it is safe in an
 * unbraced if/else; the invocation must be followed by a semicolon.
 * excWord and idx are evaluated more than once: pass expressions
 * without side effects.
 *
 * Slots are one uint16_t wide unless excWord has UCASE_EXC_DOUBLE_SLOTS set;
 * then each slot is two uint16_t, with the high half first.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
130
131/* simple case mappings ----------------------------------------------------- */
132
133U_CAPI UChar32 U_EXPORT2
134ucase_tolower(const UCaseProps *csp, UChar32 c) {
135    uint16_t props=UTRIE2_GET16(&csp->trie, c);
136    if(!PROPS_HAS_EXCEPTION(props)) {
137        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
138            c+=UCASE_GET_DELTA(props);
139        }
140    } else {
141        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
142        uint16_t excWord=*pe++;
143        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
144            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
145        }
146    }
147    return c;
148}
149
150U_CAPI UChar32 U_EXPORT2
151ucase_toupper(const UCaseProps *csp, UChar32 c) {
152    uint16_t props=UTRIE2_GET16(&csp->trie, c);
153    if(!PROPS_HAS_EXCEPTION(props)) {
154        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
155            c+=UCASE_GET_DELTA(props);
156        }
157    } else {
158        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
159        uint16_t excWord=*pe++;
160        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
161            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
162        }
163    }
164    return c;
165}
166
167U_CAPI UChar32 U_EXPORT2
168ucase_totitle(const UCaseProps *csp, UChar32 c) {
169    uint16_t props=UTRIE2_GET16(&csp->trie, c);
170    if(!PROPS_HAS_EXCEPTION(props)) {
171        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
172            c+=UCASE_GET_DELTA(props);
173        }
174    } else {
175        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
176        uint16_t excWord=*pe++;
177        int32_t idx;
178        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
179            idx=UCASE_EXC_TITLE;
180        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
181            idx=UCASE_EXC_UPPER;
182        } else {
183            return c;
184        }
185        GET_SLOT_VALUE(excWord, idx, pe, c);
186    }
187    return c;
188}
189
190static const UChar iDot[2] = { 0x69, 0x307 };
191static const UChar jDot[2] = { 0x6a, 0x307 };
192static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
193static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
194static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
195static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
196
197
198U_CFUNC void U_EXPORT2
199ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
200    uint16_t props;
201
202    /*
203     * Hardcode the case closure of i and its relatives and ignore the
204     * data file data for these characters.
205     * The Turkic dotless i and dotted I with their case mapping conditions
206     * and case folding option make the related characters behave specially.
207     * This code matches their closure behavior to their case folding behavior.
208     */
209
210    switch(c) {
211    case 0x49:
212        /* regular i and I are in one equivalence class */
213        sa->add(sa->set, 0x69);
214        return;
215    case 0x69:
216        sa->add(sa->set, 0x49);
217        return;
218    case 0x130:
219        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
220        sa->addString(sa->set, iDot, 2);
221        return;
222    case 0x131:
223        /* dotless i is in a class by itself */
224        return;
225    default:
226        /* otherwise use the data file data */
227        break;
228    }
229
230    props=UTRIE2_GET16(&csp->trie, c);
231    if(!PROPS_HAS_EXCEPTION(props)) {
232        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
233            /* add the one simple case mapping, no matter what type it is */
234            int32_t delta=UCASE_GET_DELTA(props);
235            if(delta!=0) {
236                sa->add(sa->set, c+delta);
237            }
238        }
239    } else {
240        /*
241         * c has exceptions, so there may be multiple simple and/or
242         * full case mappings. Add them all.
243         */
244        const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
245        const UChar *closure;
246        uint16_t excWord=*pe++;
247        int32_t idx, closureLength, fullLength, length;
248
249        pe0=pe;
250
251        /* add all simple case mappings */
252        for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
253            if(HAS_SLOT(excWord, idx)) {
254                pe=pe0;
255                GET_SLOT_VALUE(excWord, idx, pe, c);
256                sa->add(sa->set, c);
257            }
258        }
259
260        /* get the closure string pointer & length */
261        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
262            pe=pe0;
263            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
264            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
265            closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
266        } else {
267            closureLength=0;
268            closure=NULL;
269        }
270
271        /* add the full case folding */
272        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
273            pe=pe0;
274            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
275
276            /* start of full case mapping strings */
277            ++pe;
278
279            fullLength&=0xffff; /* bits 16 and higher are reserved */
280
281            /* skip the lowercase result string */
282            pe+=fullLength&UCASE_FULL_LOWER;
283            fullLength>>=4;
284
285            /* add the full case folding string */
286            length=fullLength&0xf;
287            if(length!=0) {
288                sa->addString(sa->set, (const UChar *)pe, length);
289                pe+=length;
290            }
291
292            /* skip the uppercase and titlecase strings */
293            fullLength>>=4;
294            pe+=fullLength&0xf;
295            fullLength>>=4;
296            pe+=fullLength;
297
298            closure=(const UChar *)pe; /* behind full case mappings */
299        }
300
301        /* add each code point in the closure string */
302        for(idx=0; idx<closureLength;) {
303            U16_NEXT_UNSAFE(closure, idx, c);
304            sa->add(sa->set, c);
305        }
306    }
307}
308
309/*
310 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
311 * must be length>0 and max>0 and length<=max
312 */
313static inline int32_t
314strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
315    int32_t c1, c2;
316
317    max-=length; /* we require length<=max, so no need to decrement max in the loop */
318    do {
319        c1=*s++;
320        c2=*t++;
321        if(c2==0) {
322            return 1; /* reached the end of t but not of s */
323        }
324        c1-=c2;
325        if(c1!=0) {
326            return c1; /* return difference result */
327        }
328    } while(--length>0);
329    /* ends with length==0 */
330
331    if(max==0 || *t==0) {
332        return 0; /* equal to length of both strings */
333    } else {
334        return -max; /* return lengh difference */
335    }
336}
337
338U_CFUNC UBool U_EXPORT2
339ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
340    int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
341
342    if(csp->unfold==NULL || s==NULL) {
343        return FALSE; /* no reverse case folding data, or no string */
344    }
345    if(length<=1) {
346        /* the string is too short to find any match */
347        /*
348         * more precise would be:
349         * if(!u_strHasMoreChar32Than(s, length, 1))
350         * but this does not make much practical difference because
351         * a single supplementary code point would just not be found
352         */
353        return FALSE;
354    }
355
356    const uint16_t *unfold=csp->unfold;
357    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
358    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
359    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
360    unfold+=unfoldRowWidth;
361
362    if(length>unfoldStringWidth) {
363        /* the string is too long to find any match */
364        return FALSE;
365    }
366
367    /* do a binary search for the string */
368    start=0;
369    limit=unfoldRows;
370    while(start<limit) {
371        i=(start+limit)/2;
372        const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
373        result=strcmpMax(s, length, p, unfoldStringWidth);
374
375        if(result==0) {
376            /* found the string: add each code point, and its case closure */
377            UChar32 c;
378
379            for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
380                U16_NEXT_UNSAFE(p, i, c);
381                sa->add(sa->set, c);
382                ucase_addCaseClosure(csp, c, sa);
383            }
384            return TRUE;
385        } else if(result<0) {
386            limit=i;
387        } else /* result>0 */ {
388            start=i+1;
389        }
390    }
391
392    return FALSE; /* string not found */
393}
394
395U_NAMESPACE_BEGIN
396
397FullCaseFoldingIterator::FullCaseFoldingIterator()
398        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
399          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
400          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
401          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
402          currentRow(0),
403          rowCpIndex(unfoldStringWidth) {
404    unfold+=unfoldRowWidth;
405}
406
407UChar32
408FullCaseFoldingIterator::next(UnicodeString &full) {
409    // Advance past the last-delivered code point.
410    const UChar *p=unfold+(currentRow*unfoldRowWidth);
411    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
412        ++currentRow;
413        p+=unfoldRowWidth;
414        rowCpIndex=unfoldStringWidth;
415    }
416    if(currentRow>=unfoldRows) { return U_SENTINEL; }
417    // Set "full" to the NUL-terminated string in the first unfold column.
418    int32_t length=unfoldStringWidth;
419    while(length>0 && p[length-1]==0) { --length; }
420    full.setTo(FALSE, p, length);
421    // Return the code point.
422    UChar32 c;
423    U16_NEXT_UNSAFE(p, rowCpIndex, c);
424    return c;
425}
426
427U_NAMESPACE_END
428
429/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
430U_CAPI int32_t U_EXPORT2
431ucase_getType(const UCaseProps *csp, UChar32 c) {
432    uint16_t props=UTRIE2_GET16(&csp->trie, c);
433    return UCASE_GET_TYPE(props);
434}
435
436/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
437U_CAPI int32_t U_EXPORT2
438ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
439    uint16_t props=UTRIE2_GET16(&csp->trie, c);
440    return UCASE_GET_TYPE_AND_IGNORABLE(props);
441}
442
443/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
444static inline int32_t
445getDotType(const UCaseProps *csp, UChar32 c) {
446    uint16_t props=UTRIE2_GET16(&csp->trie, c);
447    if(!PROPS_HAS_EXCEPTION(props)) {
448        return props&UCASE_DOT_MASK;
449    } else {
450        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
451        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
452    }
453}
454
455U_CAPI UBool U_EXPORT2
456ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
457    return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
458}
459
460U_CAPI UBool U_EXPORT2
461ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
462    uint16_t props=UTRIE2_GET16(&csp->trie, c);
463    return (UBool)((props&UCASE_SENSITIVE)!=0);
464}
465
466/* string casing ------------------------------------------------------------ */
467
468/*
469 * These internal functions form the core of string case mappings.
470 * They map single code points to result code points or strings and take
471 * all necessary conditions (context, locale ID, options) into account.
472 *
473 * They do not iterate over the source or write to the destination
474 * so that the same functions are useful for non-standard string storage,
475 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
476 * For the same reason, the "surrounding text" context is passed in as a
477 * UCaseContextIterator which does not make any assumptions about
478 * the underlying storage.
479 *
480 * This section contains helper functions that check for conditions
481 * in the input text surrounding the current code point
482 * according to SpecialCasing.txt.
483 *
484 * Each helper function gets the index
485 * - after the current code point if it looks at following text
486 * - before the current code point if it looks at preceding text
487 *
488 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
489 *
490 * Final_Sigma
491 *   C is preceded by a sequence consisting of
492 *     a cased letter and a case-ignorable sequence,
493 *   and C is not followed by a sequence consisting of
494 *     an ignorable sequence and then a cased letter.
495 *
496 * More_Above
497 *   C is followed by one or more characters of combining class 230 (ABOVE)
498 *   in the combining character sequence.
499 *
500 * After_Soft_Dotted
501 *   The last preceding character with combining class of zero before C
502 *   was Soft_Dotted,
503 *   and there is no intervening combining character class 230 (ABOVE).
504 *
505 * Before_Dot
506 *   C is followed by combining dot above (U+0307).
507 *   Any sequence of characters with a combining class that is neither 0 nor 230
508 *   may intervene between the current character and the combining dot above.
509 *
510 * The erratum from 2002-10-31 adds the condition
511 *
512 * After_I
513 *   The last preceding base character was an uppercase I, and there is no
514 *   intervening combining character class 230 (ABOVE).
515 *
516 *   (See Jitterbug 2344 and the comments on After_I below.)
517 *
518 * Helper definitions in Unicode 3.2 UAX 21:
519 *
520 * D1. A character C is defined to be cased
521 *     if it meets any of the following criteria:
522 *
523 *   - The general category of C is Titlecase Letter (Lt)
524 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
525 *   - Given D = NFD(C), then it is not the case that:
526 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
528 *      for Unicode 3.2. Ignored.)
529 *
530 * D2. A character C is defined to be case-ignorable
531 *     if it meets either of the following criteria:
532 *
533 *   - The general category of C is
534 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
535 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
536 *   - C is one of the following characters
537 *     U+0027 APOSTROPHE
538 *     U+00AD SOFT HYPHEN (SHY)
539 *     U+2019 RIGHT SINGLE QUOTATION MARK
540 *            (the preferred character for apostrophe)
541 *
542 * D3. A case-ignorable sequence is a sequence of
543 *     zero or more case-ignorable characters.
544 */
545
/* ASCII letter tests that accept either case; the argument is evaluated twice */
#define is_letter(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_a(c) is_letter(c, 'a', 'A')
#define is_d(c) is_letter(c, 'd', 'D')
#define is_e(c) is_letter(c, 'e', 'E')
#define is_i(c) is_letter(c, 'i', 'I')
#define is_l(c) is_letter(c, 'l', 'L')
#define is_n(c) is_letter(c, 'n', 'N')
#define is_r(c) is_letter(c, 'r', 'R')
#define is_t(c) is_letter(c, 't', 'T')
#define is_u(c) is_letter(c, 'u', 'U')
#define is_z(c) is_letter(c, 'z', 'Z')

/* does c end a language code? (underscore, hyphen, or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
559
560/**
561 * Requires non-NULL locale ID but otherwise does the equivalent of
562 * checking for language codes as if uloc_getLanguage() were called:
563 * Accepts both 2- and 3-letter codes and accepts case variants.
564 */
565U_CFUNC int32_t
566ucase_getCaseLocale(const char *locale, int32_t *locCache) {
567    int32_t result;
568    char c;
569
570    if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
571        return result;
572    }
573
574    result=UCASE_LOC_ROOT;
575
576    /*
577     * This function used to use uloc_getLanguage(), but the current code
578     * removes the dependency of this low-level code on uloc implementation code
579     * and is faster because not the whole locale ID has to be
580     * examined and copied/transformed.
581     *
582     * Because this code does not want to depend on uloc, the caller must
583     * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
584     */
585    c=*locale++;
586    if(is_t(c)) {
587        /* tr or tur? */
588        c=*locale++;
589        if(is_u(c)) {
590            c=*locale++;
591        }
592        if(is_r(c)) {
593            c=*locale;
594            if(is_sep(c)) {
595                result=UCASE_LOC_TURKISH;
596            }
597        }
598    } else if(is_a(c)) {
599        /* az or aze? */
600        c=*locale++;
601        if(is_z(c)) {
602            c=*locale++;
603            if(is_e(c)) {
604                c=*locale;
605            }
606            if(is_sep(c)) {
607                result=UCASE_LOC_TURKISH;
608            }
609        }
610    } else if(is_l(c)) {
611        /* lt or lit? */
612        c=*locale++;
613        if(is_i(c)) {
614            c=*locale++;
615        }
616        if(is_t(c)) {
617            c=*locale;
618            if(is_sep(c)) {
619                result=UCASE_LOC_LITHUANIAN;
620            }
621        }
622    } else if(is_n(c)) {
623        /* nl or nld? */
624        c=*locale++;
625        if(is_l(c)) {
626            c=*locale++;
627            if(is_d(c)) {
628                c=*locale;
629            }
630            if(is_sep(c)) {
631                result=UCASE_LOC_DUTCH;
632            }
633        }
634    }
635
636    if(locCache!=NULL) {
637        *locCache=result;
638    }
639    return result;
640}
641
642/*
643 * Is followed by
644 *   {case-ignorable}* cased
645 * ?
646 * (dir determines looking forward/backward)
647 * If a character is case-ignorable, it is skipped regardless of whether
648 * it is also cased or not.
649 */
650static UBool
651isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
652    UChar32 c;
653
654    if(iter==NULL) {
655        return FALSE;
656    }
657
658    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
659        int32_t type=ucase_getTypeOrIgnorable(csp, c);
660        if(type&4) {
661            /* case-ignorable, continue with the loop */
662        } else if(type!=UCASE_NONE) {
663            return TRUE; /* followed by cased letter */
664        } else {
665            return FALSE; /* uncased and not case-ignorable */
666        }
667    }
668
669    return FALSE; /* not followed by cased letter */
670}
671
672/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
673static UBool
674isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
675    UChar32 c;
676    int32_t dotType;
677    int8_t dir;
678
679    if(iter==NULL) {
680        return FALSE;
681    }
682
683    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
684        dotType=getDotType(csp, c);
685        if(dotType==UCASE_SOFT_DOTTED) {
686            return TRUE; /* preceded by TYPE_i */
687        } else if(dotType!=UCASE_OTHER_ACCENT) {
688            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
689        }
690    }
691
692    return FALSE; /* not preceded by TYPE_i */
693}
694
695/*
696 * See Jitterbug 2344:
697 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
698 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
699 * we made those releases compatible with Unicode 3.2 which had not fixed
700 * a related bug in SpecialCasing.txt.
701 *
702 * From the Jitterbug 2344 text:
703 * ... this bug is listed as a Unicode erratum
704 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
705 * <quote>
706 * There are two errors in SpecialCasing.txt.
707 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
708 * 2. An incorrect context definition. Correct as follows:
709 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
710 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
711 * ---
712 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
713 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
714 * where the context After_I is defined as:
715 * The last preceding base character was an uppercase I, and there is no
716 * intervening combining character class 230 (ABOVE).
717 * </quote>
718 *
719 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
720 *
721 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
722 * # This matches the behavior of the canonically equivalent I-dot_above
723 *
724 * See also the description in this place in older versions of uchar.c (revision 1.100).
725 *
726 * Markus W. Scherer 2003-feb-15
727 */
728
729/* Is preceded by base character 'I' with no intervening cc=230 ? */
730static UBool
731isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
732    UChar32 c;
733    int32_t dotType;
734    int8_t dir;
735
736    if(iter==NULL) {
737        return FALSE;
738    }
739
740    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
741        if(c==0x49) {
742            return TRUE; /* preceded by I */
743        }
744        dotType=getDotType(csp, c);
745        if(dotType!=UCASE_OTHER_ACCENT) {
746            return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
747        }
748    }
749
750    return FALSE; /* not preceded by I */
751}
752
753/* Is followed by one or more cc==230 ? */
754static UBool
755isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
756    UChar32 c;
757    int32_t dotType;
758    int8_t dir;
759
760    if(iter==NULL) {
761        return FALSE;
762    }
763
764    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
765        dotType=getDotType(csp, c);
766        if(dotType==UCASE_ABOVE) {
767            return TRUE; /* at least one cc==230 following */
768        } else if(dotType!=UCASE_OTHER_ACCENT) {
769            return FALSE; /* next base character, no more cc==230 following */
770        }
771    }
772
773    return FALSE; /* no more cc==230 following */
774}
775
776/* Is followed by a dot above (without cc==230 in between) ? */
777static UBool
778isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
779    UChar32 c;
780    int32_t dotType;
781    int8_t dir;
782
783    if(iter==NULL) {
784        return FALSE;
785    }
786
787    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
788        if(c==0x307) {
789            return TRUE;
790        }
791        dotType=getDotType(csp, c);
792        if(dotType!=UCASE_OTHER_ACCENT) {
793            return FALSE; /* next base character or cc==230 in between */
794        }
795    }
796
797    return FALSE; /* no dot above following */
798}
799
800U_CAPI int32_t U_EXPORT2
801ucase_toFullLower(const UCaseProps *csp, UChar32 c,
802                  UCaseContextIterator *iter, void *context,
803                  const UChar **pString,
804                  const char *locale, int32_t *locCache)
805{
806    UChar32 result=c;
807    uint16_t props=UTRIE2_GET16(&csp->trie, c);
808    if(!PROPS_HAS_EXCEPTION(props)) {
809        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
810            result=c+UCASE_GET_DELTA(props);
811        }
812    } else {
813        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
814        uint16_t excWord=*pe++;
815        int32_t full;
816
817        pe2=pe;
818
819        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
820            /* use hardcoded conditions and mappings */
821            int32_t loc=ucase_getCaseLocale(locale, locCache);
822
823            /*
824             * Test for conditional mappings first
825             *   (otherwise the unconditional default mappings are always taken),
826             * then test for characters that have unconditional mappings in SpecialCasing.txt,
827             * then get the UnicodeData.txt mappings.
828             */
829            if( loc==UCASE_LOC_LITHUANIAN &&
830                    /* base characters, find accents above */
831                    (((c==0x49 || c==0x4a || c==0x12e) &&
832                        isFollowedByMoreAbove(csp, iter, context)) ||
833                    /* precomposed with accent above, no need to find one */
834                    (c==0xcc || c==0xcd || c==0x128))
835            ) {
836                /*
837                    # Lithuanian
838
839                    # Lithuanian retains the dot in a lowercase i when followed by accents.
840
841                    # Introduce an explicit dot above when lowercasing capital I's and J's
842                    # whenever there are more accents above.
843                    # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
844
845                    0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
846                    004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
847                    012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
848                    00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
849                    00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
850                    0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
851                 */
852                switch(c) {
853                case 0x49:  /* LATIN CAPITAL LETTER I */
854                    *pString=iDot;
855                    return 2;
856                case 0x4a:  /* LATIN CAPITAL LETTER J */
857                    *pString=jDot;
858                    return 2;
859                case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
860                    *pString=iOgonekDot;
861                    return 2;
862                case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
863                    *pString=iDotGrave;
864                    return 3;
865                case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
866                    *pString=iDotAcute;
867                    return 3;
868                case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
869                    *pString=iDotTilde;
870                    return 3;
871                default:
872                    return 0; /* will not occur */
873                }
874            /* # Turkish and Azeri */
875            } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
876                /*
877                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
878                    # The following rules handle those cases.
879
880                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
881                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
882                 */
883                return 0x69;
884            } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
885                /*
886                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
887                    # This matches the behavior of the canonically equivalent I-dot_above
888
889                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
890                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
891                 */
892                return 0; /* remove the dot (continue without output) */
893            } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
894                /*
895                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
896
897                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
898                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
899                 */
900                return 0x131;
901            } else if(c==0x130) {
902                /*
903                    # Preserve canonical equivalence for I with dot. Turkic is handled below.
904
905                    0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
906                 */
907                *pString=iDot;
908                return 2;
909            } else if(  c==0x3a3 &&
910                        !isFollowedByCasedLetter(csp, iter, context, 1) &&
911                        isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
912            ) {
913                /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
914                /*
915                    # Special case for final form of sigma
916
917                    03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
918                 */
919                return 0x3c2; /* greek small final sigma */
920            } else {
921                /* no known conditional special case mapping, use a normal mapping */
922            }
923        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
924            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
925            full&=UCASE_FULL_LOWER;
926            if(full!=0) {
927                /* set the output pointer to the lowercase mapping */
928                *pString=reinterpret_cast<const UChar *>(pe+1);
929
930                /* return the string length */
931                return full;
932            }
933        }
934
935        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
936            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
937        }
938    }
939
940    return (result==c) ? ~result : result;
941}
942
943/* internal */
944static int32_t
945toUpperOrTitle(const UCaseProps *csp, UChar32 c,
946               UCaseContextIterator *iter, void *context,
947               const UChar **pString,
948               const char *locale, int32_t *locCache,
949               UBool upperNotTitle) {
950    UChar32 result=c;
951    uint16_t props=UTRIE2_GET16(&csp->trie, c);
952    if(!PROPS_HAS_EXCEPTION(props)) {
953        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
954            result=c+UCASE_GET_DELTA(props);
955        }
956    } else {
957        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
958        uint16_t excWord=*pe++;
959        int32_t full, idx;
960
961        pe2=pe;
962
963        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
964            /* use hardcoded conditions and mappings */
965            int32_t loc=ucase_getCaseLocale(locale, locCache);
966
967            if(loc==UCASE_LOC_TURKISH && c==0x69) {
968                /*
969                    # Turkish and Azeri
970
971                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
972                    # The following rules handle those cases.
973
974                    # When uppercasing, i turns into a dotted capital I
975
976                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
977                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
978                */
979                return 0x130;
980            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
981                /*
982                    # Lithuanian
983
984                    # Lithuanian retains the dot in a lowercase i when followed by accents.
985
986                    # Remove DOT ABOVE after "i" with upper or titlecase
987
988                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
989                 */
990                return 0; /* remove the dot (continue without output) */
991            } else {
992                /* no known conditional special case mapping, use a normal mapping */
993            }
994        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
995            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
996
997            /* start of full case mapping strings */
998            ++pe;
999
1000            /* skip the lowercase and case-folding result strings */
1001            pe+=full&UCASE_FULL_LOWER;
1002            full>>=4;
1003            pe+=full&0xf;
1004            full>>=4;
1005
1006            if(upperNotTitle) {
1007                full&=0xf;
1008            } else {
1009                /* skip the uppercase result string */
1010                pe+=full&0xf;
1011                full=(full>>4)&0xf;
1012            }
1013
1014            if(full!=0) {
1015                /* set the output pointer to the result string */
1016                *pString=reinterpret_cast<const UChar *>(pe);
1017
1018                /* return the string length */
1019                return full;
1020            }
1021        }
1022
1023        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1024            idx=UCASE_EXC_TITLE;
1025        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1026            /* here, titlecase is same as uppercase */
1027            idx=UCASE_EXC_UPPER;
1028        } else {
1029            return ~c;
1030        }
1031        GET_SLOT_VALUE(excWord, idx, pe2, result);
1032    }
1033
1034    return (result==c) ? ~result : result;
1035}
1036
1037U_CAPI int32_t U_EXPORT2
1038ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1039                  UCaseContextIterator *iter, void *context,
1040                  const UChar **pString,
1041                  const char *locale, int32_t *locCache) {
1042    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1043}
1044
1045U_CAPI int32_t U_EXPORT2
1046ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1047                  UCaseContextIterator *iter, void *context,
1048                  const UChar **pString,
1049                  const char *locale, int32_t *locCache) {
1050    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1051}
1052
1053/* case folding ------------------------------------------------------------- */
1054
1055/*
1056 * Case folding is similar to lowercasing.
1057 * The result may be a simple mapping, i.e., a single code point, or
1058 * a full mapping, i.e., a string.
1059 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1060 * then only the lowercase mapping is stored.
1061 *
1062 * Some special cases are hardcoded because their conditions cannot be
1063 * parsed and processed from CaseFolding.txt.
1064 *
1065 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1066
1067# C: common case folding, common mappings shared by both simple and full mappings.
1068# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1069# S: simple case folding, mappings to single characters where different from F.
1070# T: special case for uppercase I and dotted uppercase I
1071#    - For non-Turkic languages, this mapping is normally not used.
1072#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1073#
1074# Usage:
1075#  A. To do a simple case folding, use the mappings with status C + S.
1076#  B. To do a full case folding, use the mappings with status C + F.
1077#
1078#    The mappings with status T can be used or omitted depending on the desired case-folding
1079#    behavior. (The default option is to exclude them.)
1080
1081 * Unicode 3.2 has 'T' mappings as follows:
1082
10830049; T; 0131; # LATIN CAPITAL LETTER I
10840130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1085
1086 * while the default mappings for these code points are:
1087
10880049; C; 0069; # LATIN CAPITAL LETTER I
10890130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1090
1091 * U+0130 has no simple case folding (simple-case-folds to itself).
1092 */
1093
1094/* return the simple case folding mapping for c */
1095U_CAPI UChar32 U_EXPORT2
1096ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1097    uint16_t props=UTRIE2_GET16(&csp->trie, c);
1098    if(!PROPS_HAS_EXCEPTION(props)) {
1099        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1100            c+=UCASE_GET_DELTA(props);
1101        }
1102    } else {
1103        const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1104        uint16_t excWord=*pe++;
1105        int32_t idx;
1106        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1107            /* special case folding mappings, hardcoded */
1108            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1109                /* default mappings */
1110                if(c==0x49) {
1111                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1112                    return 0x69;
1113                } else if(c==0x130) {
1114                    /* no simple case folding for U+0130 */
1115                    return c;
1116                }
1117            } else {
1118                /* Turkic mappings */
1119                if(c==0x49) {
1120                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1121                    return 0x131;
1122                } else if(c==0x130) {
1123                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1124                    return 0x69;
1125                }
1126            }
1127        }
1128        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1129            idx=UCASE_EXC_FOLD;
1130        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1131            idx=UCASE_EXC_LOWER;
1132        } else {
1133            return c;
1134        }
1135        GET_SLOT_VALUE(excWord, idx, pe, c);
1136    }
1137    return c;
1138}
1139
1140/*
1141 * Issue for canonical caseless match (UAX #21):
1142 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1143 * canonical equivalence, unlike default-option casefolding.
1144 * For example, I-grave and I + grave fold to strings that are not canonically
1145 * equivalent.
1146 * For more details, see the comment in unorm_compare() in unorm.cpp
1147 * and the intermediate prototype changes for Jitterbug 2021.
1148 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1149 *
1150 * This did not get fixed because it appears that it is not possible to fix
1151 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1152 * together in a way that they still fold to common result strings.
1153 */
1154
1155U_CAPI int32_t U_EXPORT2
1156ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1157                    const UChar **pString,
1158                    uint32_t options)
1159{
1160    UChar32 result=c;
1161    uint16_t props=UTRIE2_GET16(&csp->trie, c);
1162    if(!PROPS_HAS_EXCEPTION(props)) {
1163        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1164            result=c+UCASE_GET_DELTA(props);
1165        }
1166    } else {
1167        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1168        uint16_t excWord=*pe++;
1169        int32_t full, idx;
1170
1171        pe2=pe;
1172
1173        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1174            /* use hardcoded conditions and mappings */
1175            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1176                /* default mappings */
1177                if(c==0x49) {
1178                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1179                    return 0x69;
1180                } else if(c==0x130) {
1181                    /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1182                    *pString=iDot;
1183                    return 2;
1184                }
1185            } else {
1186                /* Turkic mappings */
1187                if(c==0x49) {
1188                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1189                    return 0x131;
1190                } else if(c==0x130) {
1191                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1192                    return 0x69;
1193                }
1194            }
1195        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1196            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1197
1198            /* start of full case mapping strings */
1199            ++pe;
1200
1201            /* skip the lowercase result string */
1202            pe+=full&UCASE_FULL_LOWER;
1203            full=(full>>4)&0xf;
1204
1205            if(full!=0) {
1206                /* set the output pointer to the result string */
1207                *pString=reinterpret_cast<const UChar *>(pe);
1208
1209                /* return the string length */
1210                return full;
1211            }
1212        }
1213
1214        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1215            idx=UCASE_EXC_FOLD;
1216        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1217            idx=UCASE_EXC_LOWER;
1218        } else {
1219            return ~c;
1220        }
1221        GET_SLOT_VALUE(excWord, idx, pe2, result);
1222    }
1223
1224    return (result==c) ? ~result : result;
1225}
1226
1227/* case mapping properties API ---------------------------------------------- */
1228
/*
 * Expands to a pointer to the ucase_props_singleton case properties object.
 * The replacement list is parenthesized so that the macro also works inside
 * larger expressions, e.g. GET_CASE_PROPS()->member; without the parentheses
 * that would parse as &(ucase_props_singleton->member).
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
1230
1231/* public API (see uchar.h) */
1232
1233U_CAPI UBool U_EXPORT2
1234u_isULowercase(UChar32 c) {
1235    return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1236}
1237
1238U_CAPI UBool U_EXPORT2
1239u_isUUppercase(UChar32 c) {
1240    return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1241}
1242
1243/* Transforms the Unicode character to its lower case equivalent.*/
1244U_CAPI UChar32 U_EXPORT2
1245u_tolower(UChar32 c) {
1246    return ucase_tolower(GET_CASE_PROPS(), c);
1247}
1248
1249/* Transforms the Unicode character to its upper case equivalent.*/
1250U_CAPI UChar32 U_EXPORT2
1251u_toupper(UChar32 c) {
1252    return ucase_toupper(GET_CASE_PROPS(), c);
1253}
1254
1255/* Transforms the Unicode character to its title case equivalent.*/
1256U_CAPI UChar32 U_EXPORT2
1257u_totitle(UChar32 c) {
1258    return ucase_totitle(GET_CASE_PROPS(), c);
1259}
1260
1261/* return the simple case folding mapping for c */
1262U_CAPI UChar32 U_EXPORT2
1263u_foldCase(UChar32 c, uint32_t options) {
1264    return ucase_fold(GET_CASE_PROPS(), c, options);
1265}
1266
1267U_CFUNC int32_t U_EXPORT2
1268ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1269    /* case mapping properties */
1270    const UChar *resultString;
1271    int32_t locCache;
1272    const UCaseProps *csp=GET_CASE_PROPS();
1273    if(csp==NULL) {
1274        return FALSE;
1275    }
1276    switch(which) {
1277    case UCHAR_LOWERCASE:
1278        return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1279    case UCHAR_UPPERCASE:
1280        return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1281    case UCHAR_SOFT_DOTTED:
1282        return ucase_isSoftDotted(csp, c);
1283    case UCHAR_CASE_SENSITIVE:
1284        return ucase_isCaseSensitive(csp, c);
1285    case UCHAR_CASED:
1286        return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1287    case UCHAR_CASE_IGNORABLE:
1288        return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1289    /*
1290     * Note: The following Changes_When_Xyz are defined as testing whether
1291     * the NFD form of the input changes when Xyz-case-mapped.
1292     * However, this simpler implementation of these properties,
1293     * ignoring NFD, passes the tests.
1294     * The implementation needs to be changed if the tests start failing.
1295     * When that happens, optimizations should be used to work with the
1296     * per-single-code point ucase_toFullXyz() functions unless
1297     * the NFD form has more than one code point,
1298     * and the property starts set needs to be the union of the
1299     * start sets for normalization and case mappings.
1300     */
1301    case UCHAR_CHANGES_WHEN_LOWERCASED:
1302        locCache=UCASE_LOC_ROOT;
1303        return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1304    case UCHAR_CHANGES_WHEN_UPPERCASED:
1305        locCache=UCASE_LOC_ROOT;
1306        return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1307    case UCHAR_CHANGES_WHEN_TITLECASED:
1308        locCache=UCASE_LOC_ROOT;
1309        return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1310    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1311    case UCHAR_CHANGES_WHEN_CASEMAPPED:
1312        locCache=UCASE_LOC_ROOT;
1313        return (UBool)(
1314            ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1315            ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1316            ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1317    default:
1318        return FALSE;
1319    }
1320}
1321