1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  UCaseProps.java
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2005jan29
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 *   Java port of ucase.h/.c.
20 */
21
22package com.ibm.icu.impl;
23
24import java.io.IOException;
25import java.nio.ByteBuffer;
26import java.util.Iterator;
27import java.util.Locale;
28
29import com.ibm.icu.lang.UCharacter;
30import com.ibm.icu.lang.UProperty;
31import com.ibm.icu.text.UTF16;
32import com.ibm.icu.text.UnicodeSet;
33import com.ibm.icu.util.ICUUncheckedIOException;
34import com.ibm.icu.util.ULocale;
35
36public final class UCaseProps {
37
38    // constructors etc. --------------------------------------------------- ***
39
40    // port of ucase_openProps()
41    private UCaseProps() throws IOException {
42        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME);
43        readData(bytes);
44    }
45
46    private final void readData(ByteBuffer bytes) throws IOException {
47        // read the header
48        ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
49
50        // read indexes[]
51        int count=bytes.getInt();
52        if(count<IX_TOP) {
53            throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
54        }
55        indexes=new int[count];
56
57        indexes[0]=count;
58        for(int i=1; i<count; ++i) {
59            indexes[i]=bytes.getInt();
60        }
61
62        // read the trie
63        trie=Trie2_16.createFromSerialized(bytes);
64        int expectedTrieLength=indexes[IX_TRIE_SIZE];
65        int trieLength=trie.getSerializedLength();
66        if(trieLength>expectedTrieLength) {
67            throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
68        }
69        // skip padding after trie bytes
70        ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength);
71
72        // read exceptions[]
73        count=indexes[IX_EXC_LENGTH];
74        if(count>0) {
75            exceptions=ICUBinary.getString(bytes, count, 0);
76        }
77
78        // read unfold[]
79        count=indexes[IX_UNFOLD_LENGTH];
80        if(count>0) {
81            unfold=ICUBinary.getChars(bytes, count, 0);
82        }
83    }
84
85    // implement ICUBinary.Authenticate
86    private final static class IsAcceptable implements ICUBinary.Authenticate {
87        @Override
88        public boolean isDataVersionAcceptable(byte version[]) {
89            return version[0]==3;
90        }
91    }
92
93    // set of property starts for UnicodeSet ------------------------------- ***
94
95    public final void addPropertyStarts(UnicodeSet set) {
96        /* add the start code point of each same-value range of the trie */
97        Iterator<Trie2.Range> trieIterator=trie.iterator();
98        Trie2.Range range;
99        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
100            set.add(range.startCodePoint);
101        }
102
103        /* add code points with hardcoded properties, plus the ones following them */
104
105        /* (none right now, see comment below) */
106
107        /*
108         * Omit code points with hardcoded specialcasing properties
109         * because we do not build property UnicodeSets for them right now.
110         */
111    }
112
113    // data access primitives ---------------------------------------------- ***
114    private static final int getExceptionsOffset(int props) {
115        return props>>EXC_SHIFT;
116    }
117
118    private static final boolean propsHasException(int props) {
119        return (props&EXCEPTION)!=0;
120    }
121
    /*
     * Bit-count lookup table: flagsOffset[b] is the number of 1 bits in the
     * 8-bit integer value b (0..255).
     * slotOffset() indexes it with the flag bits below a slot's own bit,
     * which gives the number of slots present before that slot.
     */
    private static final byte flagsOffset[/*256*/]={
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    };
141
142    private static final boolean hasSlot(int flags, int index) {
143        return (flags&(1<<index))!=0;
144    }
145    private static final byte slotOffset(int flags, int index) {
146        return flagsOffset[flags&((1<<index)-1)];
147    }
148
149    /*
150     * Get the value of an optional-value slot where hasSlot(excWord, index).
151     *
152     * @param excWord (in) initial exceptions word
153     * @param index (in) desired slot index
154     * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++);
155     * @return bits 31..0: slot value
156     *             63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
157     */
158    private final long getSlotValueAndOffset(int excWord, int index, int excOffset) {
159        long value;
160        if((excWord&EXC_DOUBLE_SLOTS)==0) {
161            excOffset+=slotOffset(excWord, index);
162            value=exceptions.charAt(excOffset);
163        } else {
164            excOffset+=2*slotOffset(excWord, index);
165            value=exceptions.charAt(excOffset++);
166            value=(value<<16)|exceptions.charAt(excOffset);
167        }
168        return value |((long)excOffset<<32);
169    }
170
171    /* same as getSlotValueAndOffset() but does not return the slot offset */
172    private final int getSlotValue(int excWord, int index, int excOffset) {
173        int value;
174        if((excWord&EXC_DOUBLE_SLOTS)==0) {
175            excOffset+=slotOffset(excWord, index);
176            value=exceptions.charAt(excOffset);
177        } else {
178            excOffset+=2*slotOffset(excWord, index);
179            value=exceptions.charAt(excOffset++);
180            value=(value<<16)|exceptions.charAt(excOffset);
181        }
182        return value;
183    }
184
185    // simple case mappings ------------------------------------------------ ***
186
187    public final int tolower(int c) {
188        int props=trie.get(c);
189        if(!propsHasException(props)) {
190            if(getTypeFromProps(props)>=UPPER) {
191                c+=getDelta(props);
192            }
193        } else {
194            int excOffset=getExceptionsOffset(props);
195            int excWord=exceptions.charAt(excOffset++);
196            if(hasSlot(excWord, EXC_LOWER)) {
197                c=getSlotValue(excWord, EXC_LOWER, excOffset);
198            }
199        }
200        return c;
201    }
202
203    public final int toupper(int c) {
204        int props=trie.get(c);
205        if(!propsHasException(props)) {
206            if(getTypeFromProps(props)==LOWER) {
207                c+=getDelta(props);
208            }
209        } else {
210            int excOffset=getExceptionsOffset(props);
211            int excWord=exceptions.charAt(excOffset++);
212            if(hasSlot(excWord, EXC_UPPER)) {
213                c=getSlotValue(excWord, EXC_UPPER, excOffset);
214            }
215        }
216        return c;
217    }
218
219    public final int totitle(int c) {
220        int props=trie.get(c);
221        if(!propsHasException(props)) {
222            if(getTypeFromProps(props)==LOWER) {
223                c+=getDelta(props);
224            }
225        } else {
226            int excOffset=getExceptionsOffset(props);
227            int excWord=exceptions.charAt(excOffset++);
228            int index;
229            if(hasSlot(excWord, EXC_TITLE)) {
230                index=EXC_TITLE;
231            } else if(hasSlot(excWord, EXC_UPPER)) {
232                index=EXC_UPPER;
233            } else {
234                return c;
235            }
236            c=getSlotValue(excWord, index, excOffset);
237        }
238        return c;
239    }
240
241    /**
242     * Adds all simple case mappings and the full case folding for c to sa,
243     * and also adds special case closure mappings.
244     * c itself is not added.
245     * For example, the mappings
246     * - for s include long s
247     * - for sharp s include ss
248     * - for k include the Kelvin sign
249     */
250    public final void addCaseClosure(int c, UnicodeSet set) {
251        /*
252         * Hardcode the case closure of i and its relatives and ignore the
253         * data file data for these characters.
254         * The Turkic dotless i and dotted I with their case mapping conditions
255         * and case folding option make the related characters behave specially.
256         * This code matches their closure behavior to their case folding behavior.
257         */
258
259        switch(c) {
260        case 0x49:
261            /* regular i and I are in one equivalence class */
262            set.add(0x69);
263            return;
264        case 0x69:
265            set.add(0x49);
266            return;
267        case 0x130:
268            /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
269            set.add(iDot);
270            return;
271        case 0x131:
272            /* dotless i is in a class by itself */
273            return;
274        default:
275            /* otherwise use the data file data */
276            break;
277        }
278
279        int props=trie.get(c);
280        if(!propsHasException(props)) {
281            if(getTypeFromProps(props)!=NONE) {
282                /* add the one simple case mapping, no matter what type it is */
283                int delta=getDelta(props);
284                if(delta!=0) {
285                    set.add(c+delta);
286                }
287            }
288        } else {
289            /*
290             * c has exceptions, so there may be multiple simple and/or
291             * full case mappings. Add them all.
292             */
293            int excOffset0, excOffset=getExceptionsOffset(props);
294            int closureOffset;
295            int excWord=exceptions.charAt(excOffset++);
296            int index, closureLength, fullLength, length;
297
298            excOffset0=excOffset;
299
300            /* add all simple case mappings */
301            for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
302                if(hasSlot(excWord, index)) {
303                    excOffset=excOffset0;
304                    c=getSlotValue(excWord, index, excOffset);
305                    set.add(c);
306                }
307            }
308
309            /* get the closure string pointer & length */
310            if(hasSlot(excWord, EXC_CLOSURE)) {
311                excOffset=excOffset0;
312                long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
313                closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
314                closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
315            } else {
316                closureLength=0;
317                closureOffset=0;
318            }
319
320            /* add the full case folding */
321            if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
322                excOffset=excOffset0;
323                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
324                fullLength=(int)value;
325
326                /* start of full case mapping strings */
327                excOffset=(int)(value>>32)+1;
328
329                fullLength&=0xffff; /* bits 16 and higher are reserved */
330
331                /* skip the lowercase result string */
332                excOffset+=fullLength&FULL_LOWER;
333                fullLength>>=4;
334
335                /* add the full case folding string */
336                length=fullLength&0xf;
337                if(length!=0) {
338                    set.add(exceptions.substring(excOffset, excOffset+length));
339                    excOffset+=length;
340                }
341
342                /* skip the uppercase and titlecase strings */
343                fullLength>>=4;
344                excOffset+=fullLength&0xf;
345                fullLength>>=4;
346                excOffset+=fullLength;
347
348                closureOffset=excOffset; /* behind full case mappings */
349            }
350
351            /* add each code point in the closure string */
352            int limit=closureOffset+closureLength;
353            for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
354                c=exceptions.codePointAt(index);
355                set.add(c);
356            }
357        }
358    }
359
360    /*
361     * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
362     * must be s.length()>0 and max>0 and s.length()<=max
363     */
364    private final int strcmpMax(String s, int unfoldOffset, int max) {
365        int i1, length, c1, c2;
366
367        length=s.length();
368        max-=length; /* we require length<=max, so no need to decrement max in the loop */
369        i1=0;
370        do {
371            c1=s.charAt(i1++);
372            c2=unfold[unfoldOffset++];
373            if(c2==0) {
374                return 1; /* reached the end of t but not of s */
375            }
376            c1-=c2;
377            if(c1!=0) {
378                return c1; /* return difference result */
379            }
380        } while(--length>0);
381        /* ends with length==0 */
382
383        if(max==0 || unfold[unfoldOffset]==0) {
384            return 0; /* equal to length of both strings */
385        } else {
386            return -max; /* return lengh difference */
387        }
388    }
389
390    /**
391     * Maps the string to single code points and adds the associated case closure
392     * mappings.
393     * The string is mapped to code points if it is their full case folding string.
394     * In other words, this performs a reverse full case folding and then
395     * adds the case closure items of the resulting code points.
396     * If the string is found and its closure applied, then
397     * the string itself is added as well as part of its code points' closure.
398     *
399     * @return true if the string was found
400     */
401    public final boolean addStringCaseClosure(String s, UnicodeSet set) {
402        int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
403
404        if(unfold==null || s==null) {
405            return false; /* no reverse case folding data, or no string */
406        }
407        length=s.length();
408        if(length<=1) {
409            /* the string is too short to find any match */
410            /*
411             * more precise would be:
412             * if(!u_strHasMoreChar32Than(s, length, 1))
413             * but this does not make much practical difference because
414             * a single supplementary code point would just not be found
415             */
416            return false;
417        }
418
419        unfoldRows=unfold[UNFOLD_ROWS];
420        unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH];
421        unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH];
422        //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
423
424        if(length>unfoldStringWidth) {
425            /* the string is too long to find any match */
426            return false;
427        }
428
429        /* do a binary search for the string */
430        start=0;
431        limit=unfoldRows;
432        while(start<limit) {
433            i=(start+limit)/2;
434            unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above
435            result=strcmpMax(s, unfoldOffset, unfoldStringWidth);
436
437            if(result==0) {
438                /* found the string: add each code point, and its case closure */
439                int c;
440
441                for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) {
442                    c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i);
443                    set.add(c);
444                    addCaseClosure(c, set);
445                }
446                return true;
447            } else if(result<0) {
448                limit=i;
449            } else /* result>0 */ {
450                start=i+1;
451            }
452        }
453
454        return false; /* string not found */
455    }
456
457    /** @return NONE, LOWER, UPPER, TITLE */
458    public final int getType(int c) {
459        return getTypeFromProps(trie.get(c));
460    }
461
462    /** @return like getType() but also sets IGNORABLE if c is case-ignorable */
463    public final int getTypeOrIgnorable(int c) {
464        return getTypeAndIgnorableFromProps(trie.get(c));
465    }
466
467    /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
468    public final int getDotType(int c) {
469        int props=trie.get(c);
470        if(!propsHasException(props)) {
471            return props&DOT_MASK;
472        } else {
473            return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK;
474        }
475    }
476
477    public final boolean isSoftDotted(int c) {
478        return getDotType(c)==SOFT_DOTTED;
479    }
480
481    public final boolean isCaseSensitive(int c) {
482        return (trie.get(c)&SENSITIVE)!=0;
483    }
484
485    // string casing ------------------------------------------------------- ***
486
487    /*
488     * These internal functions form the core of string case mappings.
489     * They map single code points to result code points or strings and take
490     * all necessary conditions (context, locale ID, options) into account.
491     *
492     * They do not iterate over the source or write to the destination
493     * so that the same functions are useful for non-standard string storage,
494     * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
495     * For the same reason, the "surrounding text" context is passed in as a
496     * ContextIterator which does not make any assumptions about
497     * the underlying storage.
498     *
499     * This section contains helper functions that check for conditions
500     * in the input text surrounding the current code point
501     * according to SpecialCasing.txt.
502     *
503     * Each helper function gets the index
504     * - after the current code point if it looks at following text
505     * - before the current code point if it looks at preceding text
506     *
507     * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
508     *
509     * Final_Sigma
510     *   C is preceded by a sequence consisting of
511     *     a cased letter and a case-ignorable sequence,
512     *   and C is not followed by a sequence consisting of
513     *     an ignorable sequence and then a cased letter.
514     *
515     * More_Above
516     *   C is followed by one or more characters of combining class 230 (ABOVE)
517     *   in the combining character sequence.
518     *
519     * After_Soft_Dotted
520     *   The last preceding character with combining class of zero before C
521     *   was Soft_Dotted,
522     *   and there is no intervening combining character class 230 (ABOVE).
523     *
524     * Before_Dot
525     *   C is followed by combining dot above (U+0307).
526     *   Any sequence of characters with a combining class that is neither 0 nor 230
527     *   may intervene between the current character and the combining dot above.
528     *
529     * The erratum from 2002-10-31 adds the condition
530     *
531     * After_I
532     *   The last preceding base character was an uppercase I, and there is no
533     *   intervening combining character class 230 (ABOVE).
534     *
535     *   (See Jitterbug 2344 and the comments on After_I below.)
536     *
537     * Helper definitions in Unicode 3.2 UAX 21:
538     *
539     * D1. A character C is defined to be cased
540     *     if it meets any of the following criteria:
541     *
542     *   - The general category of C is Titlecase Letter (Lt)
543     *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
544     *   - Given D = NFD(C), then it is not the case that:
545     *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
     *     (This third criterion does not add any characters to the list
547     *      for Unicode 3.2. Ignored.)
548     *
549     * D2. A character C is defined to be case-ignorable
550     *     if it meets either of the following criteria:
551     *
552     *   - The general category of C is
553     *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
554     *     Letter Modifier (Lm), or Symbol Modifier (Sk)
555     *   - C is one of the following characters
556     *     U+0027 APOSTROPHE
557     *     U+00AD SOFT HYPHEN (SHY)
558     *     U+2019 RIGHT SINGLE QUOTATION MARK
559     *            (the preferred character for apostrophe)
560     *
561     * D3. A case-ignorable sequence is a sequence of
562     *     zero or more case-ignorable characters.
563     */
564
    /**
     * Iterator for string case mappings, which need to look at the
     * context (surrounding text) of a given character for conditional mappings.
     *
     * The iterator only needs to go backward or forward away from the
     * character in question. It does not use any indexes on this interface.
     * It does not support random access or an arbitrary change of
     * iteration direction.
     *
     * The code point being case-mapped itself is never returned by
     * this iterator.
     *
     * Usage pattern in this class: call reset(dir) once, then call next()
     * repeatedly until it returns a negative value.
     */
    public interface ContextIterator {
        /**
         * Reset the iterator for forward or backward iteration.
         * @param dir >0: Begin iterating forward from the first code point
         * after the one that is being case-mapped.
         *            <0: Begin iterating backward from the first code point
         * before the one that is being case-mapped.
         *            (The behavior for dir==0 is not specified here.)
         */
        public void reset(int dir);
        /**
         * Iterate and return the next code point, moving in the direction
         * determined by the reset() call.
         * @return Next code point, or <0 when the iteration is done.
         */
        public int next();
    }
593
    /**
     * For string case mappings, a single character (a code point) is mapped
     * either to itself (in which case in-place mapping functions do nothing),
     * or to another single code point, or to a string.
     * Aside from the string contents, these are indicated with a single int
     * value as follows:
     *
     * Mapping to self: Negative values (~self instead of -self to support U+0000)
     *
     * Mapping to another code point: Positive values >MAX_STRING_LENGTH
     *
     * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
     * returned. Note that the string result may indeed have zero length.
     *
     * This is the return-value convention referenced by toFullLower().
     */
    public static final int MAX_STRING_LENGTH=0x1f;

    /*
     * Case locale identifiers, as returned by the getCaseLocale() overloads.
     * LOC_ROOT is the value for languages without special behavior.
     */
    // private static final int LOC_UNKNOWN=0;  (commented out)
    public static final int LOC_ROOT=1;
    private static final int LOC_TURKISH=2;
    private static final int LOC_LITHUANIAN=3;
    static final int LOC_GREEK=4;
    public static final int LOC_DUTCH=5;
616
617    public static final int getCaseLocale(Locale locale) {
618        return getCaseLocale(locale.getLanguage());
619    }
620    public static final int getCaseLocale(ULocale locale) {
621        return getCaseLocale(locale.getLanguage());
622    }
623    /** Accepts both 2- and 3-letter language subtags. */
624    private static final int getCaseLocale(String language) {
625        // Check the subtag length to reduce the number of comparisons
626        // for locales without special behavior.
627        // Fastpath for English "en" which is often used for default (=root locale) case mappings,
628        // and for Chinese "zh": Very common but no special case mapping behavior.
629        if(language.length()==2) {
630            if(language.equals("en") || language.charAt(0)>'t') {
631                return LOC_ROOT;
632            } else if(language.equals("tr") || language.equals("az")) {
633                return LOC_TURKISH;
634            } else if(language.equals("el")) {
635                return LOC_GREEK;
636            } else if(language.equals("lt")) {
637                return LOC_LITHUANIAN;
638            } else if(language.equals("nl")) {
639                return LOC_DUTCH;
640            }
641        } else if(language.length()==3) {
642            if(language.equals("tur") || language.equals("aze")) {
643                return LOC_TURKISH;
644            } else if(language.equals("ell")) {
645                return LOC_GREEK;
646            } else if(language.equals("lit")) {
647                return LOC_LITHUANIAN;
648            } else if(language.equals("nld")) {
649                return LOC_DUTCH;
650            }
651        }
652        return LOC_ROOT;
653    }
654
655    /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
656    private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) {
657        int c;
658
659        if(iter==null) {
660            return false;
661        }
662
663        for(iter.reset(dir); (c=iter.next())>=0;) {
664            int type=getTypeOrIgnorable(c);
665            if((type&4)!=0) {
666                /* case-ignorable, continue with the loop */
667            } else if(type!=NONE) {
668                return true; /* followed by cased letter */
669            } else {
670                return false; /* uncased and not case-ignorable */
671            }
672        }
673
674        return false; /* not followed by cased letter */
675    }
676
677    /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
678    private final boolean isPrecededBySoftDotted(ContextIterator iter) {
679        int c;
680        int dotType;
681
682        if(iter==null) {
683            return false;
684        }
685
686        for(iter.reset(-1); (c=iter.next())>=0;) {
687            dotType=getDotType(c);
688            if(dotType==SOFT_DOTTED) {
689                return true; /* preceded by TYPE_i */
690            } else if(dotType!=OTHER_ACCENT) {
691                return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
692            }
693        }
694
695        return false; /* not preceded by TYPE_i */
696    }
697
698    /*
699     * See Jitterbug 2344:
700     * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
701     * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
702     * we made those releases compatible with Unicode 3.2 which had not fixed
703     * a related bug in SpecialCasing.txt.
704     *
705     * From the Jitterbug 2344 text:
706     * ... this bug is listed as a Unicode erratum
707     * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
708     * <quote>
709     * There are two errors in SpecialCasing.txt.
710     * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
711     * 2. An incorrect context definition. Correct as follows:
712     * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
713     * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
714     * ---
715     * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
716     * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
717     * where the context After_I is defined as:
718     * The last preceding base character was an uppercase I, and there is no
719     * intervening combining character class 230 (ABOVE).
720     * </quote>
721     *
722     * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
723     *
724     * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
725     * # This matches the behavior of the canonically equivalent I-dot_above
726     *
727     * See also the description in this place in older versions of uchar.c (revision 1.100).
728     *
729     * Markus W. Scherer 2003-feb-15
730     */
731
732    /* Is preceded by base character 'I' with no intervening cc=230 ? */
733    private final boolean isPrecededBy_I(ContextIterator iter) {
734        int c;
735        int dotType;
736
737        if(iter==null) {
738            return false;
739        }
740
741        for(iter.reset(-1); (c=iter.next())>=0;) {
742            if(c==0x49) {
743                return true; /* preceded by I */
744            }
745            dotType=getDotType(c);
746            if(dotType!=OTHER_ACCENT) {
747                return false; /* preceded by different base character (not I), or intervening cc==230 */
748            }
749        }
750
751        return false; /* not preceded by I */
752    }
753
754    /* Is followed by one or more cc==230 ? */
755    private final boolean isFollowedByMoreAbove(ContextIterator iter) {
756        int c;
757        int dotType;
758
759        if(iter==null) {
760            return false;
761        }
762
763        for(iter.reset(1); (c=iter.next())>=0;) {
764            dotType=getDotType(c);
765            if(dotType==ABOVE) {
766                return true; /* at least one cc==230 following */
767            } else if(dotType!=OTHER_ACCENT) {
768                return false; /* next base character, no more cc==230 following */
769            }
770        }
771
772        return false; /* no more cc==230 following */
773    }
774
775    /* Is followed by a dot above (without cc==230 in between) ? */
776    private final boolean isFollowedByDotAbove(ContextIterator iter) {
777        int c;
778        int dotType;
779
780        if(iter==null) {
781            return false;
782        }
783
784        for(iter.reset(1); (c=iter.next())>=0; ) {
785            if(c==0x307) {
786                return true;
787            }
788            dotType=getDotType(c);
789            if(dotType!=OTHER_ACCENT) {
790                return false; /* next base character or cc==230 in between */
791            }
792        }
793
794        return false; /* no dot above following */
795    }
796
    /*
     * Hardcoded result strings: a base letter followed by U+0307
     * (combining dot above), for some also followed by a second
     * combining mark (U+0300, U+0301 or U+0303).
     * iDot is used by addCaseClosure() for U+0130.
     * NOTE(review): the other strings are presumably used by the hardcoded
     * special-case mappings further down in this file -- confirm.
     */
    private static final String
        iDot=       "i\u0307",
        jDot=       "j\u0307",
        iOgonekDot= "\u012f\u0307",
        iDotGrave=  "i\u0307\u0300",
        iDotAcute=  "i\u0307\u0301",
        iDotTilde=  "i\u0307\u0303";
804
805    /**
806     * Get the full lowercase mapping for c.
807     *
808     * @param c Character to be mapped.
809     * @param iter Character iterator, used for context-sensitive mappings.
810     *             See ContextIterator for details.
811     *             If iter==null then a context-independent result is returned.
812     * @param out If the mapping result is a string, then it is appended to out.
813     * @param caseLocale Case locale value from ucase_getCaseLocale().
814     * @return Output code point or string length, see MAX_STRING_LENGTH.
815     *
816     * @see ContextIterator
817     * @see #MAX_STRING_LENGTH
818     * @internal
819     */
820    public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) {
821        int result, props;
822
823        result=c;
824        props=trie.get(c);
825        if(!propsHasException(props)) {
826            if(getTypeFromProps(props)>=UPPER) {
827                result=c+getDelta(props);
828            }
829        } else {
830            int excOffset=getExceptionsOffset(props), excOffset2;
831            int excWord=exceptions.charAt(excOffset++);
832            int full;
833
834            excOffset2=excOffset;
835
836            if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
837                /* use hardcoded conditions and mappings */
838                /*
839                 * Test for conditional mappings first
840                 *   (otherwise the unconditional default mappings are always taken),
841                 * then test for characters that have unconditional mappings in SpecialCasing.txt,
842                 * then get the UnicodeData.txt mappings.
843                 */
844                if( caseLocale==LOC_LITHUANIAN &&
845                        /* base characters, find accents above */
846                        (((c==0x49 || c==0x4a || c==0x12e) &&
847                            isFollowedByMoreAbove(iter)) ||
848                        /* precomposed with accent above, no need to find one */
849                        (c==0xcc || c==0xcd || c==0x128))
850                ) {
851                    /*
852                        # Lithuanian
853
854                        # Lithuanian retains the dot in a lowercase i when followed by accents.
855
856                        # Introduce an explicit dot above when lowercasing capital I's and J's
857                        # whenever there are more accents above.
858                        # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
859
860                        0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
861                        004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
862                        012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
863                        00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
864                        00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
865                        0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
866                     */
867                    try {
868                        switch(c) {
869                        case 0x49:  /* LATIN CAPITAL LETTER I */
870                            out.append(iDot);
871                            return 2;
872                        case 0x4a:  /* LATIN CAPITAL LETTER J */
873                            out.append(jDot);
874                            return 2;
875                        case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
876                            out.append(iOgonekDot);
877                            return 2;
878                        case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
879                            out.append(iDotGrave);
880                            return 3;
881                        case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
882                            out.append(iDotAcute);
883                            return 3;
884                        case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
885                            out.append(iDotTilde);
886                            return 3;
887                        default:
888                            return 0; /* will not occur */
889                        }
890                    } catch (IOException e) {
891                        throw new ICUUncheckedIOException(e);
892                    }
893                /* # Turkish and Azeri */
894                } else if(caseLocale==LOC_TURKISH && c==0x130) {
895                    /*
896                        # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
897                        # The following rules handle those cases.
898
899                        0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
900                        0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
901                     */
902                    return 0x69;
903                } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
904                    /*
905                        # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
906                        # This matches the behavior of the canonically equivalent I-dot_above
907
908                        0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
909                        0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
910                     */
911                    return 0; /* remove the dot (continue without output) */
912                } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
913                    /*
914                        # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
915
916                        0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
917                        0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
918                     */
919                    return 0x131;
920                } else if(c==0x130) {
921                    /*
922                        # Preserve canonical equivalence for I with dot. Turkic is handled below.
923
924                        0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
925                     */
926                    try {
927                        out.append(iDot);
928                        return 2;
929                    } catch (IOException e) {
930                        throw new ICUUncheckedIOException(e);
931                    }
932                } else if(  c==0x3a3 &&
933                            !isFollowedByCasedLetter(iter, 1) &&
934                            isFollowedByCasedLetter(iter, -1) /* -1=preceded */
935                ) {
936                    /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
937                    /*
938                        # Special case for final form of sigma
939
940                        03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
941                     */
942                    return 0x3c2; /* greek small final sigma */
943                } else {
944                    /* no known conditional special case mapping, use a normal mapping */
945                }
946            } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
947                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
948                full=(int)value&FULL_LOWER;
949                if(full!=0) {
950                    /* start of full case mapping strings */
951                    excOffset=(int)(value>>32)+1;
952
953                    try {
954                        // append the lowercase mapping
955                        out.append(exceptions, excOffset, excOffset+full);
956
957                        /* return the string length */
958                        return full;
959                    } catch (IOException e) {
960                        throw new ICUUncheckedIOException(e);
961                    }
962                }
963            }
964
965            if(hasSlot(excWord, EXC_LOWER)) {
966                result=getSlotValue(excWord, EXC_LOWER, excOffset2);
967            }
968        }
969
970        return (result==c) ? ~result : result;
971    }
972
973    /* internal */
974    private final int toUpperOrTitle(int c, ContextIterator iter,
975                                     Appendable out,
976                                     int loc,
977                                     boolean upperNotTitle) {
978        int result;
979        int props;
980
981        result=c;
982        props=trie.get(c);
983        if(!propsHasException(props)) {
984            if(getTypeFromProps(props)==LOWER) {
985                result=c+getDelta(props);
986            }
987        } else {
988            int excOffset=getExceptionsOffset(props), excOffset2;
989            int excWord=exceptions.charAt(excOffset++);
990            int full, index;
991
992            excOffset2=excOffset;
993
994            if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
995                /* use hardcoded conditions and mappings */
996                if(loc==LOC_TURKISH && c==0x69) {
997                    /*
998                        # Turkish and Azeri
999
1000                        # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1001                        # The following rules handle those cases.
1002
1003                        # When uppercasing, i turns into a dotted capital I
1004
1005                        0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1006                        0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1007                    */
1008                    return 0x130;
1009                } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {
1010                    /*
1011                        # Lithuanian
1012
1013                        # Lithuanian retains the dot in a lowercase i when followed by accents.
1014
1015                        # Remove DOT ABOVE after "i" with upper or titlecase
1016
1017                        0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1018                     */
1019                    return 0; /* remove the dot (continue without output) */
1020                } else {
1021                    /* no known conditional special case mapping, use a normal mapping */
1022                }
1023            } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1024                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
1025                full=(int)value&0xffff;
1026
1027                /* start of full case mapping strings */
1028                excOffset=(int)(value>>32)+1;
1029
1030                /* skip the lowercase and case-folding result strings */
1031                excOffset+=full&FULL_LOWER;
1032                full>>=4;
1033                excOffset+=full&0xf;
1034                full>>=4;
1035
1036                if(upperNotTitle) {
1037                    full&=0xf;
1038                } else {
1039                    /* skip the uppercase result string */
1040                    excOffset+=full&0xf;
1041                    full=(full>>4)&0xf;
1042                }
1043
1044                if(full!=0) {
1045                    try {
1046                        // append the result string
1047                        out.append(exceptions, excOffset, excOffset+full);
1048
1049                        /* return the string length */
1050                        return full;
1051                    } catch (IOException e) {
1052                        throw new ICUUncheckedIOException(e);
1053                    }
1054                }
1055            }
1056
1057            if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
1058                index=EXC_TITLE;
1059            } else if(hasSlot(excWord, EXC_UPPER)) {
1060                /* here, titlecase is same as uppercase */
1061                index=EXC_UPPER;
1062            } else {
1063                return ~c;
1064            }
1065            result=getSlotValue(excWord, index, excOffset2);
1066        }
1067
1068        return (result==c) ? ~result : result;
1069    }
1070
1071    public final int toFullUpper(int c, ContextIterator iter,
1072                                 Appendable out,
1073                                 int caseLocale) {
1074        return toUpperOrTitle(c, iter, out, caseLocale, true);
1075    }
1076
1077    public final int toFullTitle(int c, ContextIterator iter,
1078                                 Appendable out,
1079                                 int caseLocale) {
1080        return toUpperOrTitle(c, iter, out, caseLocale, false);
1081    }
1082
1083    /* case folding ------------------------------------------------------------- */
1084
1085    /*
1086     * Case folding is similar to lowercasing.
1087     * The result may be a simple mapping, i.e., a single code point, or
1088     * a full mapping, i.e., a string.
1089     * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1090     * then only the lowercase mapping is stored.
1091     *
1092     * Some special cases are hardcoded because their conditions cannot be
1093     * parsed and processed from CaseFolding.txt.
1094     *
1095     * Unicode 3.2 CaseFolding.txt specifies for its status field:
1096
1097    # C: common case folding, common mappings shared by both simple and full mappings.
1098    # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1099    # S: simple case folding, mappings to single characters where different from F.
1100    # T: special case for uppercase I and dotted uppercase I
1101    #    - For non-Turkic languages, this mapping is normally not used.
1102    #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1103    #
1104    # Usage:
1105    #  A. To do a simple case folding, use the mappings with status C + S.
1106    #  B. To do a full case folding, use the mappings with status C + F.
1107    #
1108    #    The mappings with status T can be used or omitted depending on the desired case-folding
1109    #    behavior. (The default option is to exclude them.)
1110
1111     * Unicode 3.2 has 'T' mappings as follows:
1112
1113    0049; T; 0131; # LATIN CAPITAL LETTER I
1114    0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1115
1116     * while the default mappings for these code points are:
1117
1118    0049; C; 0069; # LATIN CAPITAL LETTER I
1119    0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1120
1121     * U+0130 has no simple case folding (simple-case-folds to itself).
1122     */
1123
    /**
     * Bit mask for getting just the options from a string compare options word
     * that are relevant for case folding (of a single string or code point).
     *
     * The mask value 7 keeps the three low-order bits of the options word.
     * Currently only bit 0 is used, for FOLD_CASE_EXCLUDE_SPECIAL_I.
     * It is conceivable that at some point we might use one more bit for using uppercase sharp s.
     * It is conceivable that at some point we might want the option to use only simple case foldings
     * when operating on strings.
     *
     * The folding functions compare the masked options against
     * UCharacter.FOLD_CASE_DEFAULT; any other masked value selects the Turkic mappings.
     *
     * @internal
     */
    private static final int FOLD_CASE_OPTIONS_MASK = 7;
1136
1137    /* return the simple case folding mapping for c */
1138    public final int fold(int c, int options) {
1139        int props=trie.get(c);
1140        if(!propsHasException(props)) {
1141            if(getTypeFromProps(props)>=UPPER) {
1142                c+=getDelta(props);
1143            }
1144        } else {
1145            int excOffset=getExceptionsOffset(props);
1146            int excWord=exceptions.charAt(excOffset++);
1147            int index;
1148            if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1149                /* special case folding mappings, hardcoded */
1150                if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1151                    /* default mappings */
1152                    if(c==0x49) {
1153                        /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1154                        return 0x69;
1155                    } else if(c==0x130) {
1156                        /* no simple case folding for U+0130 */
1157                        return c;
1158                    }
1159                } else {
1160                    /* Turkic mappings */
1161                    if(c==0x49) {
1162                        /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1163                        return 0x131;
1164                    } else if(c==0x130) {
1165                        /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1166                        return 0x69;
1167                    }
1168                }
1169            }
1170            if(hasSlot(excWord, EXC_FOLD)) {
1171                index=EXC_FOLD;
1172            } else if(hasSlot(excWord, EXC_LOWER)) {
1173                index=EXC_LOWER;
1174            } else {
1175                return c;
1176            }
1177            c=getSlotValue(excWord, index, excOffset);
1178        }
1179        return c;
1180    }
1181
1182    /*
1183     * Issue for canonical caseless match (UAX #21):
1184     * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1185     * canonical equivalence, unlike default-option casefolding.
1186     * For example, I-grave and I + grave fold to strings that are not canonically
1187     * equivalent.
1188     * For more details, see the comment in unorm_compare() in unorm.cpp
1189     * and the intermediate prototype changes for Jitterbug 2021.
1190     * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1191     *
1192     * This did not get fixed because it appears that it is not possible to fix
1193     * it for uppercase and lowercase characters (I-grave vs. i-grave)
1194     * together in a way that they still fold to common result strings.
1195     */
1196
1197    public final int toFullFolding(int c, Appendable out, int options) {
1198        int result;
1199        int props;
1200
1201        result=c;
1202        props=trie.get(c);
1203        if(!propsHasException(props)) {
1204            if(getTypeFromProps(props)>=UPPER) {
1205                result=c+getDelta(props);
1206            }
1207        } else {
1208            int excOffset=getExceptionsOffset(props), excOffset2;
1209            int excWord=exceptions.charAt(excOffset++);
1210            int full, index;
1211
1212            excOffset2=excOffset;
1213
1214            if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1215                /* use hardcoded conditions and mappings */
1216                if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1217                    /* default mappings */
1218                    if(c==0x49) {
1219                        /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1220                        return 0x69;
1221                    } else if(c==0x130) {
1222                        /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1223                        try {
1224                            out.append(iDot);
1225                            return 2;
1226                        } catch (IOException e) {
1227                            throw new ICUUncheckedIOException(e);
1228                        }
1229                    }
1230                } else {
1231                    /* Turkic mappings */
1232                    if(c==0x49) {
1233                        /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1234                        return 0x131;
1235                    } else if(c==0x130) {
1236                        /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1237                        return 0x69;
1238                    }
1239                }
1240            } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1241                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
1242                full=(int)value&0xffff;
1243
1244                /* start of full case mapping strings */
1245                excOffset=(int)(value>>32)+1;
1246
1247                /* skip the lowercase result string */
1248                excOffset+=full&FULL_LOWER;
1249                full=(full>>4)&0xf;
1250
1251                if(full!=0) {
1252                    try {
1253                        // append the result string
1254                        out.append(exceptions, excOffset, excOffset+full);
1255
1256                        /* return the string length */
1257                        return full;
1258                    } catch (IOException e) {
1259                        throw new ICUUncheckedIOException(e);
1260                    }
1261                }
1262            }
1263
1264            if(hasSlot(excWord, EXC_FOLD)) {
1265                index=EXC_FOLD;
1266            } else if(hasSlot(excWord, EXC_LOWER)) {
1267                index=EXC_LOWER;
1268            } else {
1269                return ~c;
1270            }
1271            result=getSlotValue(excWord, index, excOffset2);
1272        }
1273
1274        return (result==c) ? ~result : result;
1275    }
1276
1277    /* case mapping properties API ---------------------------------------------- */
1278
    /*
     * Scratch buffer for callers that need an Appendable for the multi-code point
     * output of the full case mapping functions but do not actually use that output,
     * because they only check whether the input character was mapped to anything else.
     * A single shared StringBuilder avoids allocating a new one in each call.
     * Callers are expected to remove its contents (setLength(0)) before each use
     * so that it does not grow large over time.
     *
     * NOTE(review): java.lang.StringBuilder is not synchronized and this is one
     * instance for the whole class, so concurrent use from several threads is
     * not safe; a local StringBuilder avoids the sharing.
     *
     * @internal
     */
    public static final StringBuilder dummyStringBuilder = new StringBuilder();
1289
1290    public final boolean hasBinaryProperty(int c, int which) {
1291        switch(which) {
1292        case UProperty.LOWERCASE:
1293            return LOWER==getType(c);
1294        case UProperty.UPPERCASE:
1295            return UPPER==getType(c);
1296        case UProperty.SOFT_DOTTED:
1297            return isSoftDotted(c);
1298        case UProperty.CASE_SENSITIVE:
1299            return isCaseSensitive(c);
1300        case UProperty.CASED:
1301            return NONE!=getType(c);
1302        case UProperty.CASE_IGNORABLE:
1303            return (getTypeOrIgnorable(c)>>2)!=0;
1304        /*
1305         * Note: The following Changes_When_Xyz are defined as testing whether
1306         * the NFD form of the input changes when Xyz-case-mapped.
1307         * However, this simpler implementation of these properties,
1308         * ignoring NFD, passes the tests.
1309         * The implementation needs to be changed if the tests start failing.
1310         * When that happens, optimizations should be used to work with the
1311         * per-single-code point ucase_toFullXyz() functions unless
1312         * the NFD form has more than one code point,
1313         * and the property starts set needs to be the union of the
1314         * start sets for normalization and case mappings.
1315         */
1316        case UProperty.CHANGES_WHEN_LOWERCASED:
1317            dummyStringBuilder.setLength(0);
1318            return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1319        case UProperty.CHANGES_WHEN_UPPERCASED:
1320            dummyStringBuilder.setLength(0);
1321            return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1322        case UProperty.CHANGES_WHEN_TITLECASED:
1323            dummyStringBuilder.setLength(0);
1324            return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1325        /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
1326        case UProperty.CHANGES_WHEN_CASEMAPPED:
1327            dummyStringBuilder.setLength(0);
1328            return
1329                toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
1330                toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
1331                toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1332        default:
1333            return false;
1334        }
1335    }
1336
1337    // data members -------------------------------------------------------- ***
    /* indexes[] from the data file header; indexes[0] holds the number of entries */
    private int indexes[];
    /* exceptions data; read as 16-bit units via charAt() by the mapping functions */
    private String exceptions;
    /* reverse case folding data -- presumably laid out per the UNFOLD_* constants; not used in this part of the file */
    private char unfold[];

    /* maps a code point to its 16-bit case properties word */
    private Trie2_16 trie;
1343
1344    // data format constants ----------------------------------------------- ***
    /* name of the binary data file, composed as DATA_NAME+"."+DATA_TYPE */
    private static final String DATA_NAME="ucase";
    private static final String DATA_TYPE="icu";
    private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;

    /* format "cAsE"; passed to ICUBinary.readHeader() when the data is loaded */
    private static final int FMT=0x63415345;

    /* indexes into indexes[] */
    //private static final int IX_INDEX_TOP=0;
    //private static final int IX_LENGTH=1;
    private static final int IX_TRIE_SIZE=2;
    private static final int IX_EXC_LENGTH=3;
    private static final int IX_UNFOLD_LENGTH=4;

    //private static final int IX_MAX_FULL_LENGTH=15;
    /* minimum number of indexes[] entries; readData() rejects data with fewer */
    private static final int IX_TOP=16;
1361
1362    // definitions for 16-bit case properties word ------------------------- ***
1363
    /*
     * 2-bit constants for types of cased characters.
     * The type is stored in the two low-order bits of the properties word
     * and is extracted with TYPE_MASK.
     */
    public static final int TYPE_MASK=3;
    public static final int NONE=0;
    public static final int LOWER=1;
    public static final int UPPER=2;
    public static final int TITLE=3;
1370
1371    /** @return NONE, LOWER, UPPER, TITLE */
1372    private static final int getTypeFromProps(int props) {
1373        return props&TYPE_MASK;
1374    }
1375
1376    /** @return like getTypeFromProps() but also sets IGNORABLE if props indicate case-ignorable */
1377    private static final int getTypeAndIgnorableFromProps(int props) {
1378        return props&7;
1379    }
1380
    /* single-bit flags in the properties word, above the 2-bit case type */
    static final int IGNORABLE=4;
    private static final int SENSITIVE=     8;
    private static final int EXCEPTION=     0x10;

    /* 2-bit dot type field; values are compared as masked, unshifted constants */
    private static final int DOT_MASK=      0x60;
    //private static final int NO_DOT=        0;      /* normal characters with cc=0 */
    private static final int SOFT_DOTTED=   0x20;   /* soft-dotted characters with cc=0 */
    private static final int ABOVE=         0x40;   /* "above" accents with cc=230 */
    private static final int OTHER_ACCENT=  0x60;   /* other accent character (0<cc!=230) */

    /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
    private static final int DELTA_SHIFT=   7;
    //private static final int DELTA_MASK=    0xff80;
    //private static final int MAX_DELTA=     0xff;
    //private static final int MIN_DELTA=     (-MAX_DELTA-1);
1396
1397    private static final int getDelta(int props) {
1398        return (short)props>>DELTA_SHIFT;
1399    }
1400
    /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
    private static final int EXC_SHIFT=     5;
    //private static final int EXC_MASK=      0xffe0;
    //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);

    /* definitions for 16-bit main exceptions word ------------------------------ */

    /*
     * first 8 bits indicate values in optional slots;
     * these constants are the slot indexes passed to hasSlot() and getSlotValue()
     */
    private static final int EXC_LOWER=0;
    private static final int EXC_FOLD=1;
    private static final int EXC_UPPER=2;
    private static final int EXC_TITLE=3;
    //private static final int EXC_4=4;           /* reserved */
    //private static final int EXC_5=5;           /* reserved */
    private static final int EXC_CLOSURE=6;
    private static final int EXC_FULL_MAPPINGS=7;
    //private static final int EXC_ALL_SLOTS=8;   /* one past the last slot */

    /* each slot is 2 uint16_t instead of 1 */
    private static final int EXC_DOUBLE_SLOTS=          0x100;

    /* reserved: exception bits 11..9 */

    /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
    private static final int EXC_DOT_SHIFT=7;

    /* normally stored in the main word, but pushed out for larger exception indexes */
    //private static final int EXC_DOT_MASK=              0x3000;
    //private static final int EXC_NO_DOT=                0;
    //private static final int EXC_SOFT_DOTTED=           0x1000;
    //private static final int EXC_ABOVE=                 0x2000; /* "above" accents with cc=230 */
    //private static final int EXC_OTHER_ACCENT=          0x3000; /* other character (0<cc!=230) */

    /*
     * complex/conditional mappings:
     * when set, the mapping functions test their hardcoded conditions first
     * (EXC_CONDITIONAL_SPECIAL for lower/upper/title, EXC_CONDITIONAL_FOLD for case folding)
     */
    private static final int EXC_CONDITIONAL_SPECIAL=   0x4000;
    private static final int EXC_CONDITIONAL_FOLD=      0x8000;

    /*
     * definitions for lengths word for full case mappings:
     * four 4-bit string lengths, lowercase in the lowest bits
     */
    private static final int FULL_LOWER=    0xf;
    //private static final int FULL_FOLDING=  0xf0;
    //private static final int FULL_UPPER=    0xf00;
    //private static final int FULL_TITLE=    0xf000;

    /* maximum lengths */
    //private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
    private static final int CLOSURE_MAX_LENGTH=0xf;

    /* constants for reverse case folding ("unfold") data */
    private static final int UNFOLD_ROWS=0;
    private static final int UNFOLD_ROW_WIDTH=1;
    private static final int UNFOLD_STRING_WIDTH=2;
1452
    /*
     * public singleton instance
     *
     * Created once during class initialization by loading the data file.
     */
    public static final UCaseProps INSTANCE;

    // This static initializer block must be placed after
    // other static member initialization
    // (the constructor reads static constants such as DATA_FILE_NAME).
    static {
        try {
            INSTANCE = new UCaseProps();
        } catch (IOException e) {
            // the data could not be read: rethrow unchecked, keeping the cause
            throw new ICUUncheckedIOException(e);
        }
    }
1467}
1468