Normalizer2Impl.java revision f86f25d102340da66b9c7cb6b2d5ecdc0de43ecf
1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 *   Copyright (C) 2009-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 */
10
11package android.icu.impl;
12
13import java.io.IOException;
14import java.nio.ByteBuffer;
15import java.util.ArrayList;
16import java.util.Iterator;
17
18import android.icu.text.UTF16;
19import android.icu.text.UnicodeSet;
20import android.icu.util.ICUUncheckedIOException;
21import android.icu.util.VersionInfo;
22
23/**
24 * @hide Only a subset of ICU is exposed in Android
25 */
26public final class Normalizer2Impl {
27    public static final class Hangul {
28        /* Korean Hangul and Jamo constants */
29        public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
30        public static final int JAMO_L_END=0x1112;
31        public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
32        public static final int JAMO_V_END=0x1175;
33        public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
34        public static final int JAMO_T_END=0x11c2;
35
36        public static final int HANGUL_BASE=0xac00;
37        public static final int HANGUL_END=0xd7a3;
38
39        public static final int JAMO_L_COUNT=19;
40        public static final int JAMO_V_COUNT=21;
41        public static final int JAMO_T_COUNT=28;
42
43        public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
44        public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
45
46        public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
47
48        public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
49        public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
50
51        public static boolean isHangul(int c) {
52            return HANGUL_BASE<=c && c<HANGUL_LIMIT;
53        }
54        public static boolean isHangulWithoutJamoT(char c) {
55            c-=HANGUL_BASE;
56            return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
57        }
58        public static boolean isJamoL(int c) {
59            return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
60        }
61        public static boolean isJamoV(int c) {
62            return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
63        }
64
65        /**
66         * Decomposes c, which must be a Hangul syllable, into buffer
67         * and returns the length of the decomposition (2 or 3).
68         */
69        public static int decompose(int c, Appendable buffer) {
70            try {
71                c-=HANGUL_BASE;
72                int c2=c%JAMO_T_COUNT;
73                c/=JAMO_T_COUNT;
74                buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
75                buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
76                if(c2==0) {
77                    return 2;
78                } else {
79                    buffer.append((char)(JAMO_T_BASE+c2));
80                    return 3;
81                }
82            } catch(IOException e) {
83                // Will not occur because we do not write to I/O.
84                throw new ICUUncheckedIOException(e);
85            }
86        }
87
88        /**
89         * Decomposes c, which must be a Hangul syllable, into buffer.
90         * This is the raw, not recursive, decomposition. Its length is always 2.
91         */
92        public static void getRawDecomposition(int c, Appendable buffer) {
93            try {
94                int orig=c;
95                c-=HANGUL_BASE;
96                int c2=c%JAMO_T_COUNT;
97                if(c2==0) {
98                    c/=JAMO_T_COUNT;
99                    buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
100                    buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
101                } else {
102                    buffer.append((char)(orig-c2));  // LV syllable
103                    buffer.append((char)(JAMO_T_BASE+c2));
104                }
105            } catch(IOException e) {
106                // Will not occur because we do not write to I/O.
107                throw new ICUUncheckedIOException(e);
108            }
109        }
110    }
111
112    /**
113     * Writable buffer that takes care of canonical ordering.
114     * Its Appendable methods behave like the C++ implementation's
115     * appendZeroCC() methods.
116     * <p>
117     * If dest is a StringBuilder, then the buffer writes directly to it.
118     * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
119     * until no further changes are necessary and whole segments are appended.
120     * append() methods that take combining-class values always write to the StringBuilder.
121     * Other append() methods flush and append to the Appendable.
122     */
123    public static final class ReorderingBuffer implements Appendable {
124        public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
125            impl=ni;
126            app=dest;
127            if(app instanceof StringBuilder) {
128                appIsStringBuilder=true;
129                str=(StringBuilder)dest;
130                // In Java, the constructor subsumes public void init(int destCapacity) {
131                str.ensureCapacity(destCapacity);
132                reorderStart=0;
133                if(str.length()==0) {
134                    lastCC=0;
135                } else {
136                    setIterator();
137                    lastCC=previousCC();
138                    // Set reorderStart after the last code point with cc<=1 if there is one.
139                    if(lastCC>1) {
140                        while(previousCC()>1) {}
141                    }
142                    reorderStart=codePointLimit;
143                }
144            } else {
145                appIsStringBuilder=false;
146                str=new StringBuilder();
147                reorderStart=0;
148                lastCC=0;
149            }
150        }
151
152        public boolean isEmpty() { return str.length()==0; }
153        public int length() { return str.length(); }
154        public int getLastCC() { return lastCC; }
155
156        public StringBuilder getStringBuilder() { return str; }
157
158        public boolean equals(CharSequence s, int start, int limit) {
159            return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
160        }
161
162        // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
163        public void setLastChar(char c) {
164            str.setCharAt(str.length()-1, c);
165        }
166
167        public void append(int c, int cc) {
168            if(lastCC<=cc || cc==0) {
169                str.appendCodePoint(c);
170                lastCC=cc;
171                if(cc<=1) {
172                    reorderStart=str.length();
173                }
174            } else {
175                insert(c, cc);
176            }
177        }
178        // s must be in NFD, otherwise change the implementation.
179        public void append(CharSequence s, int start, int limit,
180                           int leadCC, int trailCC) {
181            if(start==limit) {
182                return;
183            }
184            if(lastCC<=leadCC || leadCC==0) {
185                if(trailCC<=1) {
186                    reorderStart=str.length()+(limit-start);
187                } else if(leadCC<=1) {
188                    reorderStart=str.length()+1;  // Ok if not a code point boundary.
189                }
190                str.append(s, start, limit);
191                lastCC=trailCC;
192            } else {
193                int c=Character.codePointAt(s, start);
194                start+=Character.charCount(c);
195                insert(c, leadCC);  // insert first code point
196                while(start<limit) {
197                    c=Character.codePointAt(s, start);
198                    start+=Character.charCount(c);
199                    if(start<limit) {
200                        // s must be in NFD, otherwise we need to use getCC().
201                        leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
202                    } else {
203                        leadCC=trailCC;
204                    }
205                    append(c, leadCC);
206                }
207            }
208        }
209        // The following append() methods work like C++ appendZeroCC().
210        // They assume that the cc or trailCC of their input is 0.
211        // Most of them implement Appendable interface methods.
212        // @Override when we switch to Java 6
213        @Override
214        public ReorderingBuffer append(char c) {
215            str.append(c);
216            lastCC=0;
217            reorderStart=str.length();
218            return this;
219        }
220        public void appendZeroCC(int c) {
221            str.appendCodePoint(c);
222            lastCC=0;
223            reorderStart=str.length();
224        }
225        // @Override when we switch to Java 6
226        @Override
227        public ReorderingBuffer append(CharSequence s) {
228            if(s.length()!=0) {
229                str.append(s);
230                lastCC=0;
231                reorderStart=str.length();
232            }
233            return this;
234        }
235        // @Override when we switch to Java 6
236        @Override
237        public ReorderingBuffer append(CharSequence s, int start, int limit) {
238            if(start!=limit) {
239                str.append(s, start, limit);
240                lastCC=0;
241                reorderStart=str.length();
242            }
243            return this;
244        }
245        /**
246         * Flushes from the intermediate StringBuilder to the Appendable,
247         * if they are different objects.
248         * Used after recomposition.
249         * Must be called at the end when writing to a non-StringBuilder Appendable.
250         */
251        public void flush() {
252            if(appIsStringBuilder) {
253                reorderStart=str.length();
254            } else {
255                try {
256                    app.append(str);
257                    str.setLength(0);
258                    reorderStart=0;
259                } catch(IOException e) {
260                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
261                }
262            }
263            lastCC=0;
264        }
265        /**
266         * Flushes from the intermediate StringBuilder to the Appendable,
267         * if they are different objects.
268         * Then appends the new text to the Appendable or StringBuilder.
269         * Normally used after quick check loops find a non-empty sequence.
270         */
271        public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
272            if(appIsStringBuilder) {
273                str.append(s, start, limit);
274                reorderStart=str.length();
275            } else {
276                try {
277                    app.append(str).append(s, start, limit);
278                    str.setLength(0);
279                    reorderStart=0;
280                } catch(IOException e) {
281                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
282                }
283            }
284            lastCC=0;
285            return this;
286        }
287        public void remove() {
288            str.setLength(0);
289            lastCC=0;
290            reorderStart=0;
291        }
292        public void removeSuffix(int suffixLength) {
293            int oldLength=str.length();
294            str.delete(oldLength-suffixLength, oldLength);
295            lastCC=0;
296            reorderStart=str.length();
297        }
298
299        /*
300         * TODO: Revisit whether it makes sense to track reorderStart.
301         * It is set to after the last known character with cc<=1,
302         * which stops previousCC() before it reads that character and looks up its cc.
303         * previousCC() is normally only called from insert().
304         * In other words, reorderStart speeds up the insertion of a combining mark
305         * into a multi-combining mark sequence where it does not belong at the end.
306         * This might not be worth the trouble.
307         * On the other hand, it's not a huge amount of trouble.
308         *
309         * We probably need it for UNORM_SIMPLE_APPEND.
310         */
311
312        // Inserts c somewhere before the last character.
313        // Requires 0<cc<lastCC which implies reorderStart<limit.
314        private void insert(int c, int cc) {
315            for(setIterator(), skipPrevious(); previousCC()>cc;) {}
316            // insert c at codePointLimit, after the character with prevCC<=cc
317            if(c<=0xffff) {
318                str.insert(codePointLimit, (char)c);
319                if(cc<=1) {
320                    reorderStart=codePointLimit+1;
321                }
322            } else {
323                str.insert(codePointLimit, Character.toChars(c));
324                if(cc<=1) {
325                    reorderStart=codePointLimit+2;
326                }
327            }
328        }
329
330        private final Normalizer2Impl impl;
331        private final Appendable app;
332        private final StringBuilder str;
333        private final boolean appIsStringBuilder;
334        private int reorderStart;
335        private int lastCC;
336
337        // private backward iterator
338        private void setIterator() { codePointStart=str.length(); }
339        private void skipPrevious() {  // Requires 0<codePointStart.
340            codePointLimit=codePointStart;
341            codePointStart=str.offsetByCodePoints(codePointStart, -1);
342        }
343        private int previousCC() {  // Returns 0 if there is no previous character.
344            codePointLimit=codePointStart;
345            if(reorderStart>=codePointStart) {
346                return 0;
347            }
348            int c=str.codePointBefore(codePointStart);
349            codePointStart-=Character.charCount(c);
350            if(c<MIN_CCC_LCCC_CP) {
351                return 0;
352            }
353            return getCCFromYesOrMaybe(impl.getNorm16(c));
354        }
355
356        private int codePointStart, codePointLimit;
357    }
358
359    // TODO: Propose as public API on the UTF16 class.
360    // TODO: Propose widening UTF16 methods that take char to take int.
361    // TODO: Propose widening UTF16 methods that take String to take CharSequence.
362    public static final class UTF16Plus {
363        /**
364         * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
365         * is it a lead surrogate?
366         * @param c code unit or code point
367         * @return true or false
368         */
369        public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
370        /**
371         * Compares two CharSequence objects for binary equality.
372         * @param s1 first sequence
373         * @param s2 second sequence
374         * @return true if s1 contains the same text as s2
375         */
376        public static boolean equal(CharSequence s1,  CharSequence s2) {
377            if(s1==s2) {
378                return true;
379            }
380            int length=s1.length();
381            if(length!=s2.length()) {
382                return false;
383            }
384            for(int i=0; i<length; ++i) {
385                if(s1.charAt(i)!=s2.charAt(i)) {
386                    return false;
387                }
388            }
389            return true;
390        }
391        /**
392         * Compares two CharSequence subsequences for binary equality.
393         * @param s1 first sequence
394         * @param start1 start offset in first sequence
395         * @param limit1 limit offset in first sequence
396         * @param s2 second sequence
397         * @param start2 start offset in second sequence
398         * @param limit2 limit offset in second sequence
399         * @return true if s1.subSequence(start1, limit1) contains the same text
400         *              as s2.subSequence(start2, limit2)
401         */
402        public static boolean equal(CharSequence s1, int start1, int limit1,
403                                    CharSequence s2, int start2, int limit2) {
404            if((limit1-start1)!=(limit2-start2)) {
405                return false;
406            }
407            if(s1==s2 && start1==start2) {
408                return true;
409            }
410            while(start1<limit1) {
411                if(s1.charAt(start1++)!=s2.charAt(start2++)) {
412                    return false;
413                }
414            }
415            return true;
416        }
417    }
418
419    public Normalizer2Impl() {}
420
421    private static final class IsAcceptable implements ICUBinary.Authenticate {
422        // @Override when we switch to Java 6
423        @Override
424        public boolean isDataVersionAcceptable(byte version[]) {
425            return version[0]==2;
426        }
427    }
428    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
429    private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
430
431    public Normalizer2Impl load(ByteBuffer bytes) {
432        try {
433            dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
434            int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
435            if(indexesLength<=IX_MIN_MAYBE_YES) {
436                throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
437            }
438            int[] inIndexes=new int[indexesLength];
439            inIndexes[0]=indexesLength*4;
440            for(int i=1; i<indexesLength; ++i) {
441                inIndexes[i]=bytes.getInt();
442            }
443
444            minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
445            minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
446
447            minYesNo=inIndexes[IX_MIN_YES_NO];
448            minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
449            minNoNo=inIndexes[IX_MIN_NO_NO];
450            limitNoNo=inIndexes[IX_LIMIT_NO_NO];
451            minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
452
453            // Read the normTrie.
454            int offset=inIndexes[IX_NORM_TRIE_OFFSET];
455            int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
456            normTrie=Trie2_16.createFromSerialized(bytes);
457            int trieLength=normTrie.getSerializedLength();
458            if(trieLength>(nextOffset-offset)) {
459                throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
460            }
461            ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
462
463            // Read the composition and mapping data.
464            offset=nextOffset;
465            nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
466            int numChars=(nextOffset-offset)/2;
467            if(numChars!=0) {
468                maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
469                extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
470            }
471
472            // smallFCD: new in formatVersion 2
473            offset=nextOffset;
474            smallFCD=new byte[0x100];
475            bytes.get(smallFCD);
476
477            // Build tccc180[].
478            // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
479            tccc180=new int[0x180];
480            int bits=0;
481            for(int c=0; c<0x180; bits>>=1) {
482                if((c&0xff)==0) {
483                    bits=smallFCD[c>>8];  // one byte per 0x100 code points
484                }
485                if((bits&1)!=0) {
486                    for(int i=0; i<0x20; ++i, ++c) {
487                        tccc180[c]=getFCD16FromNormData(c)&0xff;
488                    }
489                } else {
490                    c+=0x20;
491                }
492            }
493
494            return this;
495        } catch(IOException e) {
496            throw new ICUUncheckedIOException(e);
497        }
498    }
499    public Normalizer2Impl load(String name) {
500        return load(ICUBinary.getRequiredData(name));
501    }
502
503    private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
504        if(isAlgorithmicNoNo(norm16)) {
505            // Range of code points with same-norm16-value algorithmic decompositions.
506            // They might have different non-zero FCD16 values.
507            do {
508                int fcd16=getFCD16(start);
509                if(fcd16>0xff) { set.add(start); }
510            } while(++start<=end);
511        } else {
512            int fcd16=getFCD16(start);
513            if(fcd16>0xff) { set.add(start, end); }
514        }
515    }
516
517    private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
518        /* add the start code point to the USet */
519        set.add(start);
520        if(start!=end && isAlgorithmicNoNo(value)) {
521            // Range of code points with same-norm16-value algorithmic decompositions.
522            // They might have different non-zero FCD16 values.
523            int prevFCD16=getFCD16(start);
524            while(++start<=end) {
525                int fcd16=getFCD16(start);
526                if(fcd16!=prevFCD16) {
527                    set.add(start);
528                    prevFCD16=fcd16;
529                }
530            }
531        }
532    }
533
534    public void addLcccChars(UnicodeSet set) {
535        /* add the start code point of each same-value range of each trie */
536        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
537        Trie2.Range range;
538        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
539            enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
540        }
541    }
542
543    public void addPropertyStarts(UnicodeSet set) {
544        /* add the start code point of each same-value range of each trie */
545        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
546        Trie2.Range range;
547        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
548            enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
549        }
550
551        /* add Hangul LV syllables and LV+1 because of skippables */
552        for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
553            set.add(c);
554            set.add(c+1);
555        }
556        set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
557    }
558
559    public void addCanonIterPropertyStarts(UnicodeSet set) {
560        /* add the start code point of each same-value range of the canonical iterator data trie */
561        ensureCanonIterData();
562        // currently only used for the SEGMENT_STARTER property
563        Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
564        Trie2.Range range;
565        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
566            /* add the start code point to the USet */
567            set.add(range.startCodePoint);
568        }
569    }
570    private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
571        @Override
572        public int map(int in) {
573            return in&CANON_NOT_SEGMENT_STARTER;
574        }
575    };
576
577    // low-level properties ------------------------------------------------ ***
578
579    public Trie2_16 getNormTrie() { return normTrie; }
580
581    // Note: Normalizer2Impl.java r30983 (2011-nov-27)
582    // still had getFCDTrie() which built and cached an FCD trie.
583    // That provided faster access to FCD data than getFCD16FromNormData()
584    // but required synchronization and consumed some 10kB of heap memory
585    // in any process that uses FCD (e.g., via collation).
586    // tccc180[] and smallFCD[] are intended to help with any loss of performance,
587    // at least for Latin & CJK.
588
589    /**
590     * Builds the canonical-iterator data for this instance.
591     * This is required before any of {@link #isCanonSegmentStarter(int)} or
592     * {@link #getCanonStartSet(int, UnicodeSet)} are called,
593     * or else they crash.
594     * @return this
595     */
596    public synchronized Normalizer2Impl ensureCanonIterData() {
597        if(canonIterData==null) {
598            Trie2Writable newData=new Trie2Writable(0, 0);
599            canonStartSets=new ArrayList<UnicodeSet>();
600            Iterator<Trie2.Range> trieIterator=normTrie.iterator();
601            Trie2.Range range;
602            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
603                final int norm16=range.value;
604                if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
605                    // Inert, or 2-way mapping (including Hangul syllable).
606                    // We do not write a canonStartSet for any yesNo character.
607                    // Composites from 2-way mappings are added at runtime from the
608                    // starter's compositions list, and the other characters in
609                    // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
610                    // "maybe" characters.
611                    continue;
612                }
613                for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
614                    final int oldValue=newData.get(c);
615                    int newValue=oldValue;
616                    if(norm16>=minMaybeYes) {
617                        // not a segment starter if it occurs in a decomposition or has cc!=0
618                        newValue|=CANON_NOT_SEGMENT_STARTER;
619                        if(norm16<MIN_NORMAL_MAYBE_YES) {
620                            newValue|=CANON_HAS_COMPOSITIONS;
621                        }
622                    } else if(norm16<minYesNo) {
623                        newValue|=CANON_HAS_COMPOSITIONS;
624                    } else {
625                        // c has a one-way decomposition
626                        int c2=c;
627                        int norm16_2=norm16;
628                        while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
629                            c2=this.mapAlgorithmic(c2, norm16_2);
630                            norm16_2=getNorm16(c2);
631                        }
632                        if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
633                            // c decomposes, get everything from the variable-length extra data
634                            int firstUnit=extraData.charAt(norm16_2);
635                            int length=firstUnit&MAPPING_LENGTH_MASK;
636                            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
637                                if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) {
638                                    newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
639                                }
640                            }
641                            // Skip empty mappings (no characters in the decomposition).
642                            if(length!=0) {
643                                ++norm16_2;  // skip over the firstUnit
644                                // add c to first code point's start set
645                                int limit=norm16_2+length;
646                                c2=extraData.codePointAt(norm16_2);
647                                addToStartSet(newData, c, c2);
648                                // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
649                                // one-way mapping. A 2-way mapping is possible here after
650                                // intermediate algorithmic mapping.
651                                if(norm16_2>=minNoNo) {
652                                    while((norm16_2+=Character.charCount(c2))<limit) {
653                                        c2=extraData.codePointAt(norm16_2);
654                                        int c2Value=newData.get(c2);
655                                        if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
656                                            newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
657                                        }
658                                    }
659                                }
660                            }
661                        } else {
662                            // c decomposed to c2 algorithmically; c has cc==0
663                            addToStartSet(newData, c, c2);
664                        }
665                    }
666                    if(newValue!=oldValue) {
667                        newData.set(c, newValue);
668                    }
669                }
670            }
671            canonIterData=newData.toTrie2_32();
672        }
673        return this;
674    }
675
676    public int getNorm16(int c) { return normTrie.get(c); }
677
678    public int getCompQuickCheck(int norm16) {
679        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
680            return 1;  // yes
681        } else if(minMaybeYes<=norm16) {
682            return 2;  // maybe
683        } else {
684            return 0;  // no
685        }
686    }
687    public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
688    public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
689    public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
690
691    public int getCC(int norm16) {
692        if(norm16>=MIN_NORMAL_MAYBE_YES) {
693            return norm16&0xff;
694        }
695        if(norm16<minNoNo || limitNoNo<=norm16) {
696            return 0;
697        }
698        return getCCFromNoNo(norm16);
699    }
700    public static int getCCFromYesOrMaybe(int norm16) {
701        return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
702    }
703
704    /**
705     * Returns the FCD data for code point c.
706     * @param c A Unicode code point.
707     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
708     */
709    public int getFCD16(int c) {
710        if(c<0) {
711            return 0;
712        } else if(c<0x180) {
713            return tccc180[c];
714        } else if(c<=0xffff) {
715            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
716        }
717        return getFCD16FromNormData(c);
718    }
719    /** Returns the FCD data for U+0000<=c<U+0180. */
720    public int getFCD16FromBelow180(int c) { return tccc180[c]; }
721    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
722    public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
723        // 0<=lead<=0xffff
724        byte bits=smallFCD[lead>>8];
725        if(bits==0) { return false; }
726        return ((bits>>((lead>>5)&7))&1)!=0;
727    }
728
729    /** Gets the FCD value from the regular normalization data. */
730    public int getFCD16FromNormData(int c) {
731        // Only loops for 1:1 algorithmic mappings.
732        for(;;) {
733            int norm16=getNorm16(c);
734            if(norm16<=minYesNo) {
735                // no decomposition or Hangul syllable, all zeros
736                return 0;
737            } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
738                // combining mark
739                norm16&=0xff;
740                return norm16|(norm16<<8);
741            } else if(norm16>=minMaybeYes) {
742                return 0;
743            } else if(isDecompNoAlgorithmic(norm16)) {
744                c=mapAlgorithmic(c, norm16);
745            } else {
746                // c decomposes, get everything from the variable-length extra data
747                int firstUnit=extraData.charAt(norm16);
748                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
749                    // A character that is deleted (maps to an empty string) must
750                    // get the worst-case lccc and tccc values because arbitrary
751                    // characters on both sides will become adjacent.
752                    return 0x1ff;
753                } else {
754                    int fcd16=firstUnit>>8;  // tccc
755                    if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
756                        fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
757                    }
758                    return fcd16;
759                }
760            }
761        }
762    }
763
764    /**
765     * Gets the decomposition for one code point.
766     * @param c code point
767     * @return c's decomposition, if it has one; returns null if it does not have a decomposition
768     */
769    public String getDecomposition(int c) {
770        int decomp=-1;
771        int norm16;
772        for(;;) {
773            if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
774                // c does not decompose
775            } else if(isHangul(norm16)) {
776                // Hangul syllable: decompose algorithmically
777                StringBuilder buffer=new StringBuilder();
778                Hangul.decompose(c, buffer);
779                return buffer.toString();
780            } else if(isDecompNoAlgorithmic(norm16)) {
781                decomp=c=mapAlgorithmic(c, norm16);
782                continue;
783            } else {
784                // c decomposes, get everything from the variable-length extra data
785                int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
786                return extraData.substring(norm16, norm16+length);
787            }
788            if(decomp<0) {
789                return null;
790            } else {
791                return UTF16.valueOf(decomp);
792            }
793        }
794    }
795
796    /**
797     * Gets the raw decomposition for one code point.
798     * @param c code point
799     * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
800     */
801    public String getRawDecomposition(int c) {
802        // We do not loop in this method because an algorithmic mapping itself
803        // becomes a final result rather than having to be decomposed recursively.
804        int norm16;
805        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
806            // c does not decompose
807            return null;
808        } else if(isHangul(norm16)) {
809            // Hangul syllable: decompose algorithmically
810            StringBuilder buffer=new StringBuilder();
811            Hangul.getRawDecomposition(c, buffer);
812            return buffer.toString();
813        } else if(isDecompNoAlgorithmic(norm16)) {
814            return UTF16.valueOf(mapAlgorithmic(c, norm16));
815        } else {
816            // c decomposes, get everything from the variable-length extra data
817            int firstUnit=extraData.charAt(norm16);
818            int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
819            if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
820                // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
821                // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
822                int rawMapping=norm16-((firstUnit>>7)&1)-1;
823                char rm0=extraData.charAt(rawMapping);
824                if(rm0<=MAPPING_LENGTH_MASK) {
825                    return extraData.substring(rawMapping-rm0, rawMapping);
826                } else {
827                    // Copy the normal mapping and replace its first two code units with rm0.
828                    StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
829                    norm16+=1+2;  // skip over the firstUnit and the first two mapping code units
830                    return buffer.append(extraData, norm16, norm16+mLength-2).toString();
831                }
832            } else {
833                norm16+=1;  // skip over the firstUnit
834                return extraData.substring(norm16, norm16+mLength);
835            }
836        }
837    }
838
839    /**
840     * Returns true if code point c starts a canonical-iterator string segment.
841     * <b>{@link #ensureCanonIterData()} must have been called before this method,
842     * or else this method will crash.</b>
843     * @param c A Unicode code point.
844     * @return true if c starts a canonical-iterator string segment.
845     */
846    public boolean isCanonSegmentStarter(int c) {
847        return canonIterData.get(c)>=0;
848    }
849    /**
850     * Returns true if there are characters whose decomposition starts with c.
851     * If so, then the set is cleared and then filled with those characters.
852     * <b>{@link #ensureCanonIterData()} must have been called before this method,
853     * or else this method will crash.</b>
854     * @param c A Unicode code point.
855     * @param set A UnicodeSet to receive the characters whose decompositions
856     *        start with c, if there are any.
857     * @return true if there are characters whose decomposition starts with c.
858     */
859    public boolean getCanonStartSet(int c, UnicodeSet set) {
860        int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
861        if(canonValue==0) {
862            return false;
863        }
864        set.clear();
865        int value=canonValue&CANON_VALUE_MASK;
866        if((canonValue&CANON_HAS_SET)!=0) {
867            set.addAll(canonStartSets.get(value));
868        } else if(value!=0) {
869            set.add(value);
870        }
871        if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
872            int norm16=getNorm16(c);
873            if(norm16==JAMO_L) {
874                int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
875                set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
876            } else {
877                addComposites(getCompositionsList(norm16), set);
878            }
879        }
880        return true;
881    }
882
883    public static final int MIN_CCC_LCCC_CP=0x300;
884
885    public static final int MIN_YES_YES_WITH_CC=0xff01;
886    public static final int JAMO_VT=0xff00;
887    public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
888    public static final int JAMO_L=1;
889    public static final int MAX_DELTA=0x40;
890
891    // Byte offsets from the start of the data, after the generic header.
892    public static final int IX_NORM_TRIE_OFFSET=0;
893    public static final int IX_EXTRA_DATA_OFFSET=1;
894    public static final int IX_SMALL_FCD_OFFSET=2;
895    public static final int IX_RESERVED3_OFFSET=3;
896    public static final int IX_TOTAL_SIZE=7;
897
898    // Code point thresholds for quick check codes.
899    public static final int IX_MIN_DECOMP_NO_CP=8;
900    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
901
902    // Norm16 value thresholds for quick check combinations and types of extra data.
903    // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
904    public static final int IX_MIN_YES_NO=10;
905    public static final int IX_MIN_NO_NO=11;
906    public static final int IX_LIMIT_NO_NO=12;
907    public static final int IX_MIN_MAYBE_YES=13;
908
909    // Mappings only in [minYesNoMappingsOnly..minNoNo[.
910    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
911
912    public static final int IX_COUNT=16;
913
914    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
915    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
916    public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
917    public static final int MAPPING_LENGTH_MASK=0x1f;
918
919    public static final int COMP_1_LAST_TUPLE=0x8000;
920    public static final int COMP_1_TRIPLE=1;
921    public static final int COMP_1_TRAIL_LIMIT=0x3400;
922    public static final int COMP_1_TRAIL_MASK=0x7ffe;
923    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
924    public static final int COMP_2_TRAIL_SHIFT=6;
925    public static final int COMP_2_TRAIL_MASK=0xffc0;
926
927    // higher-level functionality ------------------------------------------ ***
928
929    // NFD without an NFD Normalizer2 instance.
930    public Appendable decompose(CharSequence s, StringBuilder dest) {
931        decompose(s, 0, s.length(), dest, s.length());
932        return dest;
933    }
934    /**
935     * Decomposes s[src, limit[ and writes the result to dest.
936     * limit can be NULL if src is NUL-terminated.
937     * destLengthEstimate is the initial dest buffer capacity and can be -1.
938     */
939    public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
940                   int destLengthEstimate) {
941        if(destLengthEstimate<0) {
942            destLengthEstimate=limit-src;
943        }
944        dest.setLength(0);
945        ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
946        decompose(s, src, limit, buffer);
947    }
948
949    // Dual functionality:
950    // buffer!=NULL: normalize
951    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
952    public int decompose(CharSequence s, int src, int limit,
953                         ReorderingBuffer buffer) {
954        int minNoCP=minDecompNoCP;
955
956        int prevSrc;
957        int c=0;
958        int norm16=0;
959
960        // only for quick check
961        int prevBoundary=src;
962        int prevCC=0;
963
964        for(;;) {
965            // count code units below the minimum or with irrelevant data for the quick check
966            for(prevSrc=src; src!=limit;) {
967                if( (c=s.charAt(src))<minNoCP ||
968                    isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
969                ) {
970                    ++src;
971                } else if(!UTF16.isSurrogate((char)c)) {
972                    break;
973                } else {
974                    char c2;
975                    if(UTF16Plus.isSurrogateLead(c)) {
976                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
977                            c=Character.toCodePoint((char)c, c2);
978                        }
979                    } else /* trail surrogate */ {
980                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
981                            --src;
982                            c=Character.toCodePoint(c2, (char)c);
983                        }
984                    }
985                    if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
986                        src+=Character.charCount(c);
987                    } else {
988                        break;
989                    }
990                }
991            }
992            // copy these code units all at once
993            if(src!=prevSrc) {
994                if(buffer!=null) {
995                    buffer.flushAndAppendZeroCC(s, prevSrc, src);
996                } else {
997                    prevCC=0;
998                    prevBoundary=src;
999                }
1000            }
1001            if(src==limit) {
1002                break;
1003            }
1004
1005            // Check one above-minimum, relevant code point.
1006            src+=Character.charCount(c);
1007            if(buffer!=null) {
1008                decompose(c, norm16, buffer);
1009            } else {
1010                if(isDecompYes(norm16)) {
1011                    int cc=getCCFromYesOrMaybe(norm16);
1012                    if(prevCC<=cc || cc==0) {
1013                        prevCC=cc;
1014                        if(cc<=1) {
1015                            prevBoundary=src;
1016                        }
1017                        continue;
1018                    }
1019                }
1020                return prevBoundary;  // "no" or cc out of order
1021            }
1022        }
1023        return src;
1024    }
1025    public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
1026        int limit=s.length();
1027        if(limit==0) {
1028            return;
1029        }
1030        if(doDecompose) {
1031            decompose(s, 0, limit, buffer);
1032            return;
1033        }
1034        // Just merge the strings at the boundary.
1035        int c=Character.codePointAt(s, 0);
1036        int src=0;
1037        int firstCC, prevCC, cc;
1038        firstCC=prevCC=cc=getCC(getNorm16(c));
1039        while(cc!=0) {
1040            prevCC=cc;
1041            src+=Character.charCount(c);
1042            if(src>=limit) {
1043                break;
1044            }
1045            c=Character.codePointAt(s, src);
1046            cc=getCC(getNorm16(c));
1047        };
1048        buffer.append(s, 0, src, firstCC, prevCC);
1049        buffer.append(s, src, limit);
1050    }
1051    // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1052    // doCompose: normalize
1053    // !doCompose: isNormalized (buffer must be empty and initialized)
1054    public boolean compose(CharSequence s, int src, int limit,
1055                           boolean onlyContiguous,
1056                           boolean doCompose,
1057                           ReorderingBuffer buffer) {
1058        int minNoMaybeCP=minCompNoMaybeCP;
1059
1060        /*
1061         * prevBoundary points to the last character before the current one
1062         * that has a composition boundary before it with ccc==0 and quick check "yes".
1063         * Keeping track of prevBoundary saves us looking for a composition boundary
1064         * when we find a "no" or "maybe".
1065         *
1066         * When we back out from prevSrc back to prevBoundary,
1067         * then we also remove those same characters (which had been simply copied
1068         * or canonically-order-inserted) from the ReorderingBuffer.
1069         * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1070         * must correspond 1:1 to destination units at the end of the destination buffer.
1071         */
1072        int prevBoundary=src;
1073        int prevSrc;
1074        int c=0;
1075        int norm16=0;
1076
1077        // only for isNormalized
1078        int prevCC=0;
1079
1080        for(;;) {
1081            // count code units below the minimum or with irrelevant data for the quick check
1082            for(prevSrc=src; src!=limit;) {
1083                if( (c=s.charAt(src))<minNoMaybeCP ||
1084                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1085                ) {
1086                    ++src;
1087                } else if(!UTF16.isSurrogate((char)c)) {
1088                    break;
1089                } else {
1090                    char c2;
1091                    if(UTF16Plus.isSurrogateLead(c)) {
1092                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1093                            c=Character.toCodePoint((char)c, c2);
1094                        }
1095                    } else /* trail surrogate */ {
1096                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1097                            --src;
1098                            c=Character.toCodePoint(c2, (char)c);
1099                        }
1100                    }
1101                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1102                        src+=Character.charCount(c);
1103                    } else {
1104                        break;
1105                    }
1106                }
1107            }
1108            // copy these code units all at once
1109            if(src!=prevSrc) {
1110                if(src==limit) {
1111                    if(doCompose) {
1112                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
1113                    }
1114                    break;
1115                }
1116                // Set prevBoundary to the last character in the quick check loop.
1117                prevBoundary=src-1;
1118                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1119                    Character.isHighSurrogate(s.charAt(prevBoundary-1))
1120                ) {
1121                    --prevBoundary;
1122                }
1123                if(doCompose) {
1124                    // The last "quick check yes" character is excluded from the
1125                    // flush-and-append call in case it needs to be modified.
1126                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1127                    buffer.append(s, prevBoundary, src);
1128                } else {
1129                    prevCC=0;
1130                }
1131                // The start of the current character (c).
1132                prevSrc=src;
1133            } else if(src==limit) {
1134                break;
1135            }
1136
1137            src+=Character.charCount(c);
1138            /*
1139             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1140             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1141             * or has ccc!=0.
1142             * Check for Jamo V/T, then for regular characters.
1143             * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1144             */
1145            if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1146                char prev=s.charAt(prevSrc-1);
1147                boolean needToDecompose=false;
1148                if(c<Hangul.JAMO_T_BASE) {
1149                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1150                    prev-=Hangul.JAMO_L_BASE;
1151                    if(prev<Hangul.JAMO_L_COUNT) {
1152                        if(!doCompose) {
1153                            return false;
1154                        }
1155                        char syllable=(char)
1156                            (Hangul.HANGUL_BASE+
1157                             (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1158                             Hangul.JAMO_T_COUNT);
1159                        char t;
1160                        if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1161                            ++src;
1162                            syllable+=t;  // The next character was a Jamo T.
1163                            prevBoundary=src;
1164                            buffer.setLastChar(syllable);
1165                            continue;
1166                        }
1167                        // If we see L+V+x where x!=T then we drop to the slow path,
1168                        // decompose and recompose.
1169                        // This is to deal with NFKC finding normal L and V but a
1170                        // compatibility variant of a T. We need to either fully compose that
1171                        // combination here (which would complicate the code and may not work
1172                        // with strange custom data) or use the slow path -- or else our replacing
1173                        // two input characters (L+V) with one output character (LV syllable)
1174                        // would violate the invariant that [prevBoundary..prevSrc[ has the same
1175                        // length as what we appended to the buffer since prevBoundary.
1176                        needToDecompose=true;
1177                    }
1178                } else if(Hangul.isHangulWithoutJamoT(prev)) {
1179                    // c is a Jamo Trailing consonant,
1180                    // compose with previous Hangul LV that does not contain a Jamo T.
1181                    if(!doCompose) {
1182                        return false;
1183                    }
1184                    buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
1185                    prevBoundary=src;
1186                    continue;
1187                }
1188                if(!needToDecompose) {
1189                    // The Jamo V/T did not compose into a Hangul syllable.
1190                    if(doCompose) {
1191                        buffer.append((char)c);
1192                    } else {
1193                        prevCC=0;
1194                    }
1195                    continue;
1196                }
1197            }
1198            /*
1199             * Source buffer pointers:
1200             *
1201             *  all done      quick check   current char  not yet
1202             *                "yes" but     (c)           processed
1203             *                may combine
1204             *                forward
1205             * [-------------[-------------[-------------[-------------[
1206             * |             |             |             |             |
1207             * orig. src     prevBoundary  prevSrc       src           limit
1208             *
1209             *
1210             * Destination buffer pointers inside the ReorderingBuffer:
1211             *
1212             *  all done      might take    not filled yet
1213             *                characters for
1214             *                reordering
1215             * [-------------[-------------[-------------[
1216             * |             |             |             |
1217             * start         reorderStart  limit         |
1218             *                             +remainingCap.+
1219             */
1220            if(norm16>=MIN_YES_YES_WITH_CC) {
1221                int cc=norm16&0xff;  // cc!=0
1222                if( onlyContiguous &&  // FCC
1223                    (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1224                    prevBoundary<prevSrc &&
1225                    // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1226                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1227                    // passed the quick check "yes && ccc==0" test.
1228                    // Check whether the last character was a "yesYes" or a "yesNo".
1229                    // If a "yesNo", then we get its trailing ccc from its
1230                    // mapping and check for canonical order.
1231                    // All other cases are ok.
1232                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1233                ) {
1234                    // Fails FCD test, need to decompose and contiguously recompose.
1235                    if(!doCompose) {
1236                        return false;
1237                    }
1238                } else if(doCompose) {
1239                    buffer.append(c, cc);
1240                    continue;
1241                } else if(prevCC<=cc) {
1242                    prevCC=cc;
1243                    continue;
1244                } else {
1245                    return false;
1246                }
1247            } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1248                return false;
1249            }
1250
1251            /*
1252             * Find appropriate boundaries around this character,
1253             * decompose the source text from between the boundaries,
1254             * and recompose it.
1255             *
1256             * We may need to remove the last few characters from the ReorderingBuffer
1257             * to account for source text that was copied or appended
1258             * but needs to take part in the recomposition.
1259             */
1260
1261            /*
1262             * Find the last composition boundary in [prevBoundary..src[.
1263             * It is either the decomposition of the current character (at prevSrc),
1264             * or prevBoundary.
1265             */
1266            if(hasCompBoundaryBefore(c, norm16)) {
1267                prevBoundary=prevSrc;
1268            } else if(doCompose) {
1269                buffer.removeSuffix(prevSrc-prevBoundary);
1270            }
1271
1272            // Find the next composition boundary in [src..limit[ -
1273            // modifies src to point to the next starter.
1274            src=findNextCompBoundary(s, src, limit);
1275
1276            // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1277            int recomposeStartIndex=buffer.length();
1278            decomposeShort(s, prevBoundary, src, buffer);
1279            recompose(buffer, recomposeStartIndex, onlyContiguous);
1280            if(!doCompose) {
1281                if(!buffer.equals(s, prevBoundary, src)) {
1282                    return false;
1283                }
1284                buffer.remove();
1285                prevCC=0;
1286            }
1287
1288            // Move to the next starter. We never need to look back before this point again.
1289            prevBoundary=src;
1290        }
1291        return true;
1292    }
1293    /**
1294     * Very similar to compose(): Make the same changes in both places if relevant.
1295     * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
1296     * !doSpan: quickCheck
1297     * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
1298     *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
1299     *         then the quick check result is "no"
1300     */
1301    public int composeQuickCheck(CharSequence s, int src, int limit,
1302                                 boolean onlyContiguous, boolean doSpan) {
1303        int qcResult=0;
1304        int minNoMaybeCP=minCompNoMaybeCP;
1305
1306        /*
1307         * prevBoundary points to the last character before the current one
1308         * that has a composition boundary before it with ccc==0 and quick check "yes".
1309         */
1310        int prevBoundary=src;
1311        int prevSrc;
1312        int c=0;
1313        int norm16=0;
1314        int prevCC=0;
1315
1316        for(;;) {
1317            // count code units below the minimum or with irrelevant data for the quick check
1318            for(prevSrc=src;;) {
1319                if(src==limit) {
1320                    return (src<<1)|qcResult;  // "yes" or "maybe"
1321                }
1322                if( (c=s.charAt(src))<minNoMaybeCP ||
1323                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1324                ) {
1325                    ++src;
1326                } else if(!UTF16.isSurrogate((char)c)) {
1327                    break;
1328                } else {
1329                    char c2;
1330                    if(UTF16Plus.isSurrogateLead(c)) {
1331                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1332                            c=Character.toCodePoint((char)c, c2);
1333                        }
1334                    } else /* trail surrogate */ {
1335                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1336                            --src;
1337                            c=Character.toCodePoint(c2, (char)c);
1338                        }
1339                    }
1340                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1341                        src+=Character.charCount(c);
1342                    } else {
1343                        break;
1344                    }
1345                }
1346            }
1347            if(src!=prevSrc) {
1348                // Set prevBoundary to the last character in the quick check loop.
1349                prevBoundary=src-1;
1350                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1351                        Character.isHighSurrogate(s.charAt(prevBoundary-1))
1352                ) {
1353                    --prevBoundary;
1354                }
1355                prevCC=0;
1356                // The start of the current character (c).
1357                prevSrc=src;
1358            }
1359
1360            src+=Character.charCount(c);
1361            /*
1362             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1363             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1364             * or has ccc!=0.
1365             */
1366            if(isMaybeOrNonZeroCC(norm16)) {
1367                int cc=getCCFromYesOrMaybe(norm16);
1368                if( onlyContiguous &&  // FCC
1369                    cc!=0 &&
1370                    prevCC==0 &&
1371                    prevBoundary<prevSrc &&
1372                    // prevCC==0 && prevBoundary<prevSrc tell us that
1373                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1374                    // passed the quick check "yes && ccc==0" test.
1375                    // Check whether the last character was a "yesYes" or a "yesNo".
1376                    // If a "yesNo", then we get its trailing ccc from its
1377                    // mapping and check for canonical order.
1378                    // All other cases are ok.
1379                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1380                ) {
1381                    // Fails FCD test.
1382                } else if(prevCC<=cc || cc==0) {
1383                    prevCC=cc;
1384                    if(norm16<MIN_YES_YES_WITH_CC) {
1385                        if(!doSpan) {
1386                            qcResult=1;
1387                        } else {
1388                            return prevBoundary<<1;  // spanYes does not care to know it's "maybe"
1389                        }
1390                    }
1391                    continue;
1392                }
1393            }
1394            return prevBoundary<<1;  // "no"
1395        }
1396    }
1397    public void composeAndAppend(CharSequence s,
1398                                 boolean doCompose,
1399                                 boolean onlyContiguous,
1400                                 ReorderingBuffer buffer) {
1401        int src=0, limit=s.length();
1402        if(!buffer.isEmpty()) {
1403            int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
1404            if(0!=firstStarterInSrc) {
1405                int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1406                                                               buffer.length());
1407                StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1408                                                       firstStarterInSrc+16);
1409                middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1410                buffer.removeSuffix(buffer.length()-lastStarterInDest);
1411                middle.append(s, 0, firstStarterInSrc);
1412                compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1413                src=firstStarterInSrc;
1414            }
1415        }
1416        if(doCompose) {
1417            compose(s, src, limit, onlyContiguous, true, buffer);
1418        } else {
1419            buffer.append(s, src, limit);
1420        }
1421    }
1422    // Dual functionality:
1423    // buffer!=NULL: normalize
1424    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1425    public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1426        // Note: In this function we use buffer->appendZeroCC() because we track
1427        // the lead and trail combining classes here, rather than leaving it to
1428        // the ReorderingBuffer.
1429        // The exception is the call to decomposeShort() which uses the buffer
1430        // in the normal way.
1431
1432        // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1433        // Similar to the prevBoundary in the compose() implementation.
1434        int prevBoundary=src;
1435        int prevSrc;
1436        int c=0;
1437        int prevFCD16=0;
1438        int fcd16=0;
1439
1440        for(;;) {
1441            // count code units with lccc==0
1442            for(prevSrc=src; src!=limit;) {
1443                if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
1444                    prevFCD16=~c;
1445                    ++src;
1446                } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1447                    prevFCD16=0;
1448                    ++src;
1449                } else {
1450                    if(UTF16.isSurrogate((char)c)) {
1451                        char c2;
1452                        if(UTF16Plus.isSurrogateLead(c)) {
1453                            if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1454                                c=Character.toCodePoint((char)c, c2);
1455                            }
1456                        } else /* trail surrogate */ {
1457                            if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1458                                --src;
1459                                c=Character.toCodePoint(c2, (char)c);
1460                            }
1461                        }
1462                    }
1463                    if((fcd16=getFCD16FromNormData(c))<=0xff) {
1464                        prevFCD16=fcd16;
1465                        src+=Character.charCount(c);
1466                    } else {
1467                        break;
1468                    }
1469                }
1470            }
1471            // copy these code units all at once
1472            if(src!=prevSrc) {
1473                if(src==limit) {
1474                    if(buffer!=null) {
1475                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
1476                    }
1477                    break;
1478                }
1479                prevBoundary=src;
1480                // We know that the previous character's lccc==0.
1481                if(prevFCD16<0) {
1482                    // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1483                    int prev=~prevFCD16;
1484                    prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1485                    if(prevFCD16>1) {
1486                        --prevBoundary;
1487                    }
1488                } else {
1489                    int p=src-1;
1490                    if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1491                        Character.isHighSurrogate(s.charAt(p-1))
1492                    ) {
1493                        --p;
1494                        // Need to fetch the previous character's FCD value because
1495                        // prevFCD16 was just for the trail surrogate code point.
1496                        prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1497                        // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1498                    }
1499                    if(prevFCD16>1) {
1500                        prevBoundary=p;
1501                    }
1502                }
1503                if(buffer!=null) {
1504                    // The last lccc==0 character is excluded from the
1505                    // flush-and-append call in case it needs to be modified.
1506                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1507                    buffer.append(s, prevBoundary, src);
1508                }
1509                // The start of the current character (c).
1510                prevSrc=src;
1511            } else if(src==limit) {
1512                break;
1513            }
1514
1515            src+=Character.charCount(c);
1516            // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1517            // Check for proper order, and decompose locally if necessary.
1518            if((prevFCD16&0xff)<=(fcd16>>8)) {
1519                // proper order: prev tccc <= current lccc
1520                if((fcd16&0xff)<=1) {
1521                    prevBoundary=src;
1522                }
1523                if(buffer!=null) {
1524                    buffer.appendZeroCC(c);
1525                }
1526                prevFCD16=fcd16;
1527                continue;
1528            } else if(buffer==null) {
1529                return prevBoundary;  // quick check "no"
1530            } else {
1531                /*
1532                 * Back out the part of the source that we copied or appended
1533                 * already but is now going to be decomposed.
1534                 * prevSrc is set to after what was copied/appended.
1535                 */
1536                buffer.removeSuffix(prevSrc-prevBoundary);
1537                /*
1538                 * Find the part of the source that needs to be decomposed,
1539                 * up to the next safe boundary.
1540                 */
1541                src=findNextFCDBoundary(s, src, limit);
1542                /*
1543                 * The source text does not fulfill the conditions for FCD.
1544                 * Decompose and reorder a limited piece of the text.
1545                 */
1546                decomposeShort(s, prevBoundary, src, buffer);
1547                prevBoundary=src;
1548                prevFCD16=0;
1549            }
1550        }
1551        return src;
1552    }
1553    public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
1554        int src=0, limit=s.length();
1555        if(!buffer.isEmpty()) {
1556            int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
1557            if(0!=firstBoundaryInSrc) {
1558                int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
1559                                                               buffer.length());
1560                StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
1561                                                       firstBoundaryInSrc+16);
1562                middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
1563                buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
1564                middle.append(s, 0, firstBoundaryInSrc);
1565                makeFCD(middle, 0, middle.length(), buffer);
1566                src=firstBoundaryInSrc;
1567            }
1568        }
1569        if(doMakeFCD) {
1570            makeFCD(s, src, limit, buffer);
1571        } else {
1572            buffer.append(s, src, limit);
1573        }
1574    }
1575
1576    // Note: hasDecompBoundary() could be implemented as aliases to
1577    // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
1578    // at the cost of building the FCD trie for a decomposition normalizer.
1579    public boolean hasDecompBoundary(int c, boolean before) {
1580        for(;;) {
1581            if(c<minDecompNoCP) {
1582                return true;
1583            }
1584            int norm16=getNorm16(c);
1585            if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
1586                return true;
1587            } else if(norm16>MIN_NORMAL_MAYBE_YES) {
1588                return false;  // ccc!=0
1589            } else if(isDecompNoAlgorithmic(norm16)) {
1590                c=mapAlgorithmic(c, norm16);
1591            } else {
1592                // c decomposes, get everything from the variable-length extra data
1593                int firstUnit=extraData.charAt(norm16);
1594                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1595                    return false;
1596                }
1597                if(!before) {
1598                    // decomp after-boundary: same as hasFCDBoundaryAfter(),
1599                    // fcd16<=1 || trailCC==0
1600                    if(firstUnit>0x1ff) {
1601                        return false;  // trailCC>1
1602                    }
1603                    if(firstUnit<=0xff) {
1604                        return true;  // trailCC==0
1605                    }
1606                    // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1607                }
1608                // true if leadCC==0 (hasFCDBoundaryBefore())
1609                return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
1610            }
1611        }
1612    }
1613    public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1614
1615    public boolean hasCompBoundaryBefore(int c) {
1616        return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
1617    }
1618    public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {
1619        for(;;) {
1620            int norm16=getNorm16(c);
1621            if(isInert(norm16)) {
1622                return true;
1623            } else if(norm16<=minYesNo) {
1624                // Hangul: norm16==minYesNo
1625                // Hangul LVT has a boundary after it.
1626                // Hangul LV and non-inert yesYes characters combine forward.
1627                return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
1628            } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1629                return false;
1630            } else if(isDecompNoAlgorithmic(norm16)) {
1631                c=mapAlgorithmic(c, norm16);
1632            } else {
1633                // c decomposes, get everything from the variable-length extra data.
1634                // If testInert, then c must be a yesNo character which has lccc=0,
1635                // otherwise it could be a noNo.
1636                int firstUnit=extraData.charAt(norm16);
1637                // true if
1638                //   not MAPPING_NO_COMP_BOUNDARY_AFTER
1639                //     (which is set if
1640                //       c is not deleted, and
1641                //       it and its decomposition do not combine forward, and it has a starter)
1642                //   and if FCC then trailCC<=1
1643                return
1644                    (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1645                    (!onlyContiguous || firstUnit<=0x1ff);
1646            }
1647        }
1648    }
1649
1650    public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
1651    public boolean hasFCDBoundaryAfter(int c) {
1652        int fcd16=getFCD16(c);
1653        return fcd16<=1 || (fcd16&0xff)==0;
1654    }
1655    public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
1656
1657    private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1658    private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1659    private static boolean isInert(int norm16) { return norm16==0; }
1660    private static boolean isJamoL(int norm16) { return norm16==1; }
1661    private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1662    private boolean isHangul(int norm16) { return norm16==minYesNo; }
1663    private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1664    // UBool isCompYes(uint16_t norm16) const {
1665    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1666    // }
1667    // UBool isCompYesOrMaybe(uint16_t norm16) const {
1668    //     return norm16<minNoNo || minMaybeYes<=norm16;
1669    // }
1670    // private boolean hasZeroCCFromDecompYes(int norm16) {
1671    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1672    // }
1673    private boolean isDecompYesAndZeroCC(int norm16) {
1674        return norm16<minYesNo ||
1675               norm16==JAMO_VT ||
1676               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1677    }
1678    /**
1679     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1680     * the MaybeYes which combine-forward and have ccc=0.
1681     * (Standard Unicode 5.2 normalization does not have such characters.)
1682     */
1683    private boolean isMostDecompYesAndZeroCC(int norm16) {
1684        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1685    }
1686    private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1687
1688    // For use with isCompYes().
1689    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1690    // static uint8_t getCCFromYes(uint16_t norm16) {
1691    //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
1692    // }
1693    private int getCCFromNoNo(int norm16) {
1694        if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1695            return extraData.charAt(norm16-1)&0xff;
1696        } else {
1697            return 0;
1698        }
1699    }
1700    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
1701    int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
1702        int c;
1703        if(cpStart==(cpLimit-1)) {
1704            c=s.charAt(cpStart);
1705        } else {
1706            c=Character.codePointAt(s, cpStart);
1707        }
1708        int prevNorm16=getNorm16(c);
1709        if(prevNorm16<=minYesNo) {
1710            return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
1711        } else {
1712            return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo
1713        }
1714    }
1715
1716    // Requires algorithmic-NoNo.
1717    private int mapAlgorithmic(int c, int norm16) {
1718        return c+norm16-(minMaybeYes-MAX_DELTA-1);
1719    }
1720
1721    // Requires minYesNo<norm16<limitNoNo.
1722    // private int getMapping(int norm16) { return /*extraData+*/norm16; }
1723
1724    /**
1725     * @return index into maybeYesCompositions, or -1
1726     */
1727    private int getCompositionsListForDecompYes(int norm16) {
1728        if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
1729            return -1;
1730        } else {
1731            if((norm16-=minMaybeYes)<0) {
1732                // norm16<minMaybeYes: index into extraData which is a substring at
1733                //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1734                // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1735                norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1736            }
1737            return norm16;
1738        }
1739    }
1740    /**
1741     * @return index into maybeYesCompositions
1742     */
1743    private int getCompositionsListForComposite(int norm16) {
1744        // composite has both mapping & compositions list
1745        int firstUnit=extraData.charAt(norm16);
1746        return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+  // mapping in maybeYesCompositions
1747            1+  // +1 to skip the first unit with the mapping lenth
1748            (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1749    }
1750    /**
1751     * @param c code point must have compositions
1752     * @return index into maybeYesCompositions
1753     */
1754    private int getCompositionsList(int norm16) {
1755        return isDecompYes(norm16) ?
1756                getCompositionsListForDecompYes(norm16) :
1757                getCompositionsListForComposite(norm16);
1758    }
1759
1760    // Decompose a short piece of text which is likely to contain characters that
1761    // fail the quick check loop and/or where the quick check loop's overhead
1762    // is unlikely to be amortized.
1763    // Called by the compose() and makeFCD() implementations.
1764    // Public in Java for collation implementation code.
1765    public void decomposeShort(CharSequence s, int src, int limit,
1766                               ReorderingBuffer buffer) {
1767        while(src<limit) {
1768            int c=Character.codePointAt(s, src);
1769            src+=Character.charCount(c);
1770            decompose(c, getNorm16(c), buffer);
1771        }
1772    }
1773    private void decompose(int c, int norm16,
1774                           ReorderingBuffer buffer) {
1775        // Only loops for 1:1 algorithmic mappings.
1776        for(;;) {
1777            // get the decomposition and the lead and trail cc's
1778            if(isDecompYes(norm16)) {
1779                // c does not decompose
1780                buffer.append(c, getCCFromYesOrMaybe(norm16));
1781            } else if(isHangul(norm16)) {
1782                // Hangul syllable: decompose algorithmically
1783                Hangul.decompose(c, buffer);
1784            } else if(isDecompNoAlgorithmic(norm16)) {
1785                c=mapAlgorithmic(c, norm16);
1786                norm16=getNorm16(c);
1787                continue;
1788            } else {
1789                // c decomposes, get everything from the variable-length extra data
1790                int firstUnit=extraData.charAt(norm16);
1791                int length=firstUnit&MAPPING_LENGTH_MASK;
1792                int leadCC, trailCC;
1793                trailCC=firstUnit>>8;
1794                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1795                    leadCC=extraData.charAt(norm16-1)>>8;
1796                } else {
1797                    leadCC=0;
1798                }
1799                ++norm16;  // skip over the firstUnit
1800                buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
1801            }
1802            return;
1803        }
1804    }
1805
1806    /**
1807     * Finds the recomposition result for
1808     * a forward-combining "lead" character,
1809     * specified with a pointer to its compositions list,
1810     * and a backward-combining "trail" character.
1811     *
1812     * <p>If the lead and trail characters combine, then this function returns
1813     * the following "compositeAndFwd" value:
1814     * <pre>
1815     * Bits 21..1  composite character
1816     * Bit      0  set if the composite is a forward-combining starter
1817     * </pre>
1818     * otherwise it returns -1.
1819     *
1820     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1821     * encoded as either pairs or triples of 16-bit units.
1822     * The last entry has the high bit of its first unit set.
1823     *
1824     * <p>The list is sorted by ascending trail characters (there are no duplicates).
1825     * A linear search is used.
1826     *
1827     * <p>See normalizer2impl.h for a more detailed description
1828     * of the compositions list format.
1829     */
1830    private static int combine(String compositions, int list, int trail) {
1831        int key1, firstUnit;
1832        if(trail<COMP_1_TRAIL_LIMIT) {
1833            // trail character is 0..33FF
1834            // result entry may have 2 or 3 units
1835            key1=(trail<<1);
1836            while(key1>(firstUnit=compositions.charAt(list))) {
1837                list+=2+(firstUnit&COMP_1_TRIPLE);
1838            }
1839            if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1840                if((firstUnit&COMP_1_TRIPLE)!=0) {
1841                    return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1842                } else {
1843                    return compositions.charAt(list+1);
1844                }
1845            }
1846        } else {
1847            // trail character is 3400..10FFFF
1848            // result entry has 3 units
1849            key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1850            int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1851            int secondUnit;
1852            for(;;) {
1853                if(key1>(firstUnit=compositions.charAt(list))) {
1854                    list+=2+(firstUnit&COMP_1_TRIPLE);
1855                } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1856                    if(key2>(secondUnit=compositions.charAt(list+1))) {
1857                        if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1858                            break;
1859                        } else {
1860                            list+=3;
1861                        }
1862                    } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1863                        return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1864                    } else {
1865                        break;
1866                    }
1867                } else {
1868                    break;
1869                }
1870            }
1871        }
1872        return -1;
1873    }
1874    /**
1875     * @param list some character's compositions list
1876     * @param set recursively receives the composites from these compositions
1877     */
1878    private void addComposites(int list, UnicodeSet set) {
1879        int firstUnit, compositeAndFwd;
1880        do {
1881            firstUnit=maybeYesCompositions.charAt(list);
1882            if((firstUnit&COMP_1_TRIPLE)==0) {
1883                compositeAndFwd=maybeYesCompositions.charAt(list+1);
1884                list+=2;
1885            } else {
1886                compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
1887                                maybeYesCompositions.charAt(list+2);
1888                list+=3;
1889            }
1890            int composite=compositeAndFwd>>1;
1891            if((compositeAndFwd&1)!=0) {
1892                addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
1893            }
1894            set.add(composite);
1895        } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1896    }
1897    /*
1898     * Recomposes the buffer text starting at recomposeStartIndex
1899     * (which is in NFD - decomposed and canonically ordered),
1900     * and truncates the buffer contents.
1901     *
1902     * Note that recomposition never lengthens the text:
1903     * Any character consists of either one or two code units;
1904     * a composition may contain at most one more code unit than the original starter,
1905     * while the combining mark that is removed has at least one code unit.
1906     */
1907    private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1908                           boolean onlyContiguous) {
1909        StringBuilder sb=buffer.getStringBuilder();
1910        int p=recomposeStartIndex;
1911        if(p==sb.length()) {
1912            return;
1913        }
1914
1915        int starter, pRemove;
1916        int compositionsList;
1917        int c, compositeAndFwd;
1918        int norm16;
1919        int cc, prevCC;
1920        boolean starterIsSupplementary;
1921
1922        // Some of the following variables are not used until we have a forward-combining starter
1923        // and are only initialized now to avoid compiler warnings.
1924        compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1925        starter=-1;
1926        starterIsSupplementary=false;
1927        prevCC=0;
1928
1929        for(;;) {
1930            c=sb.codePointAt(p);
1931            p+=Character.charCount(c);
1932            norm16=getNorm16(c);
1933            cc=getCCFromYesOrMaybe(norm16);
1934            if( // this character combines backward and
1935                isMaybe(norm16) &&
1936                // we have seen a starter that combines forward and
1937                compositionsList>=0 &&
1938                // the backward-combining character is not blocked
1939                (prevCC<cc || prevCC==0)
1940            ) {
1941                if(isJamoVT(norm16)) {
1942                    // c is a Jamo V/T, see if we can compose it with the previous character.
1943                    if(c<Hangul.JAMO_T_BASE) {
1944                        // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1945                        char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1946                        if(prev<Hangul.JAMO_L_COUNT) {
1947                            pRemove=p-1;
1948                            char syllable=(char)
1949                                (Hangul.HANGUL_BASE+
1950                                 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1951                                 Hangul.JAMO_T_COUNT);
1952                            char t;
1953                            if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1954                                ++p;
1955                                syllable+=t;  // The next character was a Jamo T.
1956                            }
1957                            sb.setCharAt(starter, syllable);
1958                            // remove the Jamo V/T
1959                            sb.delete(pRemove, p);
1960                            p=pRemove;
1961                        }
1962                    }
1963                    /*
1964                     * No "else" for Jamo T:
1965                     * Since the input is in NFD, there are no Hangul LV syllables that
1966                     * a Jamo T could combine with.
1967                     * All Jamo Ts are combined above when handling Jamo Vs.
1968                     */
1969                    if(p==sb.length()) {
1970                        break;
1971                    }
1972                    compositionsList=-1;
1973                    continue;
1974                } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1975                    // The starter and the combining mark (c) do combine.
1976                    int composite=compositeAndFwd>>1;
1977
1978                    // Remove the combining mark.
1979                    pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1980                    sb.delete(pRemove, p);
1981                    p=pRemove;
1982                    // Replace the starter with the composite.
1983                    if(starterIsSupplementary) {
1984                        if(composite>0xffff) {
1985                            // both are supplementary
1986                            sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1987                            sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1988                        } else {
1989                            sb.setCharAt(starter, (char)c);
1990                            sb.deleteCharAt(starter+1);
1991                            // The composite is shorter than the starter,
1992                            // move the intermediate characters forward one.
1993                            starterIsSupplementary=false;
1994                            --p;
1995                        }
1996                    } else if(composite>0xffff) {
1997                        // The composite is longer than the starter,
1998                        // move the intermediate characters back one.
1999                        starterIsSupplementary=true;
2000                        sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2001                        sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
2002                        ++p;
2003                    } else {
2004                        // both are on the BMP
2005                        sb.setCharAt(starter, (char)composite);
2006                    }
2007
2008                    // Keep prevCC because we removed the combining mark.
2009
2010                    if(p==sb.length()) {
2011                        break;
2012                    }
2013                    // Is the composite a starter that combines forward?
2014                    if((compositeAndFwd&1)!=0) {
2015                        compositionsList=
2016                            getCompositionsListForComposite(getNorm16(composite));
2017                    } else {
2018                        compositionsList=-1;
2019                    }
2020
2021                    // We combined; continue with looking for compositions.
2022                    continue;
2023                }
2024            }
2025
2026            // no combination this time
2027            prevCC=cc;
2028            if(p==sb.length()) {
2029                break;
2030            }
2031
2032            // If c did not combine, then check if it is a starter.
2033            if(cc==0) {
2034                // Found a new starter.
2035                if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
2036                    // It may combine with something, prepare for it.
2037                    if(c<=0xffff) {
2038                        starterIsSupplementary=false;
2039                        starter=p-1;
2040                    } else {
2041                        starterIsSupplementary=true;
2042                        starter=p-2;
2043                    }
2044                }
2045            } else if(onlyContiguous) {
2046                // FCC: no discontiguous compositions; any intervening character blocks.
2047                compositionsList=-1;
2048            }
2049        }
2050        buffer.flush();
2051    }
2052
2053    public int composePair(int a, int b) {
2054        int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
2055        int list;
2056        if(isInert(norm16)) {
2057            return -1;
2058        } else if(norm16<minYesNoMappingsOnly) {
2059            if(isJamoL(norm16)) {
2060                b-=Hangul.JAMO_V_BASE;
2061                if(0<=b && b<Hangul.JAMO_V_COUNT) {
2062                    return
2063                        (Hangul.HANGUL_BASE+
2064                         ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
2065                         Hangul.JAMO_T_COUNT);
2066                } else {
2067                    return -1;
2068                }
2069            } else if(isHangul(norm16)) {
2070                b-=Hangul.JAMO_T_BASE;
2071                if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
2072                    return a+b;
2073                } else {
2074                    return -1;
2075                }
2076            } else {
2077                // 'a' has a compositions list in extraData
2078                list=norm16;
2079                if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
2080                    list+=  // mapping pointer
2081                        1+  // +1 to skip the first unit with the mapping lenth
2082                        (extraData.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
2083                }
2084                // Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
2085                list+=MIN_NORMAL_MAYBE_YES-minMaybeYes;
2086            }
2087        } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
2088            return -1;
2089        } else {
2090            list=norm16-minMaybeYes;  // offset into maybeYesCompositions
2091        }
2092        if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
2093            return -1;
2094        }
2095        return combine(maybeYesCompositions, list, b)>>1;
2096    }
2097
2098    /**
2099     * Does c have a composition boundary before it?
2100     * True if its decomposition begins with a character that has
2101     * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
2102     * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
2103     * (isCompYesAndZeroCC()) so we need not decompose.
2104     */
2105    private boolean hasCompBoundaryBefore(int c, int norm16) {
2106        for(;;) {
2107            if(isCompYesAndZeroCC(norm16)) {
2108                return true;
2109            } else if(isMaybeOrNonZeroCC(norm16)) {
2110                return false;
2111            } else if(isDecompNoAlgorithmic(norm16)) {
2112                c=mapAlgorithmic(c, norm16);
2113                norm16=getNorm16(c);
2114            } else {
2115                // c decomposes, get everything from the variable-length extra data
2116                int firstUnit=extraData.charAt(norm16);
2117                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
2118                    return false;
2119                }
2120                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
2121                    return false;  // non-zero leadCC
2122                }
2123                return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
2124            }
2125        }
2126    }
2127    private int findPreviousCompBoundary(CharSequence s, int p) {
2128        while(p>0) {
2129            int c=Character.codePointBefore(s, p);
2130            p-=Character.charCount(c);
2131            if(hasCompBoundaryBefore(c)) {
2132                break;
2133            }
2134            // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
2135            // but that's probably not worth the extra cost.
2136        }
2137        return p;
2138    }
2139    private int findNextCompBoundary(CharSequence s, int p, int limit) {
2140        while(p<limit) {
2141            int c=Character.codePointAt(s, p);
2142            int norm16=normTrie.get(c);
2143            if(hasCompBoundaryBefore(c, norm16)) {
2144                break;
2145            }
2146            p+=Character.charCount(c);
2147        }
2148        return p;
2149    }
2150
2151    private int findPreviousFCDBoundary(CharSequence s, int p) {
2152        while(p>0) {
2153            int c=Character.codePointBefore(s, p);
2154            p-=Character.charCount(c);
2155            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
2156                break;
2157            }
2158        }
2159        return p;
2160    }
2161    private int findNextFCDBoundary(CharSequence s, int p, int limit) {
2162        while(p<limit) {
2163            int c=Character.codePointAt(s, p);
2164            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
2165                break;
2166            }
2167            p+=Character.charCount(c);
2168        }
2169        return p;
2170    }
2171
2172    private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
2173        int canonValue=newData.get(decompLead);
2174        if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2175            // origin is the first character whose decomposition starts with
2176            // the character for which we are setting the value.
2177            newData.set(decompLead, canonValue|origin);
2178        } else {
2179            // origin is not the first character, or it is U+0000.
2180            UnicodeSet set;
2181            if((canonValue&CANON_HAS_SET)==0) {
2182                int firstOrigin=canonValue&CANON_VALUE_MASK;
2183                canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
2184                newData.set(decompLead, canonValue);
2185                canonStartSets.add(set=new UnicodeSet());
2186                if(firstOrigin!=0) {
2187                    set.add(firstOrigin);
2188                }
2189            } else {
2190                set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
2191            }
2192            set.add(origin);
2193        }
2194    }
2195
2196    @SuppressWarnings("unused")
2197    private VersionInfo dataVersion;
2198
2199    // Code point thresholds for quick check codes.
2200    private int minDecompNoCP;
2201    private int minCompNoMaybeCP;
2202
2203    // Norm16 value thresholds for quick check combinations and types of extra data.
2204    private int minYesNo;
2205    private int minYesNoMappingsOnly;
2206    private int minNoNo;
2207    private int limitNoNo;
2208    private int minMaybeYes;
2209
2210    private Trie2_16 normTrie;
2211    private String maybeYesCompositions;
2212    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2213    private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2214    private int[] tccc180;  // [0x180] tccc values for U+0000..U+017F
2215
2216    private Trie2_32 canonIterData;
2217    private ArrayList<UnicodeSet> canonStartSets;
2218
2219    // bits in canonIterData
2220    private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
2221    private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
2222    private static final int CANON_HAS_SET = 0x200000;
2223    private static final int CANON_VALUE_MASK = 0x1fffff;
2224}
2225