1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 *   Copyright (C) 2009-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 */
10
11package android.icu.impl;
12
13import java.io.IOException;
14import java.nio.ByteBuffer;
15import java.util.ArrayList;
16import java.util.Iterator;
17
18import android.icu.text.UTF16;
19import android.icu.text.UnicodeSet;
20import android.icu.util.ICUUncheckedIOException;
21import android.icu.util.VersionInfo;
22
23/**
24 * Low-level implementation of the Unicode Normalization Algorithm.
25 * For the data structure and details see the documentation at the end of
26 * C++ normalizer2impl.h and in the design doc at
27 * http://site.icu-project.org/design/normalization/custom
28 * @hide Only a subset of ICU is exposed in Android
29 */
30public final class Normalizer2Impl {
31    public static final class Hangul {
32        /* Korean Hangul and Jamo constants */
33        public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
34        public static final int JAMO_L_END=0x1112;
35        public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
36        public static final int JAMO_V_END=0x1175;
37        public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
38        public static final int JAMO_T_END=0x11c2;
39
40        public static final int HANGUL_BASE=0xac00;
41        public static final int HANGUL_END=0xd7a3;
42
43        public static final int JAMO_L_COUNT=19;
44        public static final int JAMO_V_COUNT=21;
45        public static final int JAMO_T_COUNT=28;
46
47        public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
48        public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
49
50        public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
51
52        public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
53        public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
54
55        public static boolean isHangul(int c) {
56            return HANGUL_BASE<=c && c<HANGUL_LIMIT;
57        }
58        public static boolean isHangulLV(int c) {
59            c-=HANGUL_BASE;
60            return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
61        }
62        public static boolean isJamoL(int c) {
63            return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
64        }
65        public static boolean isJamoV(int c) {
66            return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
67        }
68        public static boolean isJamoT(int c) {
69            int t=c-JAMO_T_BASE;
70            return 0<t && t<JAMO_T_COUNT;  // not JAMO_T_BASE itself
71        }
72        public static boolean isJamo(int c) {
73            return JAMO_L_BASE<=c && c<=JAMO_T_END &&
74                (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
75        }
76
77        /**
78         * Decomposes c, which must be a Hangul syllable, into buffer
79         * and returns the length of the decomposition (2 or 3).
80         */
81        public static int decompose(int c, Appendable buffer) {
82            try {
83                c-=HANGUL_BASE;
84                int c2=c%JAMO_T_COUNT;
85                c/=JAMO_T_COUNT;
86                buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
87                buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
88                if(c2==0) {
89                    return 2;
90                } else {
91                    buffer.append((char)(JAMO_T_BASE+c2));
92                    return 3;
93                }
94            } catch(IOException e) {
95                // Will not occur because we do not write to I/O.
96                throw new ICUUncheckedIOException(e);
97            }
98        }
99
100        /**
101         * Decomposes c, which must be a Hangul syllable, into buffer.
102         * This is the raw, not recursive, decomposition. Its length is always 2.
103         */
104        public static void getRawDecomposition(int c, Appendable buffer) {
105            try {
106                int orig=c;
107                c-=HANGUL_BASE;
108                int c2=c%JAMO_T_COUNT;
109                if(c2==0) {
110                    c/=JAMO_T_COUNT;
111                    buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
112                    buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
113                } else {
114                    buffer.append((char)(orig-c2));  // LV syllable
115                    buffer.append((char)(JAMO_T_BASE+c2));
116                }
117            } catch(IOException e) {
118                // Will not occur because we do not write to I/O.
119                throw new ICUUncheckedIOException(e);
120            }
121        }
122    }
123
124    /**
125     * Writable buffer that takes care of canonical ordering.
126     * Its Appendable methods behave like the C++ implementation's
127     * appendZeroCC() methods.
128     * <p>
129     * If dest is a StringBuilder, then the buffer writes directly to it.
130     * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
131     * until no further changes are necessary and whole segments are appended.
132     * append() methods that take combining-class values always write to the StringBuilder.
133     * Other append() methods flush and append to the Appendable.
134     */
135    public static final class ReorderingBuffer implements Appendable {
136        public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
137            impl=ni;
138            app=dest;
139            if(app instanceof StringBuilder) {
140                appIsStringBuilder=true;
141                str=(StringBuilder)dest;
142                // In Java, the constructor subsumes public void init(int destCapacity) {
143                str.ensureCapacity(destCapacity);
144                reorderStart=0;
145                if(str.length()==0) {
146                    lastCC=0;
147                } else {
148                    setIterator();
149                    lastCC=previousCC();
150                    // Set reorderStart after the last code point with cc<=1 if there is one.
151                    if(lastCC>1) {
152                        while(previousCC()>1) {}
153                    }
154                    reorderStart=codePointLimit;
155                }
156            } else {
157                appIsStringBuilder=false;
158                str=new StringBuilder();
159                reorderStart=0;
160                lastCC=0;
161            }
162        }
163
164        public boolean isEmpty() { return str.length()==0; }
165        public int length() { return str.length(); }
166        public int getLastCC() { return lastCC; }
167
168        public StringBuilder getStringBuilder() { return str; }
169
170        public boolean equals(CharSequence s, int start, int limit) {
171            return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
172        }
173
174        public void append(int c, int cc) {
175            if(lastCC<=cc || cc==0) {
176                str.appendCodePoint(c);
177                lastCC=cc;
178                if(cc<=1) {
179                    reorderStart=str.length();
180                }
181            } else {
182                insert(c, cc);
183            }
184        }
185        // s must be in NFD, otherwise change the implementation.
186        public void append(CharSequence s, int start, int limit,
187                           int leadCC, int trailCC) {
188            if(start==limit) {
189                return;
190            }
191            if(lastCC<=leadCC || leadCC==0) {
192                if(trailCC<=1) {
193                    reorderStart=str.length()+(limit-start);
194                } else if(leadCC<=1) {
195                    reorderStart=str.length()+1;  // Ok if not a code point boundary.
196                }
197                str.append(s, start, limit);
198                lastCC=trailCC;
199            } else {
200                int c=Character.codePointAt(s, start);
201                start+=Character.charCount(c);
202                insert(c, leadCC);  // insert first code point
203                while(start<limit) {
204                    c=Character.codePointAt(s, start);
205                    start+=Character.charCount(c);
206                    if(start<limit) {
207                        // s must be in NFD, otherwise we need to use getCC().
208                        leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
209                    } else {
210                        leadCC=trailCC;
211                    }
212                    append(c, leadCC);
213                }
214            }
215        }
216        // The following append() methods work like C++ appendZeroCC().
217        // They assume that the cc or trailCC of their input is 0.
218        // Most of them implement Appendable interface methods.
219        @Override
220        public ReorderingBuffer append(char c) {
221            str.append(c);
222            lastCC=0;
223            reorderStart=str.length();
224            return this;
225        }
226        public void appendZeroCC(int c) {
227            str.appendCodePoint(c);
228            lastCC=0;
229            reorderStart=str.length();
230        }
231        @Override
232        public ReorderingBuffer append(CharSequence s) {
233            if(s.length()!=0) {
234                str.append(s);
235                lastCC=0;
236                reorderStart=str.length();
237            }
238            return this;
239        }
240        @Override
241        public ReorderingBuffer append(CharSequence s, int start, int limit) {
242            if(start!=limit) {
243                str.append(s, start, limit);
244                lastCC=0;
245                reorderStart=str.length();
246            }
247            return this;
248        }
249        /**
250         * Flushes from the intermediate StringBuilder to the Appendable,
251         * if they are different objects.
252         * Used after recomposition.
253         * Must be called at the end when writing to a non-StringBuilder Appendable.
254         */
255        public void flush() {
256            if(appIsStringBuilder) {
257                reorderStart=str.length();
258            } else {
259                try {
260                    app.append(str);
261                    str.setLength(0);
262                    reorderStart=0;
263                } catch(IOException e) {
264                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
265                }
266            }
267            lastCC=0;
268        }
269        /**
270         * Flushes from the intermediate StringBuilder to the Appendable,
271         * if they are different objects.
272         * Then appends the new text to the Appendable or StringBuilder.
273         * Normally used after quick check loops find a non-empty sequence.
274         */
275        public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
276            if(appIsStringBuilder) {
277                str.append(s, start, limit);
278                reorderStart=str.length();
279            } else {
280                try {
281                    app.append(str).append(s, start, limit);
282                    str.setLength(0);
283                    reorderStart=0;
284                } catch(IOException e) {
285                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
286                }
287            }
288            lastCC=0;
289            return this;
290        }
291        public void remove() {
292            str.setLength(0);
293            lastCC=0;
294            reorderStart=0;
295        }
296        public void removeSuffix(int suffixLength) {
297            int oldLength=str.length();
298            str.delete(oldLength-suffixLength, oldLength);
299            lastCC=0;
300            reorderStart=str.length();
301        }
302
303        /*
304         * TODO: Revisit whether it makes sense to track reorderStart.
305         * It is set to after the last known character with cc<=1,
306         * which stops previousCC() before it reads that character and looks up its cc.
307         * previousCC() is normally only called from insert().
308         * In other words, reorderStart speeds up the insertion of a combining mark
309         * into a multi-combining mark sequence where it does not belong at the end.
310         * This might not be worth the trouble.
311         * On the other hand, it's not a huge amount of trouble.
312         *
313         * We probably need it for UNORM_SIMPLE_APPEND.
314         */
315
316        // Inserts c somewhere before the last character.
317        // Requires 0<cc<lastCC which implies reorderStart<limit.
318        private void insert(int c, int cc) {
319            for(setIterator(), skipPrevious(); previousCC()>cc;) {}
320            // insert c at codePointLimit, after the character with prevCC<=cc
321            if(c<=0xffff) {
322                str.insert(codePointLimit, (char)c);
323                if(cc<=1) {
324                    reorderStart=codePointLimit+1;
325                }
326            } else {
327                str.insert(codePointLimit, Character.toChars(c));
328                if(cc<=1) {
329                    reorderStart=codePointLimit+2;
330                }
331            }
332        }
333
334        private final Normalizer2Impl impl;
335        private final Appendable app;
336        private final StringBuilder str;
337        private final boolean appIsStringBuilder;
338        private int reorderStart;
339        private int lastCC;
340
341        // private backward iterator
342        private void setIterator() { codePointStart=str.length(); }
343        private void skipPrevious() {  // Requires 0<codePointStart.
344            codePointLimit=codePointStart;
345            codePointStart=str.offsetByCodePoints(codePointStart, -1);
346        }
347        private int previousCC() {  // Returns 0 if there is no previous character.
348            codePointLimit=codePointStart;
349            if(reorderStart>=codePointStart) {
350                return 0;
351            }
352            int c=str.codePointBefore(codePointStart);
353            codePointStart-=Character.charCount(c);
354            return impl.getCCFromYesOrMaybeCP(c);
355        }
356
357        private int codePointStart, codePointLimit;
358    }
359
360    // TODO: Propose as public API on the UTF16 class.
361    // TODO: Propose widening UTF16 methods that take char to take int.
362    // TODO: Propose widening UTF16 methods that take String to take CharSequence.
363    public static final class UTF16Plus {
364        /**
365         * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
366         * is it a lead surrogate?
367         * @param c code unit or code point
368         * @return true or false
369         */
370        public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
371        /**
372         * Compares two CharSequence objects for binary equality.
373         * @param s1 first sequence
374         * @param s2 second sequence
375         * @return true if s1 contains the same text as s2
376         */
377        public static boolean equal(CharSequence s1,  CharSequence s2) {
378            if(s1==s2) {
379                return true;
380            }
381            int length=s1.length();
382            if(length!=s2.length()) {
383                return false;
384            }
385            for(int i=0; i<length; ++i) {
386                if(s1.charAt(i)!=s2.charAt(i)) {
387                    return false;
388                }
389            }
390            return true;
391        }
392        /**
393         * Compares two CharSequence subsequences for binary equality.
394         * @param s1 first sequence
395         * @param start1 start offset in first sequence
396         * @param limit1 limit offset in first sequence
397         * @param s2 second sequence
398         * @param start2 start offset in second sequence
399         * @param limit2 limit offset in second sequence
400         * @return true if s1.subSequence(start1, limit1) contains the same text
401         *              as s2.subSequence(start2, limit2)
402         */
403        public static boolean equal(CharSequence s1, int start1, int limit1,
404                                    CharSequence s2, int start2, int limit2) {
405            if((limit1-start1)!=(limit2-start2)) {
406                return false;
407            }
408            if(s1==s2 && start1==start2) {
409                return true;
410            }
411            while(start1<limit1) {
412                if(s1.charAt(start1++)!=s2.charAt(start2++)) {
413                    return false;
414                }
415            }
416            return true;
417        }
418    }
419
420    public Normalizer2Impl() {}
421
422    private static final class IsAcceptable implements ICUBinary.Authenticate {
423        @Override
424        public boolean isDataVersionAcceptable(byte version[]) {
425            return version[0]==3;
426        }
427    }
428    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
429    private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
430
431    public Normalizer2Impl load(ByteBuffer bytes) {
432        try {
433            dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
434            int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
435            if(indexesLength<=IX_MIN_LCCC_CP) {
436                throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
437            }
438            int[] inIndexes=new int[indexesLength];
439            inIndexes[0]=indexesLength*4;
440            for(int i=1; i<indexesLength; ++i) {
441                inIndexes[i]=bytes.getInt();
442            }
443
444            minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
445            minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
446            minLcccCP=inIndexes[IX_MIN_LCCC_CP];
447
448            minYesNo=inIndexes[IX_MIN_YES_NO];
449            minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
450            minNoNo=inIndexes[IX_MIN_NO_NO];
451            minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
452            minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
453            minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
454            limitNoNo=inIndexes[IX_LIMIT_NO_NO];
455            minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
456            assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
457            centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
458
459            // Read the normTrie.
460            int offset=inIndexes[IX_NORM_TRIE_OFFSET];
461            int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
462            normTrie=Trie2_16.createFromSerialized(bytes);
463            int trieLength=normTrie.getSerializedLength();
464            if(trieLength>(nextOffset-offset)) {
465                throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
466            }
467            ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
468
469            // Read the composition and mapping data.
470            offset=nextOffset;
471            nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
472            int numChars=(nextOffset-offset)/2;
473            if(numChars!=0) {
474                maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
475                extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
476            }
477
478            // smallFCD: new in formatVersion 2
479            offset=nextOffset;
480            smallFCD=new byte[0x100];
481            bytes.get(smallFCD);
482
483            return this;
484        } catch(IOException e) {
485            throw new ICUUncheckedIOException(e);
486        }
487    }
488    public Normalizer2Impl load(String name) {
489        return load(ICUBinary.getRequiredData(name));
490    }
491
492    private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
493        if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) {
494            set.add(start, end);
495        } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
496            int fcd16=getFCD16(start);
497            if(fcd16>0xff) { set.add(start, end); }
498        }
499    }
500
501    private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
502        /* add the start code point to the USet */
503        set.add(start);
504        if(start!=end && isAlgorithmicNoNo(value) && (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) {
505            // Range of code points with same-norm16-value algorithmic decompositions.
506            // They might have different non-zero FCD16 values.
507            int prevFCD16=getFCD16(start);
508            while(++start<=end) {
509                int fcd16=getFCD16(start);
510                if(fcd16!=prevFCD16) {
511                    set.add(start);
512                    prevFCD16=fcd16;
513                }
514            }
515        }
516    }
517
518    public void addLcccChars(UnicodeSet set) {
519        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
520        Trie2.Range range;
521        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
522            enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
523        }
524    }
525
526    public void addPropertyStarts(UnicodeSet set) {
527        /* add the start code point of each same-value range of each trie */
528        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
529        Trie2.Range range;
530        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
531            enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
532        }
533
534        /* add Hangul LV syllables and LV+1 because of skippables */
535        for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
536            set.add(c);
537            set.add(c+1);
538        }
539        set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
540    }
541
542    public void addCanonIterPropertyStarts(UnicodeSet set) {
543        /* add the start code point of each same-value range of the canonical iterator data trie */
544        ensureCanonIterData();
545        // currently only used for the SEGMENT_STARTER property
546        Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
547        Trie2.Range range;
548        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
549            /* add the start code point to the USet */
550            set.add(range.startCodePoint);
551        }
552    }
553    private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
554        @Override
555        public int map(int in) {
556            return in&CANON_NOT_SEGMENT_STARTER;
557        }
558    };
559
560    // low-level properties ------------------------------------------------ ***
561
562    // Note: Normalizer2Impl.java r30983 (2011-nov-27)
563    // still had getFCDTrie() which built and cached an FCD trie.
564    // That provided faster access to FCD data than getFCD16FromNormData()
565    // but required synchronization and consumed some 10kB of heap memory
566    // in any process that uses FCD (e.g., via collation).
567    // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
568    // at least for ASCII & CJK.
569
570    /**
571     * Builds the canonical-iterator data for this instance.
572     * This is required before any of {@link #isCanonSegmentStarter(int)} or
573     * {@link #getCanonStartSet(int, UnicodeSet)} are called,
574     * or else they crash.
575     * @return this
576     */
577    public synchronized Normalizer2Impl ensureCanonIterData() {
578        if(canonIterData==null) {
579            Trie2Writable newData=new Trie2Writable(0, 0);
580            canonStartSets=new ArrayList<UnicodeSet>();
581            Iterator<Trie2.Range> trieIterator=normTrie.iterator();
582            Trie2.Range range;
583            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
584                final int norm16=range.value;
585                if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
586                    // Inert, or 2-way mapping (including Hangul syllable).
587                    // We do not write a canonStartSet for any yesNo character.
588                    // Composites from 2-way mappings are added at runtime from the
589                    // starter's compositions list, and the other characters in
590                    // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
591                    // "maybe" characters.
592                    continue;
593                }
594                for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
595                    final int oldValue=newData.get(c);
596                    int newValue=oldValue;
597                    if(isMaybeOrNonZeroCC(norm16)) {
598                        // not a segment starter if it occurs in a decomposition or has cc!=0
599                        newValue|=CANON_NOT_SEGMENT_STARTER;
600                        if(norm16<MIN_NORMAL_MAYBE_YES) {
601                            newValue|=CANON_HAS_COMPOSITIONS;
602                        }
603                    } else if(norm16<minYesNo) {
604                        newValue|=CANON_HAS_COMPOSITIONS;
605                    } else {
606                        // c has a one-way decomposition
607                        int c2=c;
608                        // Do not modify the whole-range norm16 value.
609                        int norm16_2=norm16;
610                        if (isDecompNoAlgorithmic(norm16_2)) {
611                            // Maps to an isCompYesAndZeroCC.
612                            c2 = mapAlgorithmic(c2, norm16_2);
613                            norm16_2 = getNorm16(c2);
614                            // No compatibility mappings for the CanonicalIterator.
615                            assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
616                        }
617                        if (norm16_2 > minYesNo) {
618                            // c decomposes, get everything from the variable-length extra data
619                            int mapping=norm16_2>>OFFSET_SHIFT;
620                            int firstUnit=extraData.charAt(mapping);
621                            int length=firstUnit&MAPPING_LENGTH_MASK;
622                            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
623                                if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) {
624                                    newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
625                                }
626                            }
627                            // Skip empty mappings (no characters in the decomposition).
628                            if(length!=0) {
629                                ++mapping;  // skip over the firstUnit
630                                // add c to first code point's start set
631                                int limit=mapping+length;
632                                c2=extraData.codePointAt(mapping);
633                                addToStartSet(newData, c, c2);
634                                // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
635                                // one-way mapping. A 2-way mapping is possible here after
636                                // intermediate algorithmic mapping.
637                                if(norm16_2>=minNoNo) {
638                                    while((mapping+=Character.charCount(c2))<limit) {
639                                        c2=extraData.codePointAt(mapping);
640                                        int c2Value=newData.get(c2);
641                                        if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
642                                            newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
643                                        }
644                                    }
645                                }
646                            }
647                        } else {
648                            // c decomposed to c2 algorithmically; c has cc==0
649                            addToStartSet(newData, c, c2);
650                        }
651                    }
652                    if(newValue!=oldValue) {
653                        newData.set(c, newValue);
654                    }
655                }
656            }
657            canonIterData=newData.toTrie2_32();
658        }
659        return this;
660    }
661
662    public int getNorm16(int c) { return normTrie.get(c); }
663
664    public int getCompQuickCheck(int norm16) {
665        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
666            return 1;  // yes
667        } else if(minMaybeYes<=norm16) {
668            return 2;  // maybe
669        } else {
670            return 0;  // no
671        }
672    }
673    public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
674    public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
675    public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
676
677    public int getCC(int norm16) {
678        if(norm16>=MIN_NORMAL_MAYBE_YES) {
679            return getCCFromNormalYesOrMaybe(norm16);
680        }
681        if(norm16<minNoNo || limitNoNo<=norm16) {
682            return 0;
683        }
684        return getCCFromNoNo(norm16);
685    }
686    public static int getCCFromNormalYesOrMaybe(int norm16) {
687        return (norm16 >> OFFSET_SHIFT) & 0xff;
688    }
689    public static int getCCFromYesOrMaybe(int norm16) {
690        return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
691    }
692    public int getCCFromYesOrMaybeCP(int c) {
693        if (c < minCompNoMaybeCP) { return 0; }
694        return getCCFromYesOrMaybe(getNorm16(c));
695    }
696
697    /**
698     * Returns the FCD data for code point c.
699     * @param c A Unicode code point.
700     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
701     */
702    public int getFCD16(int c) {
703        if(c<minDecompNoCP) {
704            return 0;
705        } else if(c<=0xffff) {
706            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
707        }
708        return getFCD16FromNormData(c);
709    }
710    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
711    public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
712        // 0<=lead<=0xffff
713        byte bits=smallFCD[lead>>8];
714        if(bits==0) { return false; }
715        return ((bits>>((lead>>5)&7))&1)!=0;
716    }
717
718    /** Gets the FCD value from the regular normalization data. */
719    public int getFCD16FromNormData(int c) {
720        int norm16=getNorm16(c);
721        if (norm16 >= limitNoNo) {
722            if(norm16>=MIN_NORMAL_MAYBE_YES) {
723                // combining mark
724                norm16=getCCFromNormalYesOrMaybe(norm16);
725                return norm16|(norm16<<8);
726            } else if(norm16>=minMaybeYes) {
727                return 0;
728            } else {  // isDecompNoAlgorithmic(norm16)
729                int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
730                if (deltaTrailCC <= DELTA_TCCC_1) {
731                    return deltaTrailCC >> OFFSET_SHIFT;
732                }
733                // Maps to an isCompYesAndZeroCC.
734                c=mapAlgorithmic(c, norm16);
735                norm16=getNorm16(c);
736            }
737        }
738        if(norm16<=minYesNo || isHangulLVT(norm16)) {
739            // no decomposition or Hangul syllable, all zeros
740            return 0;
741        }
742        // c decomposes, get everything from the variable-length extra data
743        int mapping=norm16>>OFFSET_SHIFT;
744        int firstUnit=extraData.charAt(mapping);
745        int fcd16=firstUnit>>8;  // tccc
746        if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
747            fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
748        }
749        return fcd16;
750    }
751
752    /**
753     * Gets the decomposition for one code point.
754     * @param c code point
755     * @return c's decomposition, if it has one; returns null if it does not have a decomposition
756     */
757    public String getDecomposition(int c) {
758        int norm16;
759        if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
760            // c does not decompose
761            return null;
762        }
763        int decomp = -1;
764        if(isDecompNoAlgorithmic(norm16)) {
765            // Maps to an isCompYesAndZeroCC.
766            decomp=c=mapAlgorithmic(c, norm16);
767            // The mapping might decompose further.
768            norm16 = getNorm16(c);
769        }
770        if (norm16 < minYesNo) {
771            if(decomp<0) {
772                return null;
773            } else {
774                return UTF16.valueOf(decomp);
775            }
776        } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
777            // Hangul syllable: decompose algorithmically
778            StringBuilder buffer=new StringBuilder();
779            Hangul.decompose(c, buffer);
780            return buffer.toString();
781        }
782        // c decomposes, get everything from the variable-length extra data
783        int mapping=norm16>>OFFSET_SHIFT;
784        int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
785        return extraData.substring(mapping, mapping+length);
786    }
787
788    /**
789     * Gets the raw decomposition for one code point.
790     * @param c code point
791     * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
792     */
793    public String getRawDecomposition(int c) {
794        int norm16;
795        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
796            // c does not decompose
797            return null;
798        } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
799            // Hangul syllable: decompose algorithmically
800            StringBuilder buffer=new StringBuilder();
801            Hangul.getRawDecomposition(c, buffer);
802            return buffer.toString();
803        } else if(isDecompNoAlgorithmic(norm16)) {
804            return UTF16.valueOf(mapAlgorithmic(c, norm16));
805        }
806        // c decomposes, get everything from the variable-length extra data
807        int mapping=norm16>>OFFSET_SHIFT;
808        int firstUnit=extraData.charAt(mapping);
809        int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
810        if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
811            // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
812            // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
813            int rawMapping=mapping-((firstUnit>>7)&1)-1;
814            char rm0=extraData.charAt(rawMapping);
815            if(rm0<=MAPPING_LENGTH_MASK) {
816                return extraData.substring(rawMapping-rm0, rawMapping);
817            } else {
818                // Copy the normal mapping and replace its first two code units with rm0.
819                StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
820                mapping+=1+2;  // skip over the firstUnit and the first two mapping code units
821                return buffer.append(extraData, mapping, mapping+mLength-2).toString();
822            }
823        } else {
824            mapping+=1;  // skip over the firstUnit
825            return extraData.substring(mapping, mapping+mLength);
826        }
827    }
828
829    /**
830     * Returns true if code point c starts a canonical-iterator string segment.
831     * <b>{@link #ensureCanonIterData()} must have been called before this method,
832     * or else this method will crash.</b>
833     * @param c A Unicode code point.
834     * @return true if c starts a canonical-iterator string segment.
835     */
836    public boolean isCanonSegmentStarter(int c) {
837        return canonIterData.get(c)>=0;
838    }
839    /**
840     * Returns true if there are characters whose decomposition starts with c.
841     * If so, then the set is cleared and then filled with those characters.
842     * <b>{@link #ensureCanonIterData()} must have been called before this method,
843     * or else this method will crash.</b>
844     * @param c A Unicode code point.
845     * @param set A UnicodeSet to receive the characters whose decompositions
846     *        start with c, if there are any.
847     * @return true if there are characters whose decomposition starts with c.
848     */
849    public boolean getCanonStartSet(int c, UnicodeSet set) {
850        int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
851        if(canonValue==0) {
852            return false;
853        }
854        set.clear();
855        int value=canonValue&CANON_VALUE_MASK;
856        if((canonValue&CANON_HAS_SET)!=0) {
857            set.addAll(canonStartSets.get(value));
858        } else if(value!=0) {
859            set.add(value);
860        }
861        if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
862            int norm16=getNorm16(c);
863            if(norm16==JAMO_L) {
864                int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
865                set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
866            } else {
867                addComposites(getCompositionsList(norm16), set);
868            }
869        }
870        return true;
871    }
872
873    // Fixed norm16 values.
874    public static final int MIN_YES_YES_WITH_CC=0xfe02;
875    public static final int JAMO_VT=0xfe00;
876    public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
877    public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
878    public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE
879
880    // norm16 bit 0 is comp-boundary-after.
881    public static final int HAS_COMP_BOUNDARY_AFTER=1;
882    public static final int OFFSET_SHIFT=1;
883
884    // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
885    // tccc (0, 1, >1) for quick FCC boundary-after tests.
886    public static final int DELTA_TCCC_0=0;
887    public static final int DELTA_TCCC_1=2;
888    public static final int DELTA_TCCC_GT_1=4;
889    public static final int DELTA_TCCC_MASK=6;
890    public static final int DELTA_SHIFT=3;
891
892    public static final int MAX_DELTA=0x40;
893
894    // Byte offsets from the start of the data, after the generic header.
895    public static final int IX_NORM_TRIE_OFFSET=0;
896    public static final int IX_EXTRA_DATA_OFFSET=1;
897    public static final int IX_SMALL_FCD_OFFSET=2;
898    public static final int IX_RESERVED3_OFFSET=3;
899    public static final int IX_TOTAL_SIZE=7;
900
901    // Code point thresholds for quick check codes.
902    public static final int IX_MIN_DECOMP_NO_CP=8;
903    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
904
905    // Norm16 value thresholds for quick check combinations and types of extra data.
906
907    /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
908    public static final int IX_MIN_YES_NO=10;
909    /** Mappings are comp-normalized. */
910    public static final int IX_MIN_NO_NO=11;
911    public static final int IX_LIMIT_NO_NO=12;
912    public static final int IX_MIN_MAYBE_YES=13;
913
914    /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
915    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
916    /** Mappings are not comp-normalized but have a comp boundary before. */
917    public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
918    /** Mappings do not have a comp boundary before. */
919    public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
920    /** Mappings to the empty string. */
921    public static final int IX_MIN_NO_NO_EMPTY=17;
922
923    public static final int IX_MIN_LCCC_CP=18;
924    public static final int IX_COUNT=20;
925
926    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
927    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
928    // unused bit 0x20;
929    public static final int MAPPING_LENGTH_MASK=0x1f;
930
931    public static final int COMP_1_LAST_TUPLE=0x8000;
932    public static final int COMP_1_TRIPLE=1;
933    public static final int COMP_1_TRAIL_LIMIT=0x3400;
934    public static final int COMP_1_TRAIL_MASK=0x7ffe;
935    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
936    public static final int COMP_2_TRAIL_SHIFT=6;
937    public static final int COMP_2_TRAIL_MASK=0xffc0;
938
939    // higher-level functionality ------------------------------------------ ***
940
941    // NFD without an NFD Normalizer2 instance.
942    public Appendable decompose(CharSequence s, StringBuilder dest) {
943        decompose(s, 0, s.length(), dest, s.length());
944        return dest;
945    }
946    /**
947     * Decomposes s[src, limit[ and writes the result to dest.
948     * limit can be NULL if src is NUL-terminated.
949     * destLengthEstimate is the initial dest buffer capacity and can be -1.
950     */
951    public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
952                   int destLengthEstimate) {
953        if(destLengthEstimate<0) {
954            destLengthEstimate=limit-src;
955        }
956        dest.setLength(0);
957        ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
958        decompose(s, src, limit, buffer);
959    }
960
961    // Dual functionality:
962    // buffer!=NULL: normalize
963    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
964    public int decompose(CharSequence s, int src, int limit,
965                         ReorderingBuffer buffer) {
966        int minNoCP=minDecompNoCP;
967
968        int prevSrc;
969        int c=0;
970        int norm16=0;
971
972        // only for quick check
973        int prevBoundary=src;
974        int prevCC=0;
975
976        for(;;) {
977            // count code units below the minimum or with irrelevant data for the quick check
978            for(prevSrc=src; src!=limit;) {
979                if( (c=s.charAt(src))<minNoCP ||
980                    isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
981                ) {
982                    ++src;
983                } else if(!UTF16.isSurrogate((char)c)) {
984                    break;
985                } else {
986                    char c2;
987                    if(UTF16Plus.isSurrogateLead(c)) {
988                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
989                            c=Character.toCodePoint((char)c, c2);
990                        }
991                    } else /* trail surrogate */ {
992                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
993                            --src;
994                            c=Character.toCodePoint(c2, (char)c);
995                        }
996                    }
997                    if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
998                        src+=Character.charCount(c);
999                    } else {
1000                        break;
1001                    }
1002                }
1003            }
1004            // copy these code units all at once
1005            if(src!=prevSrc) {
1006                if(buffer!=null) {
1007                    buffer.flushAndAppendZeroCC(s, prevSrc, src);
1008                } else {
1009                    prevCC=0;
1010                    prevBoundary=src;
1011                }
1012            }
1013            if(src==limit) {
1014                break;
1015            }
1016
1017            // Check one above-minimum, relevant code point.
1018            src+=Character.charCount(c);
1019            if(buffer!=null) {
1020                decompose(c, norm16, buffer);
1021            } else {
1022                if(isDecompYes(norm16)) {
1023                    int cc=getCCFromYesOrMaybe(norm16);
1024                    if(prevCC<=cc || cc==0) {
1025                        prevCC=cc;
1026                        if(cc<=1) {
1027                            prevBoundary=src;
1028                        }
1029                        continue;
1030                    }
1031                }
1032                return prevBoundary;  // "no" or cc out of order
1033            }
1034        }
1035        return src;
1036    }
1037    public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
1038        int limit=s.length();
1039        if(limit==0) {
1040            return;
1041        }
1042        if(doDecompose) {
1043            decompose(s, 0, limit, buffer);
1044            return;
1045        }
1046        // Just merge the strings at the boundary.
1047        int c=Character.codePointAt(s, 0);
1048        int src=0;
1049        int firstCC, prevCC, cc;
1050        firstCC=prevCC=cc=getCC(getNorm16(c));
1051        while(cc!=0) {
1052            prevCC=cc;
1053            src+=Character.charCount(c);
1054            if(src>=limit) {
1055                break;
1056            }
1057            c=Character.codePointAt(s, src);
1058            cc=getCC(getNorm16(c));
1059        };
1060        buffer.append(s, 0, src, firstCC, prevCC);
1061        buffer.append(s, src, limit);
1062    }
1063
1064    // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1065    // doCompose: normalize
1066    // !doCompose: isNormalized (buffer must be empty and initialized)
1067    public boolean compose(CharSequence s, int src, int limit,
1068                           boolean onlyContiguous,
1069                           boolean doCompose,
1070                           ReorderingBuffer buffer) {
1071        int prevBoundary=src;
1072        int minNoMaybeCP=minCompNoMaybeCP;
1073
1074        for (;;) {
1075            // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1076            // or with (compYes && ccc==0) properties.
1077            int prevSrc;
1078            int c = 0;
1079            int norm16 = 0;
1080            for (;;) {
1081                if (src == limit) {
1082                    if (prevBoundary != limit && doCompose) {
1083                        buffer.append(s, prevBoundary, limit);
1084                    }
1085                    return true;
1086                }
1087                if( (c=s.charAt(src))<minNoMaybeCP ||
1088                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1089                ) {
1090                    ++src;
1091                } else {
1092                    prevSrc = src++;
1093                    if(!UTF16.isSurrogate((char)c)) {
1094                        break;
1095                    } else {
1096                        char c2;
1097                        if(UTF16Plus.isSurrogateLead(c)) {
1098                            if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1099                                ++src;
1100                                c=Character.toCodePoint((char)c, c2);
1101                            }
1102                        } else /* trail surrogate */ {
1103                            if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1104                                --prevSrc;
1105                                c=Character.toCodePoint(c2, (char)c);
1106                            }
1107                        }
1108                        if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1109                            break;
1110                        }
1111                    }
1112                }
1113            }
1114            // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1115            // The current character is either a "noNo" (has a mapping)
1116            // or a "maybeYes" (combines backward)
1117            // or a "yesYes" with ccc!=0.
1118            // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1119
1120            // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1121            if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1122                if (!doCompose) {
1123                    return false;
1124                }
1125                // Fast path for mapping a character that is immediately surrounded by boundaries.
1126                // In this case, we need not decompose around the current character.
1127                if (isDecompNoAlgorithmic(norm16)) {
1128                    // Maps to a single isCompYesAndZeroCC character
1129                    // which also implies hasCompBoundaryBefore.
1130                    if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1131                            hasCompBoundaryBefore(s, src, limit)) {
1132                        if (prevBoundary != prevSrc) {
1133                            buffer.append(s, prevBoundary, prevSrc);
1134                        }
1135                        buffer.append(mapAlgorithmic(c, norm16), 0);
1136                        prevBoundary = src;
1137                        continue;
1138                    }
1139                } else if (norm16 < minNoNoCompBoundaryBefore) {
1140                    // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1141                    if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1142                            hasCompBoundaryBefore(s, src, limit)) {
1143                        if (prevBoundary != prevSrc) {
1144                            buffer.append(s, prevBoundary, prevSrc);
1145                        }
1146                        int mapping = norm16 >> OFFSET_SHIFT;
1147                        int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
1148                        buffer.append(extraData, mapping, mapping + length);
1149                        prevBoundary = src;
1150                        continue;
1151                    }
1152                } else if (norm16 >= minNoNoEmpty) {
1153                    // The current character maps to nothing.
1154                    // Simply omit it from the output if there is a boundary before _or_ after it.
1155                    // The character itself implies no boundaries.
1156                    if (hasCompBoundaryBefore(s, src, limit) ||
1157                            hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
1158                        if (prevBoundary != prevSrc) {
1159                            buffer.append(s, prevBoundary, prevSrc);
1160                        }
1161                        prevBoundary = src;
1162                        continue;
1163                    }
1164                }
1165                // Other "noNo" type, or need to examine more text around this character:
1166                // Fall through to the slow path.
1167            } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
1168                char prev=s.charAt(prevSrc-1);
1169                if(c<Hangul.JAMO_T_BASE) {
1170                    // The current character is a Jamo Vowel,
1171                    // compose with previous Jamo L and following Jamo T.
1172                    char l = (char)(prev-Hangul.JAMO_L_BASE);
1173                    if(l<Hangul.JAMO_L_COUNT) {
1174                        if (!doCompose) {
1175                            return false;
1176                        }
1177                        int t;
1178                        if (src != limit &&
1179                                0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
1180                                t < Hangul.JAMO_T_COUNT) {
1181                            // The next character is a Jamo T.
1182                            ++src;
1183                        } else if (hasCompBoundaryBefore(s, src, limit)) {
1184                            // No Jamo T follows, not even via decomposition.
1185                            t = 0;
1186                        } else {
1187                            t = -1;
1188                        }
1189                        if (t >= 0) {
1190                            int syllable = Hangul.HANGUL_BASE +
1191                                (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
1192                                Hangul.JAMO_T_COUNT + t;
1193                            --prevSrc;  // Replace the Jamo L as well.
1194                            if (prevBoundary != prevSrc) {
1195                                buffer.append(s, prevBoundary, prevSrc);
1196                            }
1197                            buffer.append((char)syllable);
1198                            prevBoundary = src;
1199                            continue;
1200                        }
1201                        // If we see L+V+x where x!=T then we drop to the slow path,
1202                        // decompose and recompose.
1203                        // This is to deal with NFKC finding normal L and V but a
1204                        // compatibility variant of a T.
1205                        // We need to either fully compose that combination here
1206                        // (which would complicate the code and may not work with strange custom data)
1207                        // or use the slow path.
1208                    }
1209                } else if (Hangul.isHangulLV(prev)) {
1210                    // The current character is a Jamo Trailing consonant,
1211                    // compose with previous Hangul LV that does not contain a Jamo T.
1212                    if (!doCompose) {
1213                        return false;
1214                    }
1215                    int syllable = prev + c - Hangul.JAMO_T_BASE;
1216                    --prevSrc;  // Replace the Hangul LV as well.
1217                    if (prevBoundary != prevSrc) {
1218                        buffer.append(s, prevBoundary, prevSrc);
1219                    }
1220                    buffer.append((char)syllable);
1221                    prevBoundary = src;
1222                    continue;
1223                }
1224                // No matching context, or may need to decompose surrounding text first:
1225                // Fall through to the slow path.
1226            } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1227                // One or more combining marks that do not combine-back:
1228                // Check for canonical order, copy unchanged if ok and
1229                // if followed by a character with a boundary-before.
1230                int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1231                if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
1232                    // Fails FCD test, need to decompose and contiguously recompose.
1233                    if (!doCompose) {
1234                        return false;
1235                    }
1236                } else {
1237                    // If !onlyContiguous (not FCC), then we ignore the tccc of
1238                    // the previous character which passed the quick check "yes && ccc==0" test.
1239                    int n16;
1240                    for (;;) {
1241                        if (src == limit) {
1242                            if (doCompose) {
1243                                buffer.append(s, prevBoundary, limit);
1244                            }
1245                            return true;
1246                        }
1247                        int prevCC = cc;
1248                        c = Character.codePointAt(s, src);
1249                        n16 = normTrie.get(c);
1250                        if (n16 >= MIN_YES_YES_WITH_CC) {
1251                            cc = getCCFromNormalYesOrMaybe(n16);
1252                            if (prevCC > cc) {
1253                                if (!doCompose) {
1254                                    return false;
1255                                }
1256                                break;
1257                            }
1258                        } else {
1259                            break;
1260                        }
1261                        src += Character.charCount(c);
1262                    }
1263                    // p is after the last in-order combining mark.
1264                    // If there is a boundary here, then we continue with no change.
1265                    if (norm16HasCompBoundaryBefore(n16)) {
1266                        if (isCompYesAndZeroCC(n16)) {
1267                            src += Character.charCount(c);
1268                        }
1269                        continue;
1270                    }
1271                    // Use the slow path. There is no boundary in [prevSrc, src[.
1272                }
1273            }
1274
1275            // Slow path: Find the nearest boundaries around the current character,
1276            // decompose and recompose.
1277            if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1278                c = Character.codePointBefore(s, prevSrc);
1279                norm16 = normTrie.get(c);
1280                if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1281                    prevSrc -= Character.charCount(c);
1282                }
1283            }
1284            if (doCompose && prevBoundary != prevSrc) {
1285                buffer.append(s, prevBoundary, prevSrc);
1286            }
1287            int recomposeStartIndex=buffer.length();
1288            // We know there is not a boundary here.
1289            decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
1290                           buffer);
1291            // Decompose until the next boundary.
1292            src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
1293                                 buffer);
1294            recompose(buffer, recomposeStartIndex, onlyContiguous);
1295            if(!doCompose) {
1296                if(!buffer.equals(s, prevSrc, src)) {
1297                    return false;
1298                }
1299                buffer.remove();
1300            }
1301            prevBoundary=src;
1302        }
1303    }
1304
1305    /**
1306     * Very similar to compose(): Make the same changes in both places if relevant.
1307     * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
1308     * !doSpan: quickCheck
1309     * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
1310     *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
1311     *         then the quick check result is "no"
1312     */
1313    public int composeQuickCheck(CharSequence s, int src, int limit,
1314                                 boolean onlyContiguous, boolean doSpan) {
1315        int qcResult=0;
1316        int prevBoundary=src;
1317        int minNoMaybeCP=minCompNoMaybeCP;
1318
1319        for(;;) {
1320            // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1321            // or with (compYes && ccc==0) properties.
1322            int prevSrc;
1323            int c = 0;
1324            int norm16 = 0;
1325            for (;;) {
1326                if(src==limit) {
1327                    return (src<<1)|qcResult;  // "yes" or "maybe"
1328                }
1329                if( (c=s.charAt(src))<minNoMaybeCP ||
1330                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1331                ) {
1332                    ++src;
1333                } else {
1334                    prevSrc = src++;
1335                    if(!UTF16.isSurrogate((char)c)) {
1336                        break;
1337                    } else {
1338                        char c2;
1339                        if(UTF16Plus.isSurrogateLead(c)) {
1340                            if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1341                                ++src;
1342                                c=Character.toCodePoint((char)c, c2);
1343                            }
1344                        } else /* trail surrogate */ {
1345                            if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1346                                --prevSrc;
1347                                c=Character.toCodePoint(c2, (char)c);
1348                            }
1349                        }
1350                        if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1351                            break;
1352                        }
1353                    }
1354                }
1355            }
1356            // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1357            // The current character is either a "noNo" (has a mapping)
1358            // or a "maybeYes" (combines backward)
1359            // or a "yesYes" with ccc!=0.
1360            // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1361
1362            int prevNorm16 = INERT;
1363            if (prevBoundary != prevSrc) {
1364                prevBoundary = prevSrc;
1365                if (!norm16HasCompBoundaryBefore(norm16)) {
1366                    c = Character.codePointBefore(s, prevSrc);
1367                    int n16 = getNorm16(c);
1368                    if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1369                        prevBoundary -= Character.charCount(c);
1370                        prevNorm16 = n16;
1371                    }
1372                }
1373            }
1374
1375            if(isMaybeOrNonZeroCC(norm16)) {
1376                int cc=getCCFromYesOrMaybe(norm16);
1377                if (onlyContiguous /* FCC */ && cc != 0 &&
1378                        getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1379                    // The [prevBoundary..prevSrc[ character
1380                    // passed the quick check "yes && ccc==0" test
1381                    // but is out of canonical order with the current combining mark.
1382                } else {
1383                    // If !onlyContiguous (not FCC), then we ignore the tccc of
1384                    // the previous character which passed the quick check "yes && ccc==0" test.
1385                    for (;;) {
1386                        if (norm16 < MIN_YES_YES_WITH_CC) {
1387                            if (!doSpan) {
1388                                qcResult = 1;
1389                            } else {
1390                                return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
1391                            }
1392                        }
1393                        if (src == limit) {
1394                            return (src<<1) | qcResult;  // "yes" or "maybe"
1395                        }
1396                        int prevCC = cc;
1397                        c = Character.codePointAt(s, src);
1398                        norm16 = getNorm16(c);
1399                        if (isMaybeOrNonZeroCC(norm16)) {
1400                            cc = getCCFromYesOrMaybe(norm16);
1401                            if (!(prevCC <= cc || cc == 0)) {
1402                                break;
1403                            }
1404                        } else {
1405                            break;
1406                        }
1407                        src += Character.charCount(c);
1408                    }
1409                    // src is after the last in-order combining mark.
1410                    if (isCompYesAndZeroCC(norm16)) {
1411                        prevBoundary = src;
1412                        src += Character.charCount(c);
1413                        continue;
1414                    }
1415                }
1416            }
1417            return prevBoundary<<1;  // "no"
1418        }
1419    }
1420    public void composeAndAppend(CharSequence s,
1421                                 boolean doCompose,
1422                                 boolean onlyContiguous,
1423                                 ReorderingBuffer buffer) {
1424        int src=0, limit=s.length();
1425        if(!buffer.isEmpty()) {
1426            int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1427            if(0!=firstStarterInSrc) {
1428                int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1429                                                               buffer.length(), onlyContiguous);
1430                StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1431                                                       firstStarterInSrc+16);
1432                middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1433                buffer.removeSuffix(buffer.length()-lastStarterInDest);
1434                middle.append(s, 0, firstStarterInSrc);
1435                compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1436                src=firstStarterInSrc;
1437            }
1438        }
1439        if(doCompose) {
1440            compose(s, src, limit, onlyContiguous, true, buffer);
1441        } else {
1442            buffer.append(s, src, limit);
1443        }
1444    }
1445    // Dual functionality:
1446    // buffer!=NULL: normalize
1447    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1448    public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1449        // Note: In this function we use buffer->appendZeroCC() because we track
1450        // the lead and trail combining classes here, rather than leaving it to
1451        // the ReorderingBuffer.
1452        // The exception is the call to decomposeShort() which uses the buffer
1453        // in the normal way.
1454
1455        // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1456        // Similar to the prevBoundary in the compose() implementation.
1457        int prevBoundary=src;
1458        int prevSrc;
1459        int c=0;
1460        int prevFCD16=0;
1461        int fcd16=0;
1462
1463        for(;;) {
1464            // count code units with lccc==0
1465            for(prevSrc=src; src!=limit;) {
1466                if((c=s.charAt(src))<minLcccCP) {
1467                    prevFCD16=~c;
1468                    ++src;
1469                } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1470                    prevFCD16=0;
1471                    ++src;
1472                } else {
1473                    if(UTF16.isSurrogate((char)c)) {
1474                        char c2;
1475                        if(UTF16Plus.isSurrogateLead(c)) {
1476                            if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1477                                c=Character.toCodePoint((char)c, c2);
1478                            }
1479                        } else /* trail surrogate */ {
1480                            if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1481                                --src;
1482                                c=Character.toCodePoint(c2, (char)c);
1483                            }
1484                        }
1485                    }
1486                    if((fcd16=getFCD16FromNormData(c))<=0xff) {
1487                        prevFCD16=fcd16;
1488                        src+=Character.charCount(c);
1489                    } else {
1490                        break;
1491                    }
1492                }
1493            }
1494            // copy these code units all at once
1495            if(src!=prevSrc) {
1496                if(src==limit) {
1497                    if(buffer!=null) {
1498                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
1499                    }
1500                    break;
1501                }
1502                prevBoundary=src;
1503                // We know that the previous character's lccc==0.
1504                if(prevFCD16<0) {
1505                    // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
1506                    int prev=~prevFCD16;
1507                    if(prev<minDecompNoCP) {
1508                        prevFCD16=0;
1509                    } else {
1510                        prevFCD16=getFCD16FromNormData(prev);
1511                        if(prevFCD16>1) {
1512                            --prevBoundary;
1513                        }
1514                    }
1515                } else {
1516                    int p=src-1;
1517                    if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1518                        Character.isHighSurrogate(s.charAt(p-1))
1519                    ) {
1520                        --p;
1521                        // Need to fetch the previous character's FCD value because
1522                        // prevFCD16 was just for the trail surrogate code point.
1523                        prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1524                        // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1525                    }
1526                    if(prevFCD16>1) {
1527                        prevBoundary=p;
1528                    }
1529                }
1530                if(buffer!=null) {
1531                    // The last lccc==0 character is excluded from the
1532                    // flush-and-append call in case it needs to be modified.
1533                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1534                    buffer.append(s, prevBoundary, src);
1535                }
1536                // The start of the current character (c).
1537                prevSrc=src;
1538            } else if(src==limit) {
1539                break;
1540            }
1541
1542            src+=Character.charCount(c);
1543            // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1544            // Check for proper order, and decompose locally if necessary.
1545            if((prevFCD16&0xff)<=(fcd16>>8)) {
1546                // proper order: prev tccc <= current lccc
1547                if((fcd16&0xff)<=1) {
1548                    prevBoundary=src;
1549                }
1550                if(buffer!=null) {
1551                    buffer.appendZeroCC(c);
1552                }
1553                prevFCD16=fcd16;
1554                continue;
1555            } else if(buffer==null) {
1556                return prevBoundary;  // quick check "no"
1557            } else {
1558                /*
1559                 * Back out the part of the source that we copied or appended
1560                 * already but is now going to be decomposed.
1561                 * prevSrc is set to after what was copied/appended.
1562                 */
1563                buffer.removeSuffix(prevSrc-prevBoundary);
1564                /*
1565                 * Find the part of the source that needs to be decomposed,
1566                 * up to the next safe boundary.
1567                 */
1568                src=findNextFCDBoundary(s, src, limit);
1569                /*
1570                 * The source text does not fulfill the conditions for FCD.
1571                 * Decompose and reorder a limited piece of the text.
1572                 */
1573                decomposeShort(s, prevBoundary, src, false, false, buffer);
1574                prevBoundary=src;
1575                prevFCD16=0;
1576            }
1577        }
1578        return src;
1579    }
1580    public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
1581        int src=0, limit=s.length();
1582        if(!buffer.isEmpty()) {
1583            int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
1584            if(0!=firstBoundaryInSrc) {
1585                int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
1586                                                               buffer.length());
1587                StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
1588                                                       firstBoundaryInSrc+16);
1589                middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
1590                buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
1591                middle.append(s, 0, firstBoundaryInSrc);
1592                makeFCD(middle, 0, middle.length(), buffer);
1593                src=firstBoundaryInSrc;
1594            }
1595        }
1596        if(doMakeFCD) {
1597            makeFCD(s, src, limit, buffer);
1598        } else {
1599            buffer.append(s, src, limit);
1600        }
1601    }
1602
1603    public boolean hasDecompBoundaryBefore(int c) {
1604        return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1605            norm16HasDecompBoundaryBefore(getNorm16(c));
1606    }
1607    public boolean norm16HasDecompBoundaryBefore(int norm16) {
1608        if (norm16 < minNoNoCompNoMaybeCC) {
1609            return true;
1610        }
1611        if (norm16 >= limitNoNo) {
1612            return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1613        }
1614        // c decomposes, get everything from the variable-length extra data
1615        int mapping=norm16>>OFFSET_SHIFT;
1616        int firstUnit=extraData.charAt(mapping);
1617        // true if leadCC==0 (hasFCDBoundaryBefore())
1618        return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1619    }
1620    public boolean hasDecompBoundaryAfter(int c) {
1621        if (c < minDecompNoCP) {
1622            return true;
1623        }
1624        if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1625            return true;
1626        }
1627        return norm16HasDecompBoundaryAfter(getNorm16(c));
1628    }
1629    public boolean norm16HasDecompBoundaryAfter(int norm16) {
1630        if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1631            return true;
1632        }
1633        if (norm16 >= limitNoNo) {
1634            if (isMaybeOrNonZeroCC(norm16)) {
1635                return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1636            }
1637            // Maps to an isCompYesAndZeroCC.
1638            return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1639        }
1640        // c decomposes, get everything from the variable-length extra data
1641        int mapping=norm16>>OFFSET_SHIFT;
1642        int firstUnit=extraData.charAt(mapping);
1643        // decomp after-boundary: same as hasFCDBoundaryAfter(),
1644        // fcd16<=1 || trailCC==0
1645        if(firstUnit>0x1ff) {
1646            return false;  // trailCC>1
1647        }
1648        if(firstUnit<=0xff) {
1649            return true;  // trailCC==0
1650        }
1651        // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1652        // true if leadCC==0 (hasFCDBoundaryBefore())
1653        return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1654    }
1655    public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1656
1657    public boolean hasCompBoundaryBefore(int c) {
1658        return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1659    }
1660    public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1661        return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1662    }
1663    public boolean isCompInert(int c, boolean onlyContiguous) {
1664        int norm16=getNorm16(c);
1665        return isCompYesAndZeroCC(norm16) &&
1666            (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1667            (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff);
1668    }
1669
1670    public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); }
1671    public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); }
1672    public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
1673
1674    private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1675    private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1676    private static boolean isInert(int norm16) { return norm16==INERT; }
1677    private static boolean isJamoL(int norm16) { return norm16==JAMO_L; }
1678    private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1679    private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
1680    private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
1681    private boolean isHangulLVT(int norm16) {
1682        return norm16==hangulLVT();
1683    }
1684    private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1685    // UBool isCompYes(uint16_t norm16) const {
1686    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1687    // }
1688    // UBool isCompYesOrMaybe(uint16_t norm16) const {
1689    //     return norm16<minNoNo || minMaybeYes<=norm16;
1690    // }
1691    // private boolean hasZeroCCFromDecompYes(int norm16) {
1692    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1693    // }
1694    private boolean isDecompYesAndZeroCC(int norm16) {
1695        return norm16<minYesNo ||
1696               norm16==JAMO_VT ||
1697               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1698    }
1699    /**
1700     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1701     * the MaybeYes which combine-forward and have ccc=0.
1702     * (Standard Unicode 10 normalization does not have such characters.)
1703     */
1704    private boolean isMostDecompYesAndZeroCC(int norm16) {
1705        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1706    }
1707    private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1708
1709    // For use with isCompYes().
1710    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1711    // static uint8_t getCCFromYes(uint16_t norm16) {
1712    //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1713    // }
1714    private int getCCFromNoNo(int norm16) {
1715        int mapping=norm16>>OFFSET_SHIFT;
1716        if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1717            return extraData.charAt(mapping-1)&0xff;
1718        } else {
1719            return 0;
1720        }
1721    }
1722    int getTrailCCFromCompYesAndZeroCC(int norm16) {
1723        if(norm16<=minYesNo) {
1724            return 0;  // yesYes and Hangul LV have ccc=tccc=0
1725        } else {
1726            // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1727            return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
1728        }
1729    }
1730
1731    // Requires algorithmic-NoNo.
1732    private int mapAlgorithmic(int c, int norm16) {
1733        return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1734    }
1735
1736    // Requires minYesNo<norm16<limitNoNo.
1737    // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1738
1739    /**
1740     * @return index into maybeYesCompositions, or -1
1741     */
1742    private int getCompositionsListForDecompYes(int norm16) {
1743        if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1744            return -1;
1745        } else {
1746            if((norm16-=minMaybeYes)<0) {
1747                // norm16<minMaybeYes: index into extraData which is a substring at
1748                //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1749                // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1750                norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1751            }
1752            return norm16>>OFFSET_SHIFT;
1753        }
1754    }
1755    /**
1756     * @return index into maybeYesCompositions
1757     */
1758    private int getCompositionsListForComposite(int norm16) {
1759        // A composite has both mapping & compositions list.
1760        int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1761        int firstUnit=maybeYesCompositions.charAt(list);
1762        return list+  // mapping in maybeYesCompositions
1763            1+  // +1 to skip the first unit with the mapping length
1764            (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1765    }
1766    private int getCompositionsListForMaybe(int norm16) {
1767        // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
1768        return (norm16-minMaybeYes)>>OFFSET_SHIFT;
1769    }
1770    /**
1771     * @param c code point must have compositions
1772     * @return index into maybeYesCompositions
1773     */
1774    private int getCompositionsList(int norm16) {
1775        return isDecompYes(norm16) ?
1776                getCompositionsListForDecompYes(norm16) :
1777                getCompositionsListForComposite(norm16);
1778    }
1779
1780    // Decompose a short piece of text which is likely to contain characters that
1781    // fail the quick check loop and/or where the quick check loop's overhead
1782    // is unlikely to be amortized.
1783    // Called by the compose() and makeFCD() implementations.
1784    // Public in Java for collation implementation code.
1785    private int decomposeShort(
1786            CharSequence s, int src, int limit,
1787            boolean stopAtCompBoundary, boolean onlyContiguous,
1788            ReorderingBuffer buffer) {
1789        while(src<limit) {
1790            int c=Character.codePointAt(s, src);
1791            if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1792                return src;
1793            }
1794            int norm16 = getNorm16(c);
1795            if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1796                return src;
1797            }
1798            src+=Character.charCount(c);
1799            decompose(c, norm16, buffer);
1800            if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1801                return src;
1802            }
1803        }
1804        return src;
1805    }
1806    private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1807        // get the decomposition and the lead and trail cc's
1808        if (norm16 >= limitNoNo) {
1809            if (isMaybeOrNonZeroCC(norm16)) {
1810                buffer.append(c, getCCFromYesOrMaybe(norm16));
1811                return;
1812            }
1813            // Maps to an isCompYesAndZeroCC.
1814            c=mapAlgorithmic(c, norm16);
1815            norm16=getNorm16(c);
1816        }
1817        if (norm16 < minYesNo) {
1818            // c does not decompose
1819            buffer.append(c, 0);
1820        } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1821            // Hangul syllable: decompose algorithmically
1822            Hangul.decompose(c, buffer);
1823        } else {
1824            // c decomposes, get everything from the variable-length extra data
1825            int mapping=norm16>>OFFSET_SHIFT;
1826            int firstUnit=extraData.charAt(mapping);
1827            int length=firstUnit&MAPPING_LENGTH_MASK;
1828            int leadCC, trailCC;
1829            trailCC=firstUnit>>8;
1830            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1831                leadCC=extraData.charAt(mapping-1)>>8;
1832            } else {
1833                leadCC=0;
1834            }
1835            ++mapping;  // skip over the firstUnit
1836            buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
1837        }
1838    }
1839
1840    /**
1841     * Finds the recomposition result for
1842     * a forward-combining "lead" character,
1843     * specified with a pointer to its compositions list,
1844     * and a backward-combining "trail" character.
1845     *
1846     * <p>If the lead and trail characters combine, then this function returns
1847     * the following "compositeAndFwd" value:
1848     * <pre>
1849     * Bits 21..1  composite character
1850     * Bit      0  set if the composite is a forward-combining starter
1851     * </pre>
1852     * otherwise it returns -1.
1853     *
1854     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1855     * encoded as either pairs or triples of 16-bit units.
1856     * The last entry has the high bit of its first unit set.
1857     *
1858     * <p>The list is sorted by ascending trail characters (there are no duplicates).
1859     * A linear search is used.
1860     *
1861     * <p>See normalizer2impl.h for a more detailed description
1862     * of the compositions list format.
1863     */
1864    private static int combine(String compositions, int list, int trail) {
1865        int key1, firstUnit;
1866        if(trail<COMP_1_TRAIL_LIMIT) {
1867            // trail character is 0..33FF
1868            // result entry may have 2 or 3 units
1869            key1=(trail<<1);
1870            while(key1>(firstUnit=compositions.charAt(list))) {
1871                list+=2+(firstUnit&COMP_1_TRIPLE);
1872            }
1873            if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1874                if((firstUnit&COMP_1_TRIPLE)!=0) {
1875                    return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1876                } else {
1877                    return compositions.charAt(list+1);
1878                }
1879            }
1880        } else {
1881            // trail character is 3400..10FFFF
1882            // result entry has 3 units
1883            key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1884            int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1885            int secondUnit;
1886            for(;;) {
1887                if(key1>(firstUnit=compositions.charAt(list))) {
1888                    list+=2+(firstUnit&COMP_1_TRIPLE);
1889                } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1890                    if(key2>(secondUnit=compositions.charAt(list+1))) {
1891                        if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1892                            break;
1893                        } else {
1894                            list+=3;
1895                        }
1896                    } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1897                        return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1898                    } else {
1899                        break;
1900                    }
1901                } else {
1902                    break;
1903                }
1904            }
1905        }
1906        return -1;
1907    }
1908    /**
1909     * @param list some character's compositions list
1910     * @param set recursively receives the composites from these compositions
1911     */
1912    private void addComposites(int list, UnicodeSet set) {
1913        int firstUnit, compositeAndFwd;
1914        do {
1915            firstUnit=maybeYesCompositions.charAt(list);
1916            if((firstUnit&COMP_1_TRIPLE)==0) {
1917                compositeAndFwd=maybeYesCompositions.charAt(list+1);
1918                list+=2;
1919            } else {
1920                compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
1921                                maybeYesCompositions.charAt(list+2);
1922                list+=3;
1923            }
1924            int composite=compositeAndFwd>>1;
1925            if((compositeAndFwd&1)!=0) {
1926                addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
1927            }
1928            set.add(composite);
1929        } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1930    }
1931    /*
1932     * Recomposes the buffer text starting at recomposeStartIndex
1933     * (which is in NFD - decomposed and canonically ordered),
1934     * and truncates the buffer contents.
1935     *
1936     * Note that recomposition never lengthens the text:
1937     * Any character consists of either one or two code units;
1938     * a composition may contain at most one more code unit than the original starter,
1939     * while the combining mark that is removed has at least one code unit.
1940     */
1941    private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1942                           boolean onlyContiguous) {
1943        StringBuilder sb=buffer.getStringBuilder();
1944        int p=recomposeStartIndex;
1945        if(p==sb.length()) {
1946            return;
1947        }
1948
1949        int starter, pRemove;
1950        int compositionsList;
1951        int c, compositeAndFwd;
1952        int norm16;
1953        int cc, prevCC;
1954        boolean starterIsSupplementary;
1955
1956        // Some of the following variables are not used until we have a forward-combining starter
1957        // and are only initialized now to avoid compiler warnings.
1958        compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1959        starter=-1;
1960        starterIsSupplementary=false;
1961        prevCC=0;
1962
1963        for(;;) {
1964            c=sb.codePointAt(p);
1965            p+=Character.charCount(c);
1966            norm16=getNorm16(c);
1967            cc=getCCFromYesOrMaybe(norm16);
1968            if( // this character combines backward and
1969                isMaybe(norm16) &&
1970                // we have seen a starter that combines forward and
1971                compositionsList>=0 &&
1972                // the backward-combining character is not blocked
1973                (prevCC<cc || prevCC==0)
1974            ) {
1975                if(isJamoVT(norm16)) {
1976                    // c is a Jamo V/T, see if we can compose it with the previous character.
1977                    if(c<Hangul.JAMO_T_BASE) {
1978                        // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1979                        char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1980                        if(prev<Hangul.JAMO_L_COUNT) {
1981                            pRemove=p-1;
1982                            char syllable=(char)
1983                                (Hangul.HANGUL_BASE+
1984                                 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1985                                 Hangul.JAMO_T_COUNT);
1986                            char t;
1987                            if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1988                                ++p;
1989                                syllable+=t;  // The next character was a Jamo T.
1990                            }
1991                            sb.setCharAt(starter, syllable);
1992                            // remove the Jamo V/T
1993                            sb.delete(pRemove, p);
1994                            p=pRemove;
1995                        }
1996                    }
1997                    /*
1998                     * No "else" for Jamo T:
1999                     * Since the input is in NFD, there are no Hangul LV syllables that
2000                     * a Jamo T could combine with.
2001                     * All Jamo Ts are combined above when handling Jamo Vs.
2002                     */
2003                    if(p==sb.length()) {
2004                        break;
2005                    }
2006                    compositionsList=-1;
2007                    continue;
2008                } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
2009                    // The starter and the combining mark (c) do combine.
2010                    int composite=compositeAndFwd>>1;
2011
2012                    // Remove the combining mark.
2013                    pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
2014                    sb.delete(pRemove, p);
2015                    p=pRemove;
2016                    // Replace the starter with the composite.
2017                    if(starterIsSupplementary) {
2018                        if(composite>0xffff) {
2019                            // both are supplementary
2020                            sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2021                            sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
2022                        } else {
2023                            sb.setCharAt(starter, (char)c);
2024                            sb.deleteCharAt(starter+1);
2025                            // The composite is shorter than the starter,
2026                            // move the intermediate characters forward one.
2027                            starterIsSupplementary=false;
2028                            --p;
2029                        }
2030                    } else if(composite>0xffff) {
2031                        // The composite is longer than the starter,
2032                        // move the intermediate characters back one.
2033                        starterIsSupplementary=true;
2034                        sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2035                        sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
2036                        ++p;
2037                    } else {
2038                        // both are on the BMP
2039                        sb.setCharAt(starter, (char)composite);
2040                    }
2041
2042                    // Keep prevCC because we removed the combining mark.
2043
2044                    if(p==sb.length()) {
2045                        break;
2046                    }
2047                    // Is the composite a starter that combines forward?
2048                    if((compositeAndFwd&1)!=0) {
2049                        compositionsList=
2050                            getCompositionsListForComposite(getNorm16(composite));
2051                    } else {
2052                        compositionsList=-1;
2053                    }
2054
2055                    // We combined; continue with looking for compositions.
2056                    continue;
2057                }
2058            }
2059
2060            // no combination this time
2061            prevCC=cc;
2062            if(p==sb.length()) {
2063                break;
2064            }
2065
2066            // If c did not combine, then check if it is a starter.
2067            if(cc==0) {
2068                // Found a new starter.
2069                if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
2070                    // It may combine with something, prepare for it.
2071                    if(c<=0xffff) {
2072                        starterIsSupplementary=false;
2073                        starter=p-1;
2074                    } else {
2075                        starterIsSupplementary=true;
2076                        starter=p-2;
2077                    }
2078                }
2079            } else if(onlyContiguous) {
2080                // FCC: no discontiguous compositions; any intervening character blocks.
2081                compositionsList=-1;
2082            }
2083        }
2084        buffer.flush();
2085    }
2086
2087    public int composePair(int a, int b) {
2088        int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
2089        int list;
2090        if(isInert(norm16)) {
2091            return -1;
2092        } else if(norm16<minYesNoMappingsOnly) {
2093            // a combines forward.
2094            if(isJamoL(norm16)) {
2095                b-=Hangul.JAMO_V_BASE;
2096                if(0<=b && b<Hangul.JAMO_V_COUNT) {
2097                    return
2098                        (Hangul.HANGUL_BASE+
2099                         ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
2100                         Hangul.JAMO_T_COUNT);
2101                } else {
2102                    return -1;
2103                }
2104            } else if(isHangulLV(norm16)) {
2105                b-=Hangul.JAMO_T_BASE;
2106                if(0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
2107                    return a+b;
2108                } else {
2109                    return -1;
2110                }
2111            } else {
2112                // 'a' has a compositions list in extraData
2113                list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
2114                if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
2115                    list+=  // mapping pointer
2116                        1+  // +1 to skip the first unit with the mapping length
2117                        (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
2118                }
2119            }
2120        } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
2121            return -1;
2122        } else {
2123            list=getCompositionsListForMaybe(norm16);  // offset into maybeYesCompositions
2124        }
2125        if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
2126            return -1;
2127        }
2128        return combine(maybeYesCompositions, list, b)>>1;
2129    }
2130
2131    /**
2132     * Does c have a composition boundary before it?
2133     * True if its decomposition begins with a character that has
2134     * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
2135     * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
2136     * (isCompYesAndZeroCC()) so we need not decompose.
2137     */
2138    private boolean hasCompBoundaryBefore(int c, int norm16) {
2139        return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
2140    }
2141    private boolean norm16HasCompBoundaryBefore(int norm16) {
2142        return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
2143    }
2144    private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
2145        return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
2146    }
2147    private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
2148        return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
2149            (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
2150    }
2151    private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
2152        return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
2153    }
2154    /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
2155    private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
2156        return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
2157            (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
2158    }
2159
2160    private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
2161        while(p>0) {
2162            int c=Character.codePointBefore(s, p);
2163            int norm16 = getNorm16(c);
2164            if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2165                break;
2166            }
2167            p-=Character.charCount(c);
2168            if(hasCompBoundaryBefore(c, norm16)) {
2169                break;
2170            }
2171        }
2172        return p;
2173    }
2174    private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
2175        while(p<limit) {
2176            int c=Character.codePointAt(s, p);
2177            int norm16=normTrie.get(c);
2178            if(hasCompBoundaryBefore(c, norm16)) {
2179                break;
2180            }
2181            p+=Character.charCount(c);
2182            if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2183                break;
2184            }
2185        }
2186        return p;
2187    }
2188
2189    private int findPreviousFCDBoundary(CharSequence s, int p) {
2190        while(p>0) {
2191            int c=Character.codePointBefore(s, p);
2192            int norm16;
2193            if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) {
2194                break;
2195            }
2196            p-=Character.charCount(c);
2197            if (norm16HasDecompBoundaryBefore(norm16)) {
2198                break;
2199            }
2200        }
2201        return p;
2202    }
2203    private int findNextFCDBoundary(CharSequence s, int p, int limit) {
2204        while(p<limit) {
2205            int c=Character.codePointAt(s, p);
2206            int norm16;
2207            if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
2208                break;
2209            }
2210            p+=Character.charCount(c);
2211            if (norm16HasDecompBoundaryAfter(norm16)) {
2212                break;
2213            }
2214        }
2215        return p;
2216    }
2217
2218    private int getPreviousTrailCC(CharSequence s, int start, int p) {
2219        if (start == p) {
2220            return 0;
2221        }
2222        return getFCD16(Character.codePointBefore(s, p));
2223    }
2224
2225    private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
2226        int canonValue=newData.get(decompLead);
2227        if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2228            // origin is the first character whose decomposition starts with
2229            // the character for which we are setting the value.
2230            newData.set(decompLead, canonValue|origin);
2231        } else {
2232            // origin is not the first character, or it is U+0000.
2233            UnicodeSet set;
2234            if((canonValue&CANON_HAS_SET)==0) {
2235                int firstOrigin=canonValue&CANON_VALUE_MASK;
2236                canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
2237                newData.set(decompLead, canonValue);
2238                canonStartSets.add(set=new UnicodeSet());
2239                if(firstOrigin!=0) {
2240                    set.add(firstOrigin);
2241                }
2242            } else {
2243                set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
2244            }
2245            set.add(origin);
2246        }
2247    }
2248
2249    @SuppressWarnings("unused")
2250    private VersionInfo dataVersion;
2251
2252    // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2253    private int minDecompNoCP;
2254    private int minCompNoMaybeCP;
2255    private int minLcccCP;
2256
2257    // Norm16 value thresholds for quick check combinations and types of extra data.
2258    private int minYesNo;
2259    private int minYesNoMappingsOnly;
2260    private int minNoNo;
2261    private int minNoNoCompBoundaryBefore;
2262    private int minNoNoCompNoMaybeCC;
2263    private int minNoNoEmpty;
2264    private int limitNoNo;
2265    private int centerNoNoDelta;
2266    private int minMaybeYes;
2267
2268    private Trie2_16 normTrie;
2269    private String maybeYesCompositions;
2270    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2271    private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2272
2273    private Trie2_32 canonIterData;
2274    private ArrayList<UnicodeSet> canonStartSets;
2275
2276    // bits in canonIterData
2277    private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
2278    private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
2279    private static final int CANON_HAS_SET = 0x200000;
2280    private static final int CANON_VALUE_MASK = 0x1fffff;
2281}
2282