1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 *   Copyright (C) 2009-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 */
9
10package com.ibm.icu.impl;
11
12import java.io.IOException;
13import java.nio.ByteBuffer;
14import java.util.ArrayList;
15import java.util.Iterator;
16
17import com.ibm.icu.text.UTF16;
18import com.ibm.icu.text.UnicodeSet;
19import com.ibm.icu.util.ICUUncheckedIOException;
20import com.ibm.icu.util.VersionInfo;
21
22public final class Normalizer2Impl {
23    public static final class Hangul {
24        /* Korean Hangul and Jamo constants */
25        public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
26        public static final int JAMO_L_END=0x1112;
27        public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
28        public static final int JAMO_V_END=0x1175;
29        public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
30        public static final int JAMO_T_END=0x11c2;
31
32        public static final int HANGUL_BASE=0xac00;
33        public static final int HANGUL_END=0xd7a3;
34
35        public static final int JAMO_L_COUNT=19;
36        public static final int JAMO_V_COUNT=21;
37        public static final int JAMO_T_COUNT=28;
38
39        public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
40        public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
41
42        public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
43
44        public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
45        public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
46
47        public static boolean isHangul(int c) {
48            return HANGUL_BASE<=c && c<HANGUL_LIMIT;
49        }
50        public static boolean isHangulWithoutJamoT(char c) {
51            c-=HANGUL_BASE;
52            return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
53        }
54        public static boolean isJamoL(int c) {
55            return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
56        }
57        public static boolean isJamoV(int c) {
58            return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
59        }
60
61        /**
62         * Decomposes c, which must be a Hangul syllable, into buffer
63         * and returns the length of the decomposition (2 or 3).
64         */
65        public static int decompose(int c, Appendable buffer) {
66            try {
67                c-=HANGUL_BASE;
68                int c2=c%JAMO_T_COUNT;
69                c/=JAMO_T_COUNT;
70                buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
71                buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
72                if(c2==0) {
73                    return 2;
74                } else {
75                    buffer.append((char)(JAMO_T_BASE+c2));
76                    return 3;
77                }
78            } catch(IOException e) {
79                // Will not occur because we do not write to I/O.
80                throw new ICUUncheckedIOException(e);
81            }
82        }
83
84        /**
85         * Decomposes c, which must be a Hangul syllable, into buffer.
86         * This is the raw, not recursive, decomposition. Its length is always 2.
87         */
88        public static void getRawDecomposition(int c, Appendable buffer) {
89            try {
90                int orig=c;
91                c-=HANGUL_BASE;
92                int c2=c%JAMO_T_COUNT;
93                if(c2==0) {
94                    c/=JAMO_T_COUNT;
95                    buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
96                    buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
97                } else {
98                    buffer.append((char)(orig-c2));  // LV syllable
99                    buffer.append((char)(JAMO_T_BASE+c2));
100                }
101            } catch(IOException e) {
102                // Will not occur because we do not write to I/O.
103                throw new ICUUncheckedIOException(e);
104            }
105        }
106    }
107
108    /**
109     * Writable buffer that takes care of canonical ordering.
110     * Its Appendable methods behave like the C++ implementation's
111     * appendZeroCC() methods.
112     * <p>
113     * If dest is a StringBuilder, then the buffer writes directly to it.
114     * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
115     * until no further changes are necessary and whole segments are appended.
116     * append() methods that take combining-class values always write to the StringBuilder.
117     * Other append() methods flush and append to the Appendable.
118     */
119    public static final class ReorderingBuffer implements Appendable {
120        public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
121            impl=ni;
122            app=dest;
123            if(app instanceof StringBuilder) {
124                appIsStringBuilder=true;
125                str=(StringBuilder)dest;
126                // In Java, the constructor subsumes public void init(int destCapacity) {
127                str.ensureCapacity(destCapacity);
128                reorderStart=0;
129                if(str.length()==0) {
130                    lastCC=0;
131                } else {
132                    setIterator();
133                    lastCC=previousCC();
134                    // Set reorderStart after the last code point with cc<=1 if there is one.
135                    if(lastCC>1) {
136                        while(previousCC()>1) {}
137                    }
138                    reorderStart=codePointLimit;
139                }
140            } else {
141                appIsStringBuilder=false;
142                str=new StringBuilder();
143                reorderStart=0;
144                lastCC=0;
145            }
146        }
147
148        public boolean isEmpty() { return str.length()==0; }
149        public int length() { return str.length(); }
150        public int getLastCC() { return lastCC; }
151
152        public StringBuilder getStringBuilder() { return str; }
153
154        public boolean equals(CharSequence s, int start, int limit) {
155            return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
156        }
157
158        // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
159        public void setLastChar(char c) {
160            str.setCharAt(str.length()-1, c);
161        }
162
163        public void append(int c, int cc) {
164            if(lastCC<=cc || cc==0) {
165                str.appendCodePoint(c);
166                lastCC=cc;
167                if(cc<=1) {
168                    reorderStart=str.length();
169                }
170            } else {
171                insert(c, cc);
172            }
173        }
174        // s must be in NFD, otherwise change the implementation.
175        public void append(CharSequence s, int start, int limit,
176                           int leadCC, int trailCC) {
177            if(start==limit) {
178                return;
179            }
180            if(lastCC<=leadCC || leadCC==0) {
181                if(trailCC<=1) {
182                    reorderStart=str.length()+(limit-start);
183                } else if(leadCC<=1) {
184                    reorderStart=str.length()+1;  // Ok if not a code point boundary.
185                }
186                str.append(s, start, limit);
187                lastCC=trailCC;
188            } else {
189                int c=Character.codePointAt(s, start);
190                start+=Character.charCount(c);
191                insert(c, leadCC);  // insert first code point
192                while(start<limit) {
193                    c=Character.codePointAt(s, start);
194                    start+=Character.charCount(c);
195                    if(start<limit) {
196                        // s must be in NFD, otherwise we need to use getCC().
197                        leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
198                    } else {
199                        leadCC=trailCC;
200                    }
201                    append(c, leadCC);
202                }
203            }
204        }
205        // The following append() methods work like C++ appendZeroCC().
206        // They assume that the cc or trailCC of their input is 0.
207        // Most of them implement Appendable interface methods.
208        // @Override when we switch to Java 6
209        @Override
210        public ReorderingBuffer append(char c) {
211            str.append(c);
212            lastCC=0;
213            reorderStart=str.length();
214            return this;
215        }
216        public void appendZeroCC(int c) {
217            str.appendCodePoint(c);
218            lastCC=0;
219            reorderStart=str.length();
220        }
221        // @Override when we switch to Java 6
222        @Override
223        public ReorderingBuffer append(CharSequence s) {
224            if(s.length()!=0) {
225                str.append(s);
226                lastCC=0;
227                reorderStart=str.length();
228            }
229            return this;
230        }
231        // @Override when we switch to Java 6
232        @Override
233        public ReorderingBuffer append(CharSequence s, int start, int limit) {
234            if(start!=limit) {
235                str.append(s, start, limit);
236                lastCC=0;
237                reorderStart=str.length();
238            }
239            return this;
240        }
241        /**
242         * Flushes from the intermediate StringBuilder to the Appendable,
243         * if they are different objects.
244         * Used after recomposition.
245         * Must be called at the end when writing to a non-StringBuilder Appendable.
246         */
247        public void flush() {
248            if(appIsStringBuilder) {
249                reorderStart=str.length();
250            } else {
251                try {
252                    app.append(str);
253                    str.setLength(0);
254                    reorderStart=0;
255                } catch(IOException e) {
256                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
257                }
258            }
259            lastCC=0;
260        }
261        /**
262         * Flushes from the intermediate StringBuilder to the Appendable,
263         * if they are different objects.
264         * Then appends the new text to the Appendable or StringBuilder.
265         * Normally used after quick check loops find a non-empty sequence.
266         */
267        public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
268            if(appIsStringBuilder) {
269                str.append(s, start, limit);
270                reorderStart=str.length();
271            } else {
272                try {
273                    app.append(str).append(s, start, limit);
274                    str.setLength(0);
275                    reorderStart=0;
276                } catch(IOException e) {
277                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
278                }
279            }
280            lastCC=0;
281            return this;
282        }
283        public void remove() {
284            str.setLength(0);
285            lastCC=0;
286            reorderStart=0;
287        }
288        public void removeSuffix(int suffixLength) {
289            int oldLength=str.length();
290            str.delete(oldLength-suffixLength, oldLength);
291            lastCC=0;
292            reorderStart=str.length();
293        }
294
295        /*
296         * TODO: Revisit whether it makes sense to track reorderStart.
297         * It is set to after the last known character with cc<=1,
298         * which stops previousCC() before it reads that character and looks up its cc.
299         * previousCC() is normally only called from insert().
300         * In other words, reorderStart speeds up the insertion of a combining mark
301         * into a multi-combining mark sequence where it does not belong at the end.
302         * This might not be worth the trouble.
303         * On the other hand, it's not a huge amount of trouble.
304         *
305         * We probably need it for UNORM_SIMPLE_APPEND.
306         */
307
308        // Inserts c somewhere before the last character.
309        // Requires 0<cc<lastCC which implies reorderStart<limit.
310        private void insert(int c, int cc) {
311            for(setIterator(), skipPrevious(); previousCC()>cc;) {}
312            // insert c at codePointLimit, after the character with prevCC<=cc
313            if(c<=0xffff) {
314                str.insert(codePointLimit, (char)c);
315                if(cc<=1) {
316                    reorderStart=codePointLimit+1;
317                }
318            } else {
319                str.insert(codePointLimit, Character.toChars(c));
320                if(cc<=1) {
321                    reorderStart=codePointLimit+2;
322                }
323            }
324        }
325
326        private final Normalizer2Impl impl;
327        private final Appendable app;
328        private final StringBuilder str;
329        private final boolean appIsStringBuilder;
330        private int reorderStart;
331        private int lastCC;
332
333        // private backward iterator
334        private void setIterator() { codePointStart=str.length(); }
335        private void skipPrevious() {  // Requires 0<codePointStart.
336            codePointLimit=codePointStart;
337            codePointStart=str.offsetByCodePoints(codePointStart, -1);
338        }
339        private int previousCC() {  // Returns 0 if there is no previous character.
340            codePointLimit=codePointStart;
341            if(reorderStart>=codePointStart) {
342                return 0;
343            }
344            int c=str.codePointBefore(codePointStart);
345            codePointStart-=Character.charCount(c);
346            if(c<MIN_CCC_LCCC_CP) {
347                return 0;
348            }
349            return getCCFromYesOrMaybe(impl.getNorm16(c));
350        }
351
352        private int codePointStart, codePointLimit;
353    }
354
355    // TODO: Propose as public API on the UTF16 class.
356    // TODO: Propose widening UTF16 methods that take char to take int.
357    // TODO: Propose widening UTF16 methods that take String to take CharSequence.
358    public static final class UTF16Plus {
359        /**
360         * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
361         * is it a lead surrogate?
362         * @param c code unit or code point
363         * @return true or false
364         */
365        public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
366        /**
367         * Compares two CharSequence objects for binary equality.
368         * @param s1 first sequence
369         * @param s2 second sequence
370         * @return true if s1 contains the same text as s2
371         */
372        public static boolean equal(CharSequence s1,  CharSequence s2) {
373            if(s1==s2) {
374                return true;
375            }
376            int length=s1.length();
377            if(length!=s2.length()) {
378                return false;
379            }
380            for(int i=0; i<length; ++i) {
381                if(s1.charAt(i)!=s2.charAt(i)) {
382                    return false;
383                }
384            }
385            return true;
386        }
387        /**
388         * Compares two CharSequence subsequences for binary equality.
389         * @param s1 first sequence
390         * @param start1 start offset in first sequence
391         * @param limit1 limit offset in first sequence
392         * @param s2 second sequence
393         * @param start2 start offset in second sequence
394         * @param limit2 limit offset in second sequence
395         * @return true if s1.subSequence(start1, limit1) contains the same text
396         *              as s2.subSequence(start2, limit2)
397         */
398        public static boolean equal(CharSequence s1, int start1, int limit1,
399                                    CharSequence s2, int start2, int limit2) {
400            if((limit1-start1)!=(limit2-start2)) {
401                return false;
402            }
403            if(s1==s2 && start1==start2) {
404                return true;
405            }
406            while(start1<limit1) {
407                if(s1.charAt(start1++)!=s2.charAt(start2++)) {
408                    return false;
409                }
410            }
411            return true;
412        }
413    }
414
415    public Normalizer2Impl() {}
416
417    private static final class IsAcceptable implements ICUBinary.Authenticate {
418        // @Override when we switch to Java 6
419        @Override
420        public boolean isDataVersionAcceptable(byte version[]) {
421            return version[0]==2;
422        }
423    }
424    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
425    private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
426
427    public Normalizer2Impl load(ByteBuffer bytes) {
428        try {
429            dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
430            int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
431            if(indexesLength<=IX_MIN_MAYBE_YES) {
432                throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
433            }
434            int[] inIndexes=new int[indexesLength];
435            inIndexes[0]=indexesLength*4;
436            for(int i=1; i<indexesLength; ++i) {
437                inIndexes[i]=bytes.getInt();
438            }
439
440            minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
441            minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
442
443            minYesNo=inIndexes[IX_MIN_YES_NO];
444            minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
445            minNoNo=inIndexes[IX_MIN_NO_NO];
446            limitNoNo=inIndexes[IX_LIMIT_NO_NO];
447            minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
448
449            // Read the normTrie.
450            int offset=inIndexes[IX_NORM_TRIE_OFFSET];
451            int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
452            normTrie=Trie2_16.createFromSerialized(bytes);
453            int trieLength=normTrie.getSerializedLength();
454            if(trieLength>(nextOffset-offset)) {
455                throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
456            }
457            ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
458
459            // Read the composition and mapping data.
460            offset=nextOffset;
461            nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
462            int numChars=(nextOffset-offset)/2;
463            if(numChars!=0) {
464                maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
465                extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
466            }
467
468            // smallFCD: new in formatVersion 2
469            offset=nextOffset;
470            smallFCD=new byte[0x100];
471            bytes.get(smallFCD);
472
473            // Build tccc180[].
474            // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
475            tccc180=new int[0x180];
476            int bits=0;
477            for(int c=0; c<0x180; bits>>=1) {
478                if((c&0xff)==0) {
479                    bits=smallFCD[c>>8];  // one byte per 0x100 code points
480                }
481                if((bits&1)!=0) {
482                    for(int i=0; i<0x20; ++i, ++c) {
483                        tccc180[c]=getFCD16FromNormData(c)&0xff;
484                    }
485                } else {
486                    c+=0x20;
487                }
488            }
489
490            return this;
491        } catch(IOException e) {
492            throw new ICUUncheckedIOException(e);
493        }
494    }
495    public Normalizer2Impl load(String name) {
496        return load(ICUBinary.getRequiredData(name));
497    }
498
499    private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
500        if(isAlgorithmicNoNo(norm16)) {
501            // Range of code points with same-norm16-value algorithmic decompositions.
502            // They might have different non-zero FCD16 values.
503            do {
504                int fcd16=getFCD16(start);
505                if(fcd16>0xff) { set.add(start); }
506            } while(++start<=end);
507        } else {
508            int fcd16=getFCD16(start);
509            if(fcd16>0xff) { set.add(start, end); }
510        }
511    }
512
513    private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
514        /* add the start code point to the USet */
515        set.add(start);
516        if(start!=end && isAlgorithmicNoNo(value)) {
517            // Range of code points with same-norm16-value algorithmic decompositions.
518            // They might have different non-zero FCD16 values.
519            int prevFCD16=getFCD16(start);
520            while(++start<=end) {
521                int fcd16=getFCD16(start);
522                if(fcd16!=prevFCD16) {
523                    set.add(start);
524                    prevFCD16=fcd16;
525                }
526            }
527        }
528    }
529
530    public void addLcccChars(UnicodeSet set) {
531        /* add the start code point of each same-value range of each trie */
532        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
533        Trie2.Range range;
534        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
535            enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
536        }
537    }
538
539    public void addPropertyStarts(UnicodeSet set) {
540        /* add the start code point of each same-value range of each trie */
541        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
542        Trie2.Range range;
543        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
544            enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
545        }
546
547        /* add Hangul LV syllables and LV+1 because of skippables */
548        for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
549            set.add(c);
550            set.add(c+1);
551        }
552        set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
553    }
554
555    public void addCanonIterPropertyStarts(UnicodeSet set) {
556        /* add the start code point of each same-value range of the canonical iterator data trie */
557        ensureCanonIterData();
558        // currently only used for the SEGMENT_STARTER property
559        Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
560        Trie2.Range range;
561        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
562            /* add the start code point to the USet */
563            set.add(range.startCodePoint);
564        }
565    }
566    private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
567        @Override
568        public int map(int in) {
569            return in&CANON_NOT_SEGMENT_STARTER;
570        }
571    };
572
573    // low-level properties ------------------------------------------------ ***
574
575    public Trie2_16 getNormTrie() { return normTrie; }
576
577    // Note: Normalizer2Impl.java r30983 (2011-nov-27)
578    // still had getFCDTrie() which built and cached an FCD trie.
579    // That provided faster access to FCD data than getFCD16FromNormData()
580    // but required synchronization and consumed some 10kB of heap memory
581    // in any process that uses FCD (e.g., via collation).
582    // tccc180[] and smallFCD[] are intended to help with any loss of performance,
583    // at least for Latin & CJK.
584
585    /**
586     * Builds the canonical-iterator data for this instance.
587     * This is required before any of {@link #isCanonSegmentStarter(int)} or
588     * {@link #getCanonStartSet(int, UnicodeSet)} are called,
589     * or else they crash.
590     * @return this
591     */
592    public synchronized Normalizer2Impl ensureCanonIterData() {
593        if(canonIterData==null) {
594            Trie2Writable newData=new Trie2Writable(0, 0);
595            canonStartSets=new ArrayList<UnicodeSet>();
596            Iterator<Trie2.Range> trieIterator=normTrie.iterator();
597            Trie2.Range range;
598            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
599                final int norm16=range.value;
600                if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
601                    // Inert, or 2-way mapping (including Hangul syllable).
602                    // We do not write a canonStartSet for any yesNo character.
603                    // Composites from 2-way mappings are added at runtime from the
604                    // starter's compositions list, and the other characters in
605                    // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
606                    // "maybe" characters.
607                    continue;
608                }
609                for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
610                    final int oldValue=newData.get(c);
611                    int newValue=oldValue;
612                    if(norm16>=minMaybeYes) {
613                        // not a segment starter if it occurs in a decomposition or has cc!=0
614                        newValue|=CANON_NOT_SEGMENT_STARTER;
615                        if(norm16<MIN_NORMAL_MAYBE_YES) {
616                            newValue|=CANON_HAS_COMPOSITIONS;
617                        }
618                    } else if(norm16<minYesNo) {
619                        newValue|=CANON_HAS_COMPOSITIONS;
620                    } else {
621                        // c has a one-way decomposition
622                        int c2=c;
623                        int norm16_2=norm16;
624                        while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
625                            c2=this.mapAlgorithmic(c2, norm16_2);
626                            norm16_2=getNorm16(c2);
627                        }
628                        if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
629                            // c decomposes, get everything from the variable-length extra data
630                            int firstUnit=extraData.charAt(norm16_2);
631                            int length=firstUnit&MAPPING_LENGTH_MASK;
632                            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
633                                if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) {
634                                    newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
635                                }
636                            }
637                            // Skip empty mappings (no characters in the decomposition).
638                            if(length!=0) {
639                                ++norm16_2;  // skip over the firstUnit
640                                // add c to first code point's start set
641                                int limit=norm16_2+length;
642                                c2=extraData.codePointAt(norm16_2);
643                                addToStartSet(newData, c, c2);
644                                // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
645                                // one-way mapping. A 2-way mapping is possible here after
646                                // intermediate algorithmic mapping.
647                                if(norm16_2>=minNoNo) {
648                                    while((norm16_2+=Character.charCount(c2))<limit) {
649                                        c2=extraData.codePointAt(norm16_2);
650                                        int c2Value=newData.get(c2);
651                                        if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
652                                            newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
653                                        }
654                                    }
655                                }
656                            }
657                        } else {
658                            // c decomposed to c2 algorithmically; c has cc==0
659                            addToStartSet(newData, c, c2);
660                        }
661                    }
662                    if(newValue!=oldValue) {
663                        newData.set(c, newValue);
664                    }
665                }
666            }
667            canonIterData=newData.toTrie2_32();
668        }
669        return this;
670    }
671
672    public int getNorm16(int c) { return normTrie.get(c); }
673
674    public int getCompQuickCheck(int norm16) {
675        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
676            return 1;  // yes
677        } else if(minMaybeYes<=norm16) {
678            return 2;  // maybe
679        } else {
680            return 0;  // no
681        }
682    }
683    public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
684    public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
685    public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
686
687    public int getCC(int norm16) {
688        if(norm16>=MIN_NORMAL_MAYBE_YES) {
689            return norm16&0xff;
690        }
691        if(norm16<minNoNo || limitNoNo<=norm16) {
692            return 0;
693        }
694        return getCCFromNoNo(norm16);
695    }
696    public static int getCCFromYesOrMaybe(int norm16) {
697        return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
698    }
699
700    /**
701     * Returns the FCD data for code point c.
702     * @param c A Unicode code point.
703     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
704     */
705    public int getFCD16(int c) {
706        if(c<0) {
707            return 0;
708        } else if(c<0x180) {
709            return tccc180[c];
710        } else if(c<=0xffff) {
711            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
712        }
713        return getFCD16FromNormData(c);
714    }
715    /** Returns the FCD data for U+0000<=c<U+0180. */
716    public int getFCD16FromBelow180(int c) { return tccc180[c]; }
717    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
718    public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
719        // 0<=lead<=0xffff
720        byte bits=smallFCD[lead>>8];
721        if(bits==0) { return false; }
722        return ((bits>>((lead>>5)&7))&1)!=0;
723    }
724
725    /** Gets the FCD value from the regular normalization data. */
726    public int getFCD16FromNormData(int c) {
727        // Only loops for 1:1 algorithmic mappings.
728        for(;;) {
729            int norm16=getNorm16(c);
730            if(norm16<=minYesNo) {
731                // no decomposition or Hangul syllable, all zeros
732                return 0;
733            } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
734                // combining mark
735                norm16&=0xff;
736                return norm16|(norm16<<8);
737            } else if(norm16>=minMaybeYes) {
738                return 0;
739            } else if(isDecompNoAlgorithmic(norm16)) {
740                c=mapAlgorithmic(c, norm16);
741            } else {
742                // c decomposes, get everything from the variable-length extra data
743                int firstUnit=extraData.charAt(norm16);
744                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
745                    // A character that is deleted (maps to an empty string) must
746                    // get the worst-case lccc and tccc values because arbitrary
747                    // characters on both sides will become adjacent.
748                    return 0x1ff;
749                } else {
750                    int fcd16=firstUnit>>8;  // tccc
751                    if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
752                        fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
753                    }
754                    return fcd16;
755                }
756            }
757        }
758    }
759
760    /**
761     * Gets the decomposition for one code point.
762     * @param c code point
763     * @return c's decomposition, if it has one; returns null if it does not have a decomposition
764     */
765    public String getDecomposition(int c) {
766        int decomp=-1;
767        int norm16;
768        for(;;) {
769            if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
770                // c does not decompose
771            } else if(isHangul(norm16)) {
772                // Hangul syllable: decompose algorithmically
773                StringBuilder buffer=new StringBuilder();
774                Hangul.decompose(c, buffer);
775                return buffer.toString();
776            } else if(isDecompNoAlgorithmic(norm16)) {
777                decomp=c=mapAlgorithmic(c, norm16);
778                continue;
779            } else {
780                // c decomposes, get everything from the variable-length extra data
781                int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
782                return extraData.substring(norm16, norm16+length);
783            }
784            if(decomp<0) {
785                return null;
786            } else {
787                return UTF16.valueOf(decomp);
788            }
789        }
790    }
791
792    /**
793     * Gets the raw decomposition for one code point.
794     * @param c code point
795     * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
796     */
797    public String getRawDecomposition(int c) {
798        // We do not loop in this method because an algorithmic mapping itself
799        // becomes a final result rather than having to be decomposed recursively.
800        int norm16;
801        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
802            // c does not decompose
803            return null;
804        } else if(isHangul(norm16)) {
805            // Hangul syllable: decompose algorithmically
806            StringBuilder buffer=new StringBuilder();
807            Hangul.getRawDecomposition(c, buffer);
808            return buffer.toString();
809        } else if(isDecompNoAlgorithmic(norm16)) {
810            return UTF16.valueOf(mapAlgorithmic(c, norm16));
811        } else {
812            // c decomposes, get everything from the variable-length extra data
813            int firstUnit=extraData.charAt(norm16);
814            int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
815            if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
816                // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
817                // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
818                int rawMapping=norm16-((firstUnit>>7)&1)-1;
819                char rm0=extraData.charAt(rawMapping);
820                if(rm0<=MAPPING_LENGTH_MASK) {
821                    return extraData.substring(rawMapping-rm0, rawMapping);
822                } else {
823                    // Copy the normal mapping and replace its first two code units with rm0.
824                    StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
825                    norm16+=1+2;  // skip over the firstUnit and the first two mapping code units
826                    return buffer.append(extraData, norm16, norm16+mLength-2).toString();
827                }
828            } else {
829                norm16+=1;  // skip over the firstUnit
830                return extraData.substring(norm16, norm16+mLength);
831            }
832        }
833    }
834
835    /**
836     * Returns true if code point c starts a canonical-iterator string segment.
837     * <b>{@link #ensureCanonIterData()} must have been called before this method,
838     * or else this method will crash.</b>
839     * @param c A Unicode code point.
840     * @return true if c starts a canonical-iterator string segment.
841     */
842    public boolean isCanonSegmentStarter(int c) {
843        return canonIterData.get(c)>=0;
844    }
845    /**
846     * Returns true if there are characters whose decomposition starts with c.
847     * If so, then the set is cleared and then filled with those characters.
848     * <b>{@link #ensureCanonIterData()} must have been called before this method,
849     * or else this method will crash.</b>
850     * @param c A Unicode code point.
851     * @param set A UnicodeSet to receive the characters whose decompositions
852     *        start with c, if there are any.
853     * @return true if there are characters whose decomposition starts with c.
854     */
855    public boolean getCanonStartSet(int c, UnicodeSet set) {
856        int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
857        if(canonValue==0) {
858            return false;
859        }
860        set.clear();
861        int value=canonValue&CANON_VALUE_MASK;
862        if((canonValue&CANON_HAS_SET)!=0) {
863            set.addAll(canonStartSets.get(value));
864        } else if(value!=0) {
865            set.add(value);
866        }
867        if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
868            int norm16=getNorm16(c);
869            if(norm16==JAMO_L) {
870                int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
871                set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
872            } else {
873                addComposites(getCompositionsList(norm16), set);
874            }
875        }
876        return true;
877    }
878
879    public static final int MIN_CCC_LCCC_CP=0x300;
880
881    public static final int MIN_YES_YES_WITH_CC=0xff01;
882    public static final int JAMO_VT=0xff00;
883    public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
884    public static final int JAMO_L=1;
885    public static final int MAX_DELTA=0x40;
886
887    // Byte offsets from the start of the data, after the generic header.
888    public static final int IX_NORM_TRIE_OFFSET=0;
889    public static final int IX_EXTRA_DATA_OFFSET=1;
890    public static final int IX_SMALL_FCD_OFFSET=2;
891    public static final int IX_RESERVED3_OFFSET=3;
892    public static final int IX_TOTAL_SIZE=7;
893
894    // Code point thresholds for quick check codes.
895    public static final int IX_MIN_DECOMP_NO_CP=8;
896    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
897
898    // Norm16 value thresholds for quick check combinations and types of extra data.
899    // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
900    public static final int IX_MIN_YES_NO=10;
901    public static final int IX_MIN_NO_NO=11;
902    public static final int IX_LIMIT_NO_NO=12;
903    public static final int IX_MIN_MAYBE_YES=13;
904
905    // Mappings only in [minYesNoMappingsOnly..minNoNo[.
906    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
907
908    public static final int IX_COUNT=16;
909
910    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
911    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
912    public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
913    public static final int MAPPING_LENGTH_MASK=0x1f;
914
915    public static final int COMP_1_LAST_TUPLE=0x8000;
916    public static final int COMP_1_TRIPLE=1;
917    public static final int COMP_1_TRAIL_LIMIT=0x3400;
918    public static final int COMP_1_TRAIL_MASK=0x7ffe;
919    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
920    public static final int COMP_2_TRAIL_SHIFT=6;
921    public static final int COMP_2_TRAIL_MASK=0xffc0;
922
923    // higher-level functionality ------------------------------------------ ***
924
925    // NFD without an NFD Normalizer2 instance.
926    public Appendable decompose(CharSequence s, StringBuilder dest) {
927        decompose(s, 0, s.length(), dest, s.length());
928        return dest;
929    }
930    /**
931     * Decomposes s[src, limit[ and writes the result to dest.
932     * limit can be NULL if src is NUL-terminated.
933     * destLengthEstimate is the initial dest buffer capacity and can be -1.
934     */
935    public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
936                   int destLengthEstimate) {
937        if(destLengthEstimate<0) {
938            destLengthEstimate=limit-src;
939        }
940        dest.setLength(0);
941        ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
942        decompose(s, src, limit, buffer);
943    }
944
945    // Dual functionality:
946    // buffer!=NULL: normalize
947    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
948    public int decompose(CharSequence s, int src, int limit,
949                         ReorderingBuffer buffer) {
950        int minNoCP=minDecompNoCP;
951
952        int prevSrc;
953        int c=0;
954        int norm16=0;
955
956        // only for quick check
957        int prevBoundary=src;
958        int prevCC=0;
959
960        for(;;) {
961            // count code units below the minimum or with irrelevant data for the quick check
962            for(prevSrc=src; src!=limit;) {
963                if( (c=s.charAt(src))<minNoCP ||
964                    isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
965                ) {
966                    ++src;
967                } else if(!UTF16.isSurrogate((char)c)) {
968                    break;
969                } else {
970                    char c2;
971                    if(UTF16Plus.isSurrogateLead(c)) {
972                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
973                            c=Character.toCodePoint((char)c, c2);
974                        }
975                    } else /* trail surrogate */ {
976                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
977                            --src;
978                            c=Character.toCodePoint(c2, (char)c);
979                        }
980                    }
981                    if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
982                        src+=Character.charCount(c);
983                    } else {
984                        break;
985                    }
986                }
987            }
988            // copy these code units all at once
989            if(src!=prevSrc) {
990                if(buffer!=null) {
991                    buffer.flushAndAppendZeroCC(s, prevSrc, src);
992                } else {
993                    prevCC=0;
994                    prevBoundary=src;
995                }
996            }
997            if(src==limit) {
998                break;
999            }
1000
1001            // Check one above-minimum, relevant code point.
1002            src+=Character.charCount(c);
1003            if(buffer!=null) {
1004                decompose(c, norm16, buffer);
1005            } else {
1006                if(isDecompYes(norm16)) {
1007                    int cc=getCCFromYesOrMaybe(norm16);
1008                    if(prevCC<=cc || cc==0) {
1009                        prevCC=cc;
1010                        if(cc<=1) {
1011                            prevBoundary=src;
1012                        }
1013                        continue;
1014                    }
1015                }
1016                return prevBoundary;  // "no" or cc out of order
1017            }
1018        }
1019        return src;
1020    }
1021    public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
1022        int limit=s.length();
1023        if(limit==0) {
1024            return;
1025        }
1026        if(doDecompose) {
1027            decompose(s, 0, limit, buffer);
1028            return;
1029        }
1030        // Just merge the strings at the boundary.
1031        int c=Character.codePointAt(s, 0);
1032        int src=0;
1033        int firstCC, prevCC, cc;
1034        firstCC=prevCC=cc=getCC(getNorm16(c));
1035        while(cc!=0) {
1036            prevCC=cc;
1037            src+=Character.charCount(c);
1038            if(src>=limit) {
1039                break;
1040            }
1041            c=Character.codePointAt(s, src);
1042            cc=getCC(getNorm16(c));
1043        };
1044        buffer.append(s, 0, src, firstCC, prevCC);
1045        buffer.append(s, src, limit);
1046    }
1047    // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1048    // doCompose: normalize
1049    // !doCompose: isNormalized (buffer must be empty and initialized)
1050    public boolean compose(CharSequence s, int src, int limit,
1051                           boolean onlyContiguous,
1052                           boolean doCompose,
1053                           ReorderingBuffer buffer) {
1054        int minNoMaybeCP=minCompNoMaybeCP;
1055
1056        /*
1057         * prevBoundary points to the last character before the current one
1058         * that has a composition boundary before it with ccc==0 and quick check "yes".
1059         * Keeping track of prevBoundary saves us looking for a composition boundary
1060         * when we find a "no" or "maybe".
1061         *
1062         * When we back out from prevSrc back to prevBoundary,
1063         * then we also remove those same characters (which had been simply copied
1064         * or canonically-order-inserted) from the ReorderingBuffer.
1065         * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1066         * must correspond 1:1 to destination units at the end of the destination buffer.
1067         */
1068        int prevBoundary=src;
1069        int prevSrc;
1070        int c=0;
1071        int norm16=0;
1072
1073        // only for isNormalized
1074        int prevCC=0;
1075
1076        for(;;) {
1077            // count code units below the minimum or with irrelevant data for the quick check
1078            for(prevSrc=src; src!=limit;) {
1079                if( (c=s.charAt(src))<minNoMaybeCP ||
1080                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1081                ) {
1082                    ++src;
1083                } else if(!UTF16.isSurrogate((char)c)) {
1084                    break;
1085                } else {
1086                    char c2;
1087                    if(UTF16Plus.isSurrogateLead(c)) {
1088                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1089                            c=Character.toCodePoint((char)c, c2);
1090                        }
1091                    } else /* trail surrogate */ {
1092                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1093                            --src;
1094                            c=Character.toCodePoint(c2, (char)c);
1095                        }
1096                    }
1097                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1098                        src+=Character.charCount(c);
1099                    } else {
1100                        break;
1101                    }
1102                }
1103            }
1104            // copy these code units all at once
1105            if(src!=prevSrc) {
1106                if(src==limit) {
1107                    if(doCompose) {
1108                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
1109                    }
1110                    break;
1111                }
1112                // Set prevBoundary to the last character in the quick check loop.
1113                prevBoundary=src-1;
1114                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1115                    Character.isHighSurrogate(s.charAt(prevBoundary-1))
1116                ) {
1117                    --prevBoundary;
1118                }
1119                if(doCompose) {
1120                    // The last "quick check yes" character is excluded from the
1121                    // flush-and-append call in case it needs to be modified.
1122                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1123                    buffer.append(s, prevBoundary, src);
1124                } else {
1125                    prevCC=0;
1126                }
1127                // The start of the current character (c).
1128                prevSrc=src;
1129            } else if(src==limit) {
1130                break;
1131            }
1132
1133            src+=Character.charCount(c);
1134            /*
1135             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1136             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1137             * or has ccc!=0.
1138             * Check for Jamo V/T, then for regular characters.
1139             * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1140             */
1141            if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1142                char prev=s.charAt(prevSrc-1);
1143                boolean needToDecompose=false;
1144                if(c<Hangul.JAMO_T_BASE) {
1145                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1146                    prev-=Hangul.JAMO_L_BASE;
1147                    if(prev<Hangul.JAMO_L_COUNT) {
1148                        if(!doCompose) {
1149                            return false;
1150                        }
1151                        char syllable=(char)
1152                            (Hangul.HANGUL_BASE+
1153                             (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1154                             Hangul.JAMO_T_COUNT);
1155                        char t;
1156                        if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1157                            ++src;
1158                            syllable+=t;  // The next character was a Jamo T.
1159                            prevBoundary=src;
1160                            buffer.setLastChar(syllable);
1161                            continue;
1162                        }
1163                        // If we see L+V+x where x!=T then we drop to the slow path,
1164                        // decompose and recompose.
1165                        // This is to deal with NFKC finding normal L and V but a
1166                        // compatibility variant of a T. We need to either fully compose that
1167                        // combination here (which would complicate the code and may not work
1168                        // with strange custom data) or use the slow path -- or else our replacing
1169                        // two input characters (L+V) with one output character (LV syllable)
1170                        // would violate the invariant that [prevBoundary..prevSrc[ has the same
1171                        // length as what we appended to the buffer since prevBoundary.
1172                        needToDecompose=true;
1173                    }
1174                } else if(Hangul.isHangulWithoutJamoT(prev)) {
1175                    // c is a Jamo Trailing consonant,
1176                    // compose with previous Hangul LV that does not contain a Jamo T.
1177                    if(!doCompose) {
1178                        return false;
1179                    }
1180                    buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
1181                    prevBoundary=src;
1182                    continue;
1183                }
1184                if(!needToDecompose) {
1185                    // The Jamo V/T did not compose into a Hangul syllable.
1186                    if(doCompose) {
1187                        buffer.append((char)c);
1188                    } else {
1189                        prevCC=0;
1190                    }
1191                    continue;
1192                }
1193            }
1194            /*
1195             * Source buffer pointers:
1196             *
1197             *  all done      quick check   current char  not yet
1198             *                "yes" but     (c)           processed
1199             *                may combine
1200             *                forward
1201             * [-------------[-------------[-------------[-------------[
1202             * |             |             |             |             |
1203             * orig. src     prevBoundary  prevSrc       src           limit
1204             *
1205             *
1206             * Destination buffer pointers inside the ReorderingBuffer:
1207             *
1208             *  all done      might take    not filled yet
1209             *                characters for
1210             *                reordering
1211             * [-------------[-------------[-------------[
1212             * |             |             |             |
1213             * start         reorderStart  limit         |
1214             *                             +remainingCap.+
1215             */
1216            if(norm16>=MIN_YES_YES_WITH_CC) {
1217                int cc=norm16&0xff;  // cc!=0
1218                if( onlyContiguous &&  // FCC
1219                    (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1220                    prevBoundary<prevSrc &&
1221                    // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1222                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1223                    // passed the quick check "yes && ccc==0" test.
1224                    // Check whether the last character was a "yesYes" or a "yesNo".
1225                    // If a "yesNo", then we get its trailing ccc from its
1226                    // mapping and check for canonical order.
1227                    // All other cases are ok.
1228                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1229                ) {
1230                    // Fails FCD test, need to decompose and contiguously recompose.
1231                    if(!doCompose) {
1232                        return false;
1233                    }
1234                } else if(doCompose) {
1235                    buffer.append(c, cc);
1236                    continue;
1237                } else if(prevCC<=cc) {
1238                    prevCC=cc;
1239                    continue;
1240                } else {
1241                    return false;
1242                }
1243            } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1244                return false;
1245            }
1246
1247            /*
1248             * Find appropriate boundaries around this character,
1249             * decompose the source text from between the boundaries,
1250             * and recompose it.
1251             *
1252             * We may need to remove the last few characters from the ReorderingBuffer
1253             * to account for source text that was copied or appended
1254             * but needs to take part in the recomposition.
1255             */
1256
1257            /*
1258             * Find the last composition boundary in [prevBoundary..src[.
1259             * It is either the decomposition of the current character (at prevSrc),
1260             * or prevBoundary.
1261             */
1262            if(hasCompBoundaryBefore(c, norm16)) {
1263                prevBoundary=prevSrc;
1264            } else if(doCompose) {
1265                buffer.removeSuffix(prevSrc-prevBoundary);
1266            }
1267
1268            // Find the next composition boundary in [src..limit[ -
1269            // modifies src to point to the next starter.
1270            src=findNextCompBoundary(s, src, limit);
1271
1272            // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1273            int recomposeStartIndex=buffer.length();
1274            decomposeShort(s, prevBoundary, src, buffer);
1275            recompose(buffer, recomposeStartIndex, onlyContiguous);
1276            if(!doCompose) {
1277                if(!buffer.equals(s, prevBoundary, src)) {
1278                    return false;
1279                }
1280                buffer.remove();
1281                prevCC=0;
1282            }
1283
1284            // Move to the next starter. We never need to look back before this point again.
1285            prevBoundary=src;
1286        }
1287        return true;
1288    }
1289    /**
1290     * Very similar to compose(): Make the same changes in both places if relevant.
1291     * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
1292     * !doSpan: quickCheck
1293     * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
1294     *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
1295     *         then the quick check result is "no"
1296     */
1297    public int composeQuickCheck(CharSequence s, int src, int limit,
1298                                 boolean onlyContiguous, boolean doSpan) {
1299        int qcResult=0;
1300        int minNoMaybeCP=minCompNoMaybeCP;
1301
1302        /*
1303         * prevBoundary points to the last character before the current one
1304         * that has a composition boundary before it with ccc==0 and quick check "yes".
1305         */
1306        int prevBoundary=src;
1307        int prevSrc;
1308        int c=0;
1309        int norm16=0;
1310        int prevCC=0;
1311
1312        for(;;) {
1313            // count code units below the minimum or with irrelevant data for the quick check
1314            for(prevSrc=src;;) {
1315                if(src==limit) {
1316                    return (src<<1)|qcResult;  // "yes" or "maybe"
1317                }
1318                if( (c=s.charAt(src))<minNoMaybeCP ||
1319                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1320                ) {
1321                    ++src;
1322                } else if(!UTF16.isSurrogate((char)c)) {
1323                    break;
1324                } else {
1325                    char c2;
1326                    if(UTF16Plus.isSurrogateLead(c)) {
1327                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1328                            c=Character.toCodePoint((char)c, c2);
1329                        }
1330                    } else /* trail surrogate */ {
1331                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1332                            --src;
1333                            c=Character.toCodePoint(c2, (char)c);
1334                        }
1335                    }
1336                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1337                        src+=Character.charCount(c);
1338                    } else {
1339                        break;
1340                    }
1341                }
1342            }
1343            if(src!=prevSrc) {
1344                // Set prevBoundary to the last character in the quick check loop.
1345                prevBoundary=src-1;
1346                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1347                        Character.isHighSurrogate(s.charAt(prevBoundary-1))
1348                ) {
1349                    --prevBoundary;
1350                }
1351                prevCC=0;
1352                // The start of the current character (c).
1353                prevSrc=src;
1354            }
1355
1356            src+=Character.charCount(c);
1357            /*
1358             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1359             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1360             * or has ccc!=0.
1361             */
1362            if(isMaybeOrNonZeroCC(norm16)) {
1363                int cc=getCCFromYesOrMaybe(norm16);
1364                if( onlyContiguous &&  // FCC
1365                    cc!=0 &&
1366                    prevCC==0 &&
1367                    prevBoundary<prevSrc &&
1368                    // prevCC==0 && prevBoundary<prevSrc tell us that
1369                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1370                    // passed the quick check "yes && ccc==0" test.
1371                    // Check whether the last character was a "yesYes" or a "yesNo".
1372                    // If a "yesNo", then we get its trailing ccc from its
1373                    // mapping and check for canonical order.
1374                    // All other cases are ok.
1375                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1376                ) {
1377                    // Fails FCD test.
1378                } else if(prevCC<=cc || cc==0) {
1379                    prevCC=cc;
1380                    if(norm16<MIN_YES_YES_WITH_CC) {
1381                        if(!doSpan) {
1382                            qcResult=1;
1383                        } else {
1384                            return prevBoundary<<1;  // spanYes does not care to know it's "maybe"
1385                        }
1386                    }
1387                    continue;
1388                }
1389            }
1390            return prevBoundary<<1;  // "no"
1391        }
1392    }
1393    public void composeAndAppend(CharSequence s,
1394                                 boolean doCompose,
1395                                 boolean onlyContiguous,
1396                                 ReorderingBuffer buffer) {
1397        int src=0, limit=s.length();
1398        if(!buffer.isEmpty()) {
1399            int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
1400            if(0!=firstStarterInSrc) {
1401                int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1402                                                               buffer.length());
1403                StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1404                                                       firstStarterInSrc+16);
1405                middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1406                buffer.removeSuffix(buffer.length()-lastStarterInDest);
1407                middle.append(s, 0, firstStarterInSrc);
1408                compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1409                src=firstStarterInSrc;
1410            }
1411        }
1412        if(doCompose) {
1413            compose(s, src, limit, onlyContiguous, true, buffer);
1414        } else {
1415            buffer.append(s, src, limit);
1416        }
1417    }
1418    // Dual functionality:
1419    // buffer!=NULL: normalize
1420    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1421    public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1422        // Note: In this function we use buffer->appendZeroCC() because we track
1423        // the lead and trail combining classes here, rather than leaving it to
1424        // the ReorderingBuffer.
1425        // The exception is the call to decomposeShort() which uses the buffer
1426        // in the normal way.
1427
1428        // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1429        // Similar to the prevBoundary in the compose() implementation.
1430        int prevBoundary=src;
1431        int prevSrc;
1432        int c=0;
1433        int prevFCD16=0;
1434        int fcd16=0;
1435
1436        for(;;) {
1437            // count code units with lccc==0
1438            for(prevSrc=src; src!=limit;) {
1439                if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
1440                    prevFCD16=~c;
1441                    ++src;
1442                } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1443                    prevFCD16=0;
1444                    ++src;
1445                } else {
1446                    if(UTF16.isSurrogate((char)c)) {
1447                        char c2;
1448                        if(UTF16Plus.isSurrogateLead(c)) {
1449                            if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1450                                c=Character.toCodePoint((char)c, c2);
1451                            }
1452                        } else /* trail surrogate */ {
1453                            if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1454                                --src;
1455                                c=Character.toCodePoint(c2, (char)c);
1456                            }
1457                        }
1458                    }
1459                    if((fcd16=getFCD16FromNormData(c))<=0xff) {
1460                        prevFCD16=fcd16;
1461                        src+=Character.charCount(c);
1462                    } else {
1463                        break;
1464                    }
1465                }
1466            }
1467            // copy these code units all at once
1468            if(src!=prevSrc) {
1469                if(src==limit) {
1470                    if(buffer!=null) {
1471                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
1472                    }
1473                    break;
1474                }
1475                prevBoundary=src;
1476                // We know that the previous character's lccc==0.
1477                if(prevFCD16<0) {
1478                    // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1479                    int prev=~prevFCD16;
1480                    prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1481                    if(prevFCD16>1) {
1482                        --prevBoundary;
1483                    }
1484                } else {
1485                    int p=src-1;
1486                    if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1487                        Character.isHighSurrogate(s.charAt(p-1))
1488                    ) {
1489                        --p;
1490                        // Need to fetch the previous character's FCD value because
1491                        // prevFCD16 was just for the trail surrogate code point.
1492                        prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1493                        // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1494                    }
1495                    if(prevFCD16>1) {
1496                        prevBoundary=p;
1497                    }
1498                }
1499                if(buffer!=null) {
1500                    // The last lccc==0 character is excluded from the
1501                    // flush-and-append call in case it needs to be modified.
1502                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1503                    buffer.append(s, prevBoundary, src);
1504                }
1505                // The start of the current character (c).
1506                prevSrc=src;
1507            } else if(src==limit) {
1508                break;
1509            }
1510
1511            src+=Character.charCount(c);
1512            // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1513            // Check for proper order, and decompose locally if necessary.
1514            if((prevFCD16&0xff)<=(fcd16>>8)) {
1515                // proper order: prev tccc <= current lccc
1516                if((fcd16&0xff)<=1) {
1517                    prevBoundary=src;
1518                }
1519                if(buffer!=null) {
1520                    buffer.appendZeroCC(c);
1521                }
1522                prevFCD16=fcd16;
1523                continue;
1524            } else if(buffer==null) {
1525                return prevBoundary;  // quick check "no"
1526            } else {
1527                /*
1528                 * Back out the part of the source that we copied or appended
1529                 * already but is now going to be decomposed.
1530                 * prevSrc is set to after what was copied/appended.
1531                 */
1532                buffer.removeSuffix(prevSrc-prevBoundary);
1533                /*
1534                 * Find the part of the source that needs to be decomposed,
1535                 * up to the next safe boundary.
1536                 */
1537                src=findNextFCDBoundary(s, src, limit);
1538                /*
1539                 * The source text does not fulfill the conditions for FCD.
1540                 * Decompose and reorder a limited piece of the text.
1541                 */
1542                decomposeShort(s, prevBoundary, src, buffer);
1543                prevBoundary=src;
1544                prevFCD16=0;
1545            }
1546        }
1547        return src;
1548    }
1549    public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
1550        int src=0, limit=s.length();
1551        if(!buffer.isEmpty()) {
1552            int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
1553            if(0!=firstBoundaryInSrc) {
1554                int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
1555                                                               buffer.length());
1556                StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
1557                                                       firstBoundaryInSrc+16);
1558                middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
1559                buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
1560                middle.append(s, 0, firstBoundaryInSrc);
1561                makeFCD(middle, 0, middle.length(), buffer);
1562                src=firstBoundaryInSrc;
1563            }
1564        }
1565        if(doMakeFCD) {
1566            makeFCD(s, src, limit, buffer);
1567        } else {
1568            buffer.append(s, src, limit);
1569        }
1570    }
1571
1572    // Note: hasDecompBoundary() could be implemented as aliases to
1573    // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
1574    // at the cost of building the FCD trie for a decomposition normalizer.
1575    public boolean hasDecompBoundary(int c, boolean before) {
1576        for(;;) {
1577            if(c<minDecompNoCP) {
1578                return true;
1579            }
1580            int norm16=getNorm16(c);
1581            if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
1582                return true;
1583            } else if(norm16>MIN_NORMAL_MAYBE_YES) {
1584                return false;  // ccc!=0
1585            } else if(isDecompNoAlgorithmic(norm16)) {
1586                c=mapAlgorithmic(c, norm16);
1587            } else {
1588                // c decomposes, get everything from the variable-length extra data
1589                int firstUnit=extraData.charAt(norm16);
1590                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1591                    return false;
1592                }
1593                if(!before) {
1594                    // decomp after-boundary: same as hasFCDBoundaryAfter(),
1595                    // fcd16<=1 || trailCC==0
1596                    if(firstUnit>0x1ff) {
1597                        return false;  // trailCC>1
1598                    }
1599                    if(firstUnit<=0xff) {
1600                        return true;  // trailCC==0
1601                    }
1602                    // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1603                }
1604                // true if leadCC==0 (hasFCDBoundaryBefore())
1605                return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
1606            }
1607        }
1608    }
1609    public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1610
1611    public boolean hasCompBoundaryBefore(int c) {
1612        return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
1613    }
1614    public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {
1615        for(;;) {
1616            int norm16=getNorm16(c);
1617            if(isInert(norm16)) {
1618                return true;
1619            } else if(norm16<=minYesNo) {
1620                // Hangul: norm16==minYesNo
1621                // Hangul LVT has a boundary after it.
1622                // Hangul LV and non-inert yesYes characters combine forward.
1623                return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
1624            } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1625                return false;
1626            } else if(isDecompNoAlgorithmic(norm16)) {
1627                c=mapAlgorithmic(c, norm16);
1628            } else {
1629                // c decomposes, get everything from the variable-length extra data.
1630                // If testInert, then c must be a yesNo character which has lccc=0,
1631                // otherwise it could be a noNo.
1632                int firstUnit=extraData.charAt(norm16);
1633                // true if
1634                //   not MAPPING_NO_COMP_BOUNDARY_AFTER
1635                //     (which is set if
1636                //       c is not deleted, and
1637                //       it and its decomposition do not combine forward, and it has a starter)
1638                //   and if FCC then trailCC<=1
1639                return
1640                    (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1641                    (!onlyContiguous || firstUnit<=0x1ff);
1642            }
1643        }
1644    }
1645
1646    public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
1647    public boolean hasFCDBoundaryAfter(int c) {
1648        int fcd16=getFCD16(c);
1649        return fcd16<=1 || (fcd16&0xff)==0;
1650    }
1651    public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
1652
1653    private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1654    private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1655    private static boolean isInert(int norm16) { return norm16==0; }
1656    private static boolean isJamoL(int norm16) { return norm16==1; }
1657    private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1658    private boolean isHangul(int norm16) { return norm16==minYesNo; }
1659    private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1660    // UBool isCompYes(uint16_t norm16) const {
1661    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1662    // }
1663    // UBool isCompYesOrMaybe(uint16_t norm16) const {
1664    //     return norm16<minNoNo || minMaybeYes<=norm16;
1665    // }
1666    // private boolean hasZeroCCFromDecompYes(int norm16) {
1667    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1668    // }
1669    private boolean isDecompYesAndZeroCC(int norm16) {
1670        return norm16<minYesNo ||
1671               norm16==JAMO_VT ||
1672               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1673    }
1674    /**
1675     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1676     * the MaybeYes which combine-forward and have ccc=0.
1677     * (Standard Unicode 5.2 normalization does not have such characters.)
1678     */
1679    private boolean isMostDecompYesAndZeroCC(int norm16) {
1680        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1681    }
1682    private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1683
1684    // For use with isCompYes().
1685    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1686    // static uint8_t getCCFromYes(uint16_t norm16) {
1687    //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
1688    // }
1689    private int getCCFromNoNo(int norm16) {
1690        if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1691            return extraData.charAt(norm16-1)&0xff;
1692        } else {
1693            return 0;
1694        }
1695    }
1696    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
1697    int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
1698        int c;
1699        if(cpStart==(cpLimit-1)) {
1700            c=s.charAt(cpStart);
1701        } else {
1702            c=Character.codePointAt(s, cpStart);
1703        }
1704        int prevNorm16=getNorm16(c);
1705        if(prevNorm16<=minYesNo) {
1706            return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
1707        } else {
1708            return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo
1709        }
1710    }
1711
1712    // Requires algorithmic-NoNo.
1713    private int mapAlgorithmic(int c, int norm16) {
1714        return c+norm16-(minMaybeYes-MAX_DELTA-1);
1715    }
1716
1717    // Requires minYesNo<norm16<limitNoNo.
1718    // private int getMapping(int norm16) { return /*extraData+*/norm16; }
1719
1720    /**
1721     * @return index into maybeYesCompositions, or -1
1722     */
1723    private int getCompositionsListForDecompYes(int norm16) {
1724        if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
1725            return -1;
1726        } else {
1727            if((norm16-=minMaybeYes)<0) {
1728                // norm16<minMaybeYes: index into extraData which is a substring at
1729                //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1730                // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1731                norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1732            }
1733            return norm16;
1734        }
1735    }
1736    /**
1737     * @return index into maybeYesCompositions
1738     */
1739    private int getCompositionsListForComposite(int norm16) {
1740        // composite has both mapping & compositions list
1741        int firstUnit=extraData.charAt(norm16);
1742        return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+  // mapping in maybeYesCompositions
1743            1+  // +1 to skip the first unit with the mapping lenth
1744            (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1745    }
1746    /**
1747     * @param c code point must have compositions
1748     * @return index into maybeYesCompositions
1749     */
1750    private int getCompositionsList(int norm16) {
1751        return isDecompYes(norm16) ?
1752                getCompositionsListForDecompYes(norm16) :
1753                getCompositionsListForComposite(norm16);
1754    }
1755
1756    // Decompose a short piece of text which is likely to contain characters that
1757    // fail the quick check loop and/or where the quick check loop's overhead
1758    // is unlikely to be amortized.
1759    // Called by the compose() and makeFCD() implementations.
1760    // Public in Java for collation implementation code.
1761    public void decomposeShort(CharSequence s, int src, int limit,
1762                               ReorderingBuffer buffer) {
1763        while(src<limit) {
1764            int c=Character.codePointAt(s, src);
1765            src+=Character.charCount(c);
1766            decompose(c, getNorm16(c), buffer);
1767        }
1768    }
1769    private void decompose(int c, int norm16,
1770                           ReorderingBuffer buffer) {
1771        // Only loops for 1:1 algorithmic mappings.
1772        for(;;) {
1773            // get the decomposition and the lead and trail cc's
1774            if(isDecompYes(norm16)) {
1775                // c does not decompose
1776                buffer.append(c, getCCFromYesOrMaybe(norm16));
1777            } else if(isHangul(norm16)) {
1778                // Hangul syllable: decompose algorithmically
1779                Hangul.decompose(c, buffer);
1780            } else if(isDecompNoAlgorithmic(norm16)) {
1781                c=mapAlgorithmic(c, norm16);
1782                norm16=getNorm16(c);
1783                continue;
1784            } else {
1785                // c decomposes, get everything from the variable-length extra data
1786                int firstUnit=extraData.charAt(norm16);
1787                int length=firstUnit&MAPPING_LENGTH_MASK;
1788                int leadCC, trailCC;
1789                trailCC=firstUnit>>8;
1790                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1791                    leadCC=extraData.charAt(norm16-1)>>8;
1792                } else {
1793                    leadCC=0;
1794                }
1795                ++norm16;  // skip over the firstUnit
1796                buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
1797            }
1798            return;
1799        }
1800    }
1801
1802    /**
1803     * Finds the recomposition result for
1804     * a forward-combining "lead" character,
1805     * specified with a pointer to its compositions list,
1806     * and a backward-combining "trail" character.
1807     *
1808     * <p>If the lead and trail characters combine, then this function returns
1809     * the following "compositeAndFwd" value:
1810     * <pre>
1811     * Bits 21..1  composite character
1812     * Bit      0  set if the composite is a forward-combining starter
1813     * </pre>
1814     * otherwise it returns -1.
1815     *
1816     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1817     * encoded as either pairs or triples of 16-bit units.
1818     * The last entry has the high bit of its first unit set.
1819     *
1820     * <p>The list is sorted by ascending trail characters (there are no duplicates).
1821     * A linear search is used.
1822     *
1823     * <p>See normalizer2impl.h for a more detailed description
1824     * of the compositions list format.
1825     */
1826    private static int combine(String compositions, int list, int trail) {
1827        int key1, firstUnit;
1828        if(trail<COMP_1_TRAIL_LIMIT) {
1829            // trail character is 0..33FF
1830            // result entry may have 2 or 3 units
1831            key1=(trail<<1);
1832            while(key1>(firstUnit=compositions.charAt(list))) {
1833                list+=2+(firstUnit&COMP_1_TRIPLE);
1834            }
1835            if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1836                if((firstUnit&COMP_1_TRIPLE)!=0) {
1837                    return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1838                } else {
1839                    return compositions.charAt(list+1);
1840                }
1841            }
1842        } else {
1843            // trail character is 3400..10FFFF
1844            // result entry has 3 units
1845            key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1846            int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1847            int secondUnit;
1848            for(;;) {
1849                if(key1>(firstUnit=compositions.charAt(list))) {
1850                    list+=2+(firstUnit&COMP_1_TRIPLE);
1851                } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1852                    if(key2>(secondUnit=compositions.charAt(list+1))) {
1853                        if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1854                            break;
1855                        } else {
1856                            list+=3;
1857                        }
1858                    } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1859                        return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1860                    } else {
1861                        break;
1862                    }
1863                } else {
1864                    break;
1865                }
1866            }
1867        }
1868        return -1;
1869    }
1870    /**
1871     * @param list some character's compositions list
1872     * @param set recursively receives the composites from these compositions
1873     */
1874    private void addComposites(int list, UnicodeSet set) {
1875        int firstUnit, compositeAndFwd;
1876        do {
1877            firstUnit=maybeYesCompositions.charAt(list);
1878            if((firstUnit&COMP_1_TRIPLE)==0) {
1879                compositeAndFwd=maybeYesCompositions.charAt(list+1);
1880                list+=2;
1881            } else {
1882                compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
1883                                maybeYesCompositions.charAt(list+2);
1884                list+=3;
1885            }
1886            int composite=compositeAndFwd>>1;
1887            if((compositeAndFwd&1)!=0) {
1888                addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
1889            }
1890            set.add(composite);
1891        } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1892    }
1893    /*
1894     * Recomposes the buffer text starting at recomposeStartIndex
1895     * (which is in NFD - decomposed and canonically ordered),
1896     * and truncates the buffer contents.
1897     *
1898     * Note that recomposition never lengthens the text:
1899     * Any character consists of either one or two code units;
1900     * a composition may contain at most one more code unit than the original starter,
1901     * while the combining mark that is removed has at least one code unit.
1902     */
1903    private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1904                           boolean onlyContiguous) {
1905        StringBuilder sb=buffer.getStringBuilder();
1906        int p=recomposeStartIndex;
1907        if(p==sb.length()) {
1908            return;
1909        }
1910
1911        int starter, pRemove;
1912        int compositionsList;
1913        int c, compositeAndFwd;
1914        int norm16;
1915        int cc, prevCC;
1916        boolean starterIsSupplementary;
1917
1918        // Some of the following variables are not used until we have a forward-combining starter
1919        // and are only initialized now to avoid compiler warnings.
1920        compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1921        starter=-1;
1922        starterIsSupplementary=false;
1923        prevCC=0;
1924
1925        for(;;) {
1926            c=sb.codePointAt(p);
1927            p+=Character.charCount(c);
1928            norm16=getNorm16(c);
1929            cc=getCCFromYesOrMaybe(norm16);
1930            if( // this character combines backward and
1931                isMaybe(norm16) &&
1932                // we have seen a starter that combines forward and
1933                compositionsList>=0 &&
1934                // the backward-combining character is not blocked
1935                (prevCC<cc || prevCC==0)
1936            ) {
1937                if(isJamoVT(norm16)) {
1938                    // c is a Jamo V/T, see if we can compose it with the previous character.
1939                    if(c<Hangul.JAMO_T_BASE) {
1940                        // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1941                        char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1942                        if(prev<Hangul.JAMO_L_COUNT) {
1943                            pRemove=p-1;
1944                            char syllable=(char)
1945                                (Hangul.HANGUL_BASE+
1946                                 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1947                                 Hangul.JAMO_T_COUNT);
1948                            char t;
1949                            if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1950                                ++p;
1951                                syllable+=t;  // The next character was a Jamo T.
1952                            }
1953                            sb.setCharAt(starter, syllable);
1954                            // remove the Jamo V/T
1955                            sb.delete(pRemove, p);
1956                            p=pRemove;
1957                        }
1958                    }
1959                    /*
1960                     * No "else" for Jamo T:
1961                     * Since the input is in NFD, there are no Hangul LV syllables that
1962                     * a Jamo T could combine with.
1963                     * All Jamo Ts are combined above when handling Jamo Vs.
1964                     */
1965                    if(p==sb.length()) {
1966                        break;
1967                    }
1968                    compositionsList=-1;
1969                    continue;
1970                } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1971                    // The starter and the combining mark (c) do combine.
1972                    int composite=compositeAndFwd>>1;
1973
1974                    // Remove the combining mark.
1975                    pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1976                    sb.delete(pRemove, p);
1977                    p=pRemove;
1978                    // Replace the starter with the composite.
1979                    if(starterIsSupplementary) {
1980                        if(composite>0xffff) {
1981                            // both are supplementary
1982                            sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1983                            sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1984                        } else {
1985                            sb.setCharAt(starter, (char)c);
1986                            sb.deleteCharAt(starter+1);
1987                            // The composite is shorter than the starter,
1988                            // move the intermediate characters forward one.
1989                            starterIsSupplementary=false;
1990                            --p;
1991                        }
1992                    } else if(composite>0xffff) {
1993                        // The composite is longer than the starter,
1994                        // move the intermediate characters back one.
1995                        starterIsSupplementary=true;
1996                        sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1997                        sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1998                        ++p;
1999                    } else {
2000                        // both are on the BMP
2001                        sb.setCharAt(starter, (char)composite);
2002                    }
2003
2004                    // Keep prevCC because we removed the combining mark.
2005
2006                    if(p==sb.length()) {
2007                        break;
2008                    }
2009                    // Is the composite a starter that combines forward?
2010                    if((compositeAndFwd&1)!=0) {
2011                        compositionsList=
2012                            getCompositionsListForComposite(getNorm16(composite));
2013                    } else {
2014                        compositionsList=-1;
2015                    }
2016
2017                    // We combined; continue with looking for compositions.
2018                    continue;
2019                }
2020            }
2021
2022            // no combination this time
2023            prevCC=cc;
2024            if(p==sb.length()) {
2025                break;
2026            }
2027
2028            // If c did not combine, then check if it is a starter.
2029            if(cc==0) {
2030                // Found a new starter.
2031                if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
2032                    // It may combine with something, prepare for it.
2033                    if(c<=0xffff) {
2034                        starterIsSupplementary=false;
2035                        starter=p-1;
2036                    } else {
2037                        starterIsSupplementary=true;
2038                        starter=p-2;
2039                    }
2040                }
2041            } else if(onlyContiguous) {
2042                // FCC: no discontiguous compositions; any intervening character blocks.
2043                compositionsList=-1;
2044            }
2045        }
2046        buffer.flush();
2047    }
2048
2049    public int composePair(int a, int b) {
2050        int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
2051        int list;
2052        if(isInert(norm16)) {
2053            return -1;
2054        } else if(norm16<minYesNoMappingsOnly) {
2055            if(isJamoL(norm16)) {
2056                b-=Hangul.JAMO_V_BASE;
2057                if(0<=b && b<Hangul.JAMO_V_COUNT) {
2058                    return
2059                        (Hangul.HANGUL_BASE+
2060                         ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
2061                         Hangul.JAMO_T_COUNT);
2062                } else {
2063                    return -1;
2064                }
2065            } else if(isHangul(norm16)) {
2066                b-=Hangul.JAMO_T_BASE;
2067                if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
2068                    return a+b;
2069                } else {
2070                    return -1;
2071                }
2072            } else {
2073                // 'a' has a compositions list in extraData
2074                list=norm16;
2075                if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
2076                    list+=  // mapping pointer
2077                        1+  // +1 to skip the first unit with the mapping lenth
2078                        (extraData.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
2079                }
2080                // Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
2081                list+=MIN_NORMAL_MAYBE_YES-minMaybeYes;
2082            }
2083        } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
2084            return -1;
2085        } else {
2086            list=norm16-minMaybeYes;  // offset into maybeYesCompositions
2087        }
2088        if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
2089            return -1;
2090        }
2091        return combine(maybeYesCompositions, list, b)>>1;
2092    }
2093
2094    /**
2095     * Does c have a composition boundary before it?
2096     * True if its decomposition begins with a character that has
2097     * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
2098     * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
2099     * (isCompYesAndZeroCC()) so we need not decompose.
2100     */
2101    private boolean hasCompBoundaryBefore(int c, int norm16) {
2102        for(;;) {
2103            if(isCompYesAndZeroCC(norm16)) {
2104                return true;
2105            } else if(isMaybeOrNonZeroCC(norm16)) {
2106                return false;
2107            } else if(isDecompNoAlgorithmic(norm16)) {
2108                c=mapAlgorithmic(c, norm16);
2109                norm16=getNorm16(c);
2110            } else {
2111                // c decomposes, get everything from the variable-length extra data
2112                int firstUnit=extraData.charAt(norm16);
2113                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
2114                    return false;
2115                }
2116                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
2117                    return false;  // non-zero leadCC
2118                }
2119                return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
2120            }
2121        }
2122    }
2123    private int findPreviousCompBoundary(CharSequence s, int p) {
2124        while(p>0) {
2125            int c=Character.codePointBefore(s, p);
2126            p-=Character.charCount(c);
2127            if(hasCompBoundaryBefore(c)) {
2128                break;
2129            }
2130            // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
2131            // but that's probably not worth the extra cost.
2132        }
2133        return p;
2134    }
2135    private int findNextCompBoundary(CharSequence s, int p, int limit) {
2136        while(p<limit) {
2137            int c=Character.codePointAt(s, p);
2138            int norm16=normTrie.get(c);
2139            if(hasCompBoundaryBefore(c, norm16)) {
2140                break;
2141            }
2142            p+=Character.charCount(c);
2143        }
2144        return p;
2145    }
2146
2147    private int findPreviousFCDBoundary(CharSequence s, int p) {
2148        while(p>0) {
2149            int c=Character.codePointBefore(s, p);
2150            p-=Character.charCount(c);
2151            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
2152                break;
2153            }
2154        }
2155        return p;
2156    }
2157    private int findNextFCDBoundary(CharSequence s, int p, int limit) {
2158        while(p<limit) {
2159            int c=Character.codePointAt(s, p);
2160            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
2161                break;
2162            }
2163            p+=Character.charCount(c);
2164        }
2165        return p;
2166    }
2167
2168    private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
2169        int canonValue=newData.get(decompLead);
2170        if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2171            // origin is the first character whose decomposition starts with
2172            // the character for which we are setting the value.
2173            newData.set(decompLead, canonValue|origin);
2174        } else {
2175            // origin is not the first character, or it is U+0000.
2176            UnicodeSet set;
2177            if((canonValue&CANON_HAS_SET)==0) {
2178                int firstOrigin=canonValue&CANON_VALUE_MASK;
2179                canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
2180                newData.set(decompLead, canonValue);
2181                canonStartSets.add(set=new UnicodeSet());
2182                if(firstOrigin!=0) {
2183                    set.add(firstOrigin);
2184                }
2185            } else {
2186                set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
2187            }
2188            set.add(origin);
2189        }
2190    }
2191
2192    @SuppressWarnings("unused")
2193    private VersionInfo dataVersion;
2194
2195    // Code point thresholds for quick check codes.
2196    private int minDecompNoCP;
2197    private int minCompNoMaybeCP;
2198
2199    // Norm16 value thresholds for quick check combinations and types of extra data.
2200    private int minYesNo;
2201    private int minYesNoMappingsOnly;
2202    private int minNoNo;
2203    private int limitNoNo;
2204    private int minMaybeYes;
2205
2206    private Trie2_16 normTrie;
2207    private String maybeYesCompositions;
2208    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2209    private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2210    private int[] tccc180;  // [0x180] tccc values for U+0000..U+017F
2211
2212    private Trie2_32 canonIterData;
2213    private ArrayList<UnicodeSet> canonStartSets;
2214
2215    // bits in canonIterData
2216    private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
2217    private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
2218    private static final int CANON_HAS_SET = 0x200000;
2219    private static final int CANON_VALUE_MASK = 0x1fffff;
2220}
2221