12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others.
22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License
37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/*
47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *******************************************************************************
57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 1996-2014, International Business Machines Corporation and    *
67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved.                                                *
77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *******************************************************************************
87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text;
107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.ArrayList;
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.HashSet;
137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Iterator;
147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.List;
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Set;
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Norm2AllModes;
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl;
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Utility;
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacter;
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This class allows one to iterate through all the strings that are canonically equivalent to a given
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * string. For example, here are some sample results:
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre>
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *</pre>
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * since it has not been optimized for that situation.
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @author M. Davis
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.4
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class CanonicalIterator {
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Construct a CanonicalIterator object
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param source string to get results for
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.4
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public CanonicalIterator(String source) {
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        nfd = allModes.decomp;
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        nfcImpl = allModes.impl.ensureCanonIterData();
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        setSource(source);
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Gets the NFD form of the current source we are iterating over.
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return gets the source: NOTE: it is the NFD form of the source originally passed in
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.4
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public String getSource() {
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert      return source;
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Resets the iterator so that one can start again from the beginning.
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.4
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public void reset() {
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        done = false;
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = 0; i < current.length; ++i) {
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            current[i] = 0;
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Get the next canonically equivalent string.
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return the next string that is canonically equivalent. The value null is returned when
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * the iteration is done.
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.4
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public String next() {
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (done) return null;
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // construct return value
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        buffer.setLength(0); // delete old contents
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = 0; i < pieces.length; ++i) {
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            buffer.append(pieces[i][current[i]]);
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String result = buffer.toString();
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // find next value for next time
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = current.length - 1; ; --i) {
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (i < 0) {
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                done = true;
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            current[i]++;
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (current[i] < pieces[i].length) break; // got sequence
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            current[i] = 0;
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return result;
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Set a new source for this iterator. Allows object reuse.
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param newSource the source string to iterate against. This allows the same iterator to be used
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * while changing the source string, saving object creation.
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @stable ICU 2.4
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public void setSource(String newSource) {
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        source = nfd.normalize(newSource);
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        done = false;
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // catch degenerate case
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (newSource.length() == 0) {
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pieces = new String[1][];
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            current = new int[1];
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pieces[0] = new String[]{""};
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return;
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // find the segments
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        List<String> segmentList = new ArrayList<String>();
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int cp;
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int start = 0;
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // i should be the end of the first code point
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // break up the string into segements
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int i = UTF16.findOffsetFromCodePoint(source, 1);
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (; i < source.length(); i += Character.charCount(cp)) {
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cp = source.codePointAt(i);
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (nfcImpl.isCanonSegmentStarter(cp)) {
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                segmentList.add(source.substring(start, i)); // add up to i
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                start = i;
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        segmentList.add(source.substring(start, i)); // add last one
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // allocate the arrays, and find the strings that are CE to each segment
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        pieces = new String[segmentList.size()][];
1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        current = new int[segmentList.size()];
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (i = 0; i < pieces.length; ++i) {
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (PROGRESS) System.out.println("SEGMENT");
1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pieces[i] = getEquivalents(segmentList.get(i));
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Simple implementation of permutation.
1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param source the string to find permutations for
1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param skipZeros set to true to skip characters with canonical combining class zero
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param output the set to add the results to
1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @internal
1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @deprecated This API is ICU internal only.
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    @Deprecated
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static void permute(String source, boolean skipZeros, Set<String> output) {
1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // TODO: optimize
1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //if (PROGRESS) System.out.println("Permute: " + source);
1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // optimization:
1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // if zero or one character, just return a set with it
1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // we check for length < 2 to keep from counting code points all the time
1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            output.add(source);
1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return;
1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // otherwise iterate through the string, and recursively permute all the other characters
1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Set<String> subpermute = new HashSet<String>();
1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int cp;
1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cp = UTF16.charAt(source, i);
1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // optimization:
1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // if the character is canonical combining class zero,
1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // don't permute it
1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                continue;
1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // see what the permutations of the characters before and after this one are
1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            subpermute.clear();
1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            permute(source.substring(0,i)
1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // prefix this character to all of them
1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            String chStr = UTF16.valueOf(source, i);
2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for (String s : subpermute) {
2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                String piece = chStr + s;
2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                //if (PROGRESS) System.out.println("  Piece: " + piece);
2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                output.add(piece);
2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // FOR TESTING
2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /*
2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static UnicodeSet getSafeStart() {
2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (UnicodeSet) SAFE_START.clone();
2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /*
2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *@return the set of characters whose decompositions start with the given character
2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static UnicodeSet getStarts(int cp) {
2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        UnicodeSet result = AT_START.get(cp);
2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (result == null) result = EMPTY;
2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (UnicodeSet) result.clone();
2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // ===================== PRIVATES ==============================
2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // debug
2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static boolean PROGRESS = false; // debug progress
2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static boolean SKIP_ZEROS = true;
2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // fields
2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final Normalizer2 nfd;
2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final Normalizer2Impl nfcImpl;
2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private String source;
2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private boolean done;
2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private String[][] pieces;
2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private int[] current;
2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Note: C will need two more fields, since arrays there don't have lengths
2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // int pieces_length;
2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // int[] pieces_lengths;
2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // transient fields
2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private transient StringBuilder buffer = new StringBuilder();
2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private String[] getEquivalents(String segment) {
2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Set<String> result = new HashSet<String>();
2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Set<String> basic = getEquivalents2(segment);
2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Set<String> permutations = new HashSet<String>();
2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // now get all the permutations
2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // add only the ones that are canonically equivalent
2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // TODO: optimize by not permuting any class zero.
2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Iterator<String> it = basic.iterator();
2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while (it.hasNext()) {
2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            String item = it.next();
2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            permutations.clear();
2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            permute(item, SKIP_ZEROS, permutations);
2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            Iterator<String> it2 = permutations.iterator();
2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            while (it2.hasNext()) {
2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                String possible = it2.next();
2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/*
2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (attempt.equals(segment)) {
2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/
2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (Normalizer.compare(possible, segment,0)==0) {
2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    result.add(possible);
2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // convert into a String[] to clean up storage
2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String[] finalResult = new String[result.size()];
2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        result.toArray(finalResult);
2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return finalResult;
2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private Set<String> getEquivalents2(String segment) {
2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Set<String> result = new HashSet<String>();
2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        result.add(segment);
2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        StringBuffer workingBuffer = new StringBuffer();
2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        UnicodeSet starts = new UnicodeSet();
2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // cycle through all the characters
3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int cp;
3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // see if any character is at the start of some decomposition
3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cp = segment.codePointAt(i);
3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (!nfcImpl.getCanonStartSet(cp, starts)) {
3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert              continue;
3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // if so, see which decompositions match
3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int cp2 = iter.codepoint;
3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                Set<String> remainder = extract(cp2, segment, i, workingBuffer);
3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (remainder == null) {
3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // there were some matches, so add all the possibilities to the set.
3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                String prefix= segment.substring(0,i);
3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                prefix += UTF16.valueOf(cp2);
3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                for (String item : remainder) {
3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    result.add(prefix + item);
3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return result;
3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        /*
3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Set result = new HashSet();
3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        result.add(segment);
3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        StringBuffer workingBuffer = new StringBuffer();
3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // cycle through all the characters
3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int cp;
3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // see if any character is at the start of some decomposition
3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cp = UTF16.charAt(segment, i);
3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            NormalizerImpl.getCanonStartSet(c,fillSet)
3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            UnicodeSet starts = AT_START.get(cp);
3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (starts == null) continue;
3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            UnicodeSetIterator usi = new UnicodeSetIterator(starts);
3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // if so, see which decompositions match
3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            while (usi.next()) {
3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int cp2 = usi.codepoint;
3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // we know that there are no strings in it
3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // so we don't have to check CharacterIterator.IS_STRING
3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                Set remainder = extract(cp2, segment, i, workingBuffer);
3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (remainder == null) continue;
3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // there were some matches, so add all the possibilities to the set.
3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                Iterator it = remainder.iterator();
3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                while (it.hasNext()) {
3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    String item = (String) it.next();
3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    result.add(prefix + item);
3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return result;
3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        */
3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * See if the decomposition of cp2 is at segment starting at segmentPos
3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * (with canonical rearrangment!)
3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * If so, take the remainder, and return the equivalents
3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            + ", " + Utility.hex(segment.substring(segmentPos)));
3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String decomp = nfcImpl.getDecomposition(comp);
3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (decomp == null) {
3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            decomp = UTF16.valueOf(comp);
3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // See if it matches the start of segment (at segmentPos)
3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        boolean ok = false;
3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int cp;
3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int decompPos = 0;
3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int decompCp = UTF16.charAt(decomp,0);
3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //int decompClass = getClass(decompCp);
3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        buf.setLength(0); // initialize working buffer, shared among callees
3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cp = UTF16.charAt(segment, i);
3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (cp == decompCp) { // if equal, eat another cp from decomp
3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (decompPos == decomp.length()) { // done, have all decomp characters!
3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ok = true;
3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                decompCp = UTF16.charAt(decomp, decompPos);
3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                decompPos += UTF16.getCharCount(decompCp);
3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                //decompClass = getClass(decompCp);
3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // brute force approach
4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                UTF16.append(buf, cp);
4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                /* TODO: optimize
4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // since we know that the classes are monotonically increasing, after zero
4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // e.g. 0 5 7 9 0 3
4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // we can do an optimization
4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // there are only a few cases that work: zero, less, same, greater
4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // if both classes are the same, we fail
4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // if the decomp class < the segment class, we fail
4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                segClass = getClass(cp);
4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (decompClass <= segClass) return null;
4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                */
4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (!ok) return null; // we failed, characters left over
4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println("Matches");
4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String remainder = buf.toString();
4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // brute force approach
4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // to check to make sure result is canonically equivalent
4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        /*
4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        */
4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // get the remaining combinations
4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return getEquivalents2(remainder);
4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /*
4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // TODO: fix once we have a codepoint interface to get the canonical combining class
4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // TODO: Need public access to canonical combining class in UCharacter!
4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static int getClass(int cp) {
4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return Normalizer.getClass((char)cp);
4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert   // ================= BUILDER =========================
4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // TODO: Flatten this data so it doesn't have to be reconstructed each time!
4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static {
4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        SET_WITH_NULL_STRING.add("");
4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert  //  private static UnicodeSet SAFE_START = new UnicodeSet();
4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert  //  private static CharMap AT_START = new CharMap();
4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert  //  private static int LAST_UNICODE = 0x10FFFF;
4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /*
4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static {
4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        buildData();
4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /*
4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static void buildData() {
4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println("Getting Safe Start");
4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int cc = UCharacter.getCombiningClass(cp);
4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (cc == 0) SAFE_START.add(cp);
4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // will fix to be really safe below
4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println();
4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println("Getting Containment");
4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //String istr = UTF16.valueOf(cp);
4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            String decomp = Normalizer.normalize(cp, Normalizer.NFD);
4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //if (decomp.equals(istr)) continue;
4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // add each character in the decomposition to canBeIn
4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int component;
4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                component = UTF16.charAt(decomp, i);
4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (i == 0) {
4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    AT_START.add(component, cp);
4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else if (UCharacter.getCombiningClass(component) == 0) {
4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    SAFE_START.remove(component);
4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (PROGRESS) System.out.println();
4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // the following is just for a map from characters to a set of characters
4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static class CharMap {
5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Map storage = new HashMap();
5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        MutableInt probe = new MutableInt();
5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        boolean converted = false;
5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        public void add(int cp, int whatItIsIn) {
5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (result == null) {
5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                result = new UnicodeSet();
5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                storage.put(probe, result);
5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            result.add(whatItIsIn);
5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        public UnicodeSet get(int cp) {
5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return (UnicodeSet) storage.get(probe.set(cp));
5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static class MutableInt {
5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        public int contents;
5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        public int hashCode() { return contents; }
5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        public boolean equals(Object other) {
5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return ((MutableInt)other).contents == contents;
5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // allows chaining
5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        public MutableInt set(int contents) {
5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            this.contents = contents;
5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return this;
5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
533