12ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* GENERATED SOURCE. DO NOT MODIFY. */ 2f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 3f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 42ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* 52ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller ******************************************************************************* 62ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Copyright (C) 1996-2014, International Business Machines Corporation and * 72ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * others. All Rights Reserved. * 82ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller ******************************************************************************* 92ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerpackage android.icu.text; 112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport java.util.ArrayList; 132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport java.util.HashSet; 142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport java.util.Iterator; 152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport java.util.List; 162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport java.util.Set; 172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport android.icu.impl.Norm2AllModes; 192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport android.icu.impl.Normalizer2Impl; 202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport android.icu.impl.Utility; 212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerimport android.icu.lang.UCharacter; 222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/** 242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * This class allows one to iterate through all the strings that are canonically equivalent to a given 252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * string. For example, here are some sample results: 262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * <pre> 282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA} 372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE} 382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA} 392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE} 402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *</pre> 412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * since it has not been optimized for that situation. 432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @author M. Davis 44836e6b40a94ec3fb7545a76cb072960442b7eee9Neil Fuller * @hide Only a subset of ICU is exposed in Android 452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerpublic final class CanonicalIterator { 482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Construct a CanonicalIterator object 502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param source string to get results for 512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public CanonicalIterator(String source) { 532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Norm2AllModes allModes = Norm2AllModes.getNFCInstance(); 542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller nfd = allModes.decomp; 552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller nfcImpl = allModes.impl.ensureCanonIterData(); 562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller setSource(source); 572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Gets the NFD form of the current source we are iterating over. 612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @return gets the source: NOTE: it is the NFD form of the source originally passed in 622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public String getSource() { 642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return source; 652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Resets the iterator so that one can start again from the beginning. 692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public void reset() { 712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller done = false; 722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = 0; i < current.length; ++i) { 732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller current[i] = 0; 742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Get the next canonically equivalent string. 792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @return the next string that is canonically equivalent. The value null is returned when 812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * the iteration is done. 822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public String next() { 842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (done) return null; 852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // construct return value 872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller buffer.setLength(0); // delete old contents 892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = 0; i < pieces.length; ++i) { 902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller buffer.append(pieces[i][current[i]]); 912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String result = buffer.toString(); 932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // find next value for next time 952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = current.length - 1; ; --i) { 972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (i < 0) { 982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller done = true; 992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break; 1002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller current[i]++; 1022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (current[i] < pieces[i].length) break; // got sequence 1032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller current[i] = 0; 1042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return result; 1062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 1092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Set a new source for this iterator. Allows object reuse. 1102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param newSource the source string to iterate against. This allows the same iterator to be used 1112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * while changing the source string, saving object creation. 1122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 1132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public void setSource(String newSource) { 1142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller source = nfd.normalize(newSource); 1152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller done = false; 1162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // catch degenerate case 1182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (newSource.length() == 0) { 1192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller pieces = new String[1][]; 1202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller current = new int[1]; 1212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller pieces[0] = new String[]{""}; 1222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return; 1232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // find the segments 1262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller List<String> segmentList = new ArrayList<String>(); 1272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cp; 1282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int start = 0; 1292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // i should be the end of the first code point 1312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // break up the string into segements 1322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int i = UTF16.findOffsetFromCodePoint(source, 1); 1342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (; i < source.length(); i += Character.charCount(cp)) { 1362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller cp = source.codePointAt(i); 1372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (nfcImpl.isCanonSegmentStarter(cp)) { 1382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller segmentList.add(source.substring(start, i)); // add up to i 1392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller start = i; 1402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller segmentList.add(source.substring(start, i)); // add last one 1432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // allocate the arrays, and find the strings that are CE to each segment 1452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller pieces = new String[segmentList.size()][]; 1462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller current = new int[segmentList.size()]; 1472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (i = 0; i < pieces.length; ++i) { 1482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("SEGMENT"); 1492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller pieces[i] = getEquivalents(segmentList.get(i)); 1502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 1542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * Simple implementation of permutation. 1552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 1562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param source the string to find permutations for 1572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param skipZeros set to true to skip characters with canonical combining class zero 1582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @param output the set to add the results to 1592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * @deprecated This API is ICU internal only. 160836e6b40a94ec3fb7545a76cb072960442b7eee9Neil Fuller * @hide draft / provisional / internal are hidden on Android 1612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 1622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller @Deprecated 1632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public static void permute(String source, boolean skipZeros, Set<String> output) { 1642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: optimize 1652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //if (PROGRESS) System.out.println("Permute: " + source); 1662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // optimization: 1682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // if zero or one character, just return a set with it 1692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // we check for length < 2 to keep from counting code points all the time 1702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) { 1712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller output.add(source); 1722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return; 1732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // otherwise iterate through the string, and recursively permute all the other characters 1762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set<String> subpermute = new HashSet<String>(); 1772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cp; 1782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 1792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller cp = UTF16.charAt(source, i); 1802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // optimization: 1822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // if the character is canonical combining class zero, 1832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // don't permute it 1842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) { 1852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 1862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller continue; 1872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 1882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // see what the permutations of the characters before and after this one are 1902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller subpermute.clear(); 1912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller permute(source.substring(0,i) 1922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute); 1932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 1942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // prefix this character to all of them 1952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String chStr = UTF16.valueOf(source, i); 1962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (String s : subpermute) { 1972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String piece = chStr + s; 1982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //if (PROGRESS) System.out.println(" Piece: " + piece); 1992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller output.add(piece); 2002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // FOR TESTING 2052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 2072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition. 2082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * 2092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public static UnicodeSet getSafeStart() { 2102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return (UnicodeSet) SAFE_START.clone(); 2112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 2132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 2142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *@return the set of characters whose decompositions start with the given character 2152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * 2162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public static UnicodeSet getStarts(int cp) { 2172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller UnicodeSet result = AT_START.get(cp); 2182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (result == null) result = EMPTY; 2192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return (UnicodeSet) result.clone(); 2202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 2222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // ===================== PRIVATES ============================== 2242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // debug 2262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private static boolean PROGRESS = false; // debug progress 2272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null; 2282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private static boolean SKIP_ZEROS = true; 2292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // fields 2312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private final Normalizer2 nfd; 2322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private final Normalizer2Impl nfcImpl; 2332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private String source; 2342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private boolean done; 2352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private String[][] pieces; 2362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private int[] current; 2372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Note: C will need two more fields, since arrays there don't have lengths 2382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // int pieces_length; 2392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // int[] pieces_lengths; 2402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // transient fields 2422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private transient StringBuilder buffer = new StringBuilder(); 2432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 2462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private String[] getEquivalents(String segment) { 2472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set<String> result = new HashSet<String>(); 2482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set<String> basic = getEquivalents2(segment); 2492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set<String> permutations = new HashSet<String>(); 2502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // now get all the permutations 2522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // add only the ones that are canonically equivalent 2532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: optimize by not permuting any class zero. 2542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Iterator<String> it = basic.iterator(); 2552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller while (it.hasNext()) { 2562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String item = it.next(); 2572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller permutations.clear(); 2582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller permute(item, SKIP_ZEROS, permutations); 2592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Iterator<String> it2 = permutations.iterator(); 2602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller while (it2.hasNext()) { 2612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String possible = it2.next(); 2622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/* 2642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0); 2652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (attempt.equals(segment)) { 2662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller*/ 2672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (Normalizer.compare(possible, segment,0)==0) { 2682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible)); 2702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result.add(possible); 2712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } else { 2732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible)); 2742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // convert into a String[] to clean up storage 2792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String[] finalResult = new String[result.size()]; 2802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result.toArray(finalResult); 2812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return finalResult; 2822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 2832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private Set<String> getEquivalents2(String segment) { 2862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set<String> result = new HashSet<String>(); 2882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment)); 2902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result.add(segment); 2922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller StringBuffer workingBuffer = new StringBuffer(); 2932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller UnicodeSet starts = new UnicodeSet(); 2942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // cycle through all the characters 2962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cp; 2972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = 0; i < segment.length(); i += Character.charCount(cp)) { 2982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 2992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // see if any character is at the start of some decomposition 3002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller cp = segment.codePointAt(i); 3012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (!nfcImpl.getCanonStartSet(cp, starts)) { 3022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller continue; 3032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // if so, see which decompositions match 3052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) { 3062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cp2 = iter.codepoint; 3072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set<String> remainder = extract(cp2, segment, i, workingBuffer); 3082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (remainder == null) { 3092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller continue; 3102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // there were some matches, so add all the possibilities to the set. 3132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String prefix= segment.substring(0,i); 3142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller prefix += UTF16.valueOf(cp2); 3152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (String item : remainder) { 3162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result.add(prefix + item); 3172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return result; 3212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 3222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set result = new HashSet(); 3232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment)); 3242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result.add(segment); 3252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller StringBuffer workingBuffer = new StringBuffer(); 3262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // cycle through all the characters 3282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cp; 3292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) { 3312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // see if any character is at the start of some decomposition 3322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller cp = UTF16.charAt(segment, i); 3332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller NormalizerImpl.getCanonStartSet(c,fillSet) 3342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller UnicodeSet starts = AT_START.get(cp); 3352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (starts == null) continue; 3362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller UnicodeSetIterator usi = new UnicodeSetIterator(starts); 3372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // if so, see which decompositions match 3382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller while (usi.next()) { 3392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cp2 = usi.codepoint; 3402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // we know that there are no strings in it 3412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // so we don't have to check CharacterIterator.IS_STRING 3422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Set remainder = extract(cp2, segment, i, workingBuffer); 3432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (remainder == null) continue; 3442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // there were some matches, so add all the possibilities to the set. 3462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String prefix = segment.substring(0, i) + UTF16.valueOf(cp2); 3472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Iterator it = remainder.iterator(); 3482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller while (it.hasNext()) { 3492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String item = (String) it.next(); 3502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item)); 3512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result.add(prefix + item); 3522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return result; 3562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 3572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /** 3602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * See if the decomposition of cp2 is at segment starting at segmentPos 3612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * (with canonical rearrangment!) 3622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller * If so, take the remainder, and return the equivalents 3632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 3642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) { 3652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp)) 3662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller + ", " + Utility.hex(segment.substring(segmentPos))); 3672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String decomp = nfcImpl.getDecomposition(comp); 3692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (decomp == null) { 3702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller decomp = UTF16.valueOf(comp); 3712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // See if it matches the start of segment (at segmentPos) 3742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller boolean ok = false; 3752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cp; 3762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int decompPos = 0; 3772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int decompCp = UTF16.charAt(decomp,0); 3782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char 3792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //int decompClass = getClass(decompCp); 3802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller buf.setLength(0); // initialize working buffer, shared among callees 3812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 3822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) { 3832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller cp = UTF16.charAt(segment, i); 3842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (cp == decompCp) { // if equal, eat another cp from decomp 3852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp))); 3862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (decompPos == decomp.length()) { // done, have all decomp characters! 3872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars 3882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller ok = true; 3892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller break; 3902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 3912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller decompCp = UTF16.charAt(decomp, decompPos); 3922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller decompPos += UTF16.getCharCount(decompCp); 3932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //decompClass = getClass(decompCp); 3942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } else { 3952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp))); 3962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // brute force approach 3972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller UTF16.append(buf, cp); 3982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* TODO: optimize 3992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // since we know that the classes are monotonically increasing, after zero 4002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // e.g. 0 5 7 9 0 3 4012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // we can do an optimization 4022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // there are only a few cases that work: zero, less, same, greater 4032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // if both classes are the same, we fail 4042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // if the decomp class < the segment class, we fail 4052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller segClass = getClass(cp); 4072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (decompClass <= segClass) return null; 4082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 4092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (!ok) return null; // we failed, characters left over 4122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("Matches"); 4132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder 4142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String remainder = buf.toString(); 4152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // brute force approach 4172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // to check to make sure result is canonically equivalent 4182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 4192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0); 4202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null; 4212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 4222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null; 4242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // get the remaining combinations 4262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return getEquivalents2(remainder); 4272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 4302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: fix once we have a codepoint interface to get the canonical combining class 4312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: Need public access to canonical combining class in UCharacter! 4322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private static int getClass(int cp) { 4332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return Normalizer.getClass((char)cp); 4342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 4362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // ================= BUILDER ========================= 4382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: Flatten this data so it doesn't have to be reconstructed each time! 4392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change 4412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change 4422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static { 4432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller SET_WITH_NULL_STRING.add(""); 4442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // private static UnicodeSet SAFE_START = new UnicodeSet(); 4472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // private static CharMap AT_START = new CharMap(); 4482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!; 4502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded. 4512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // private static int LAST_UNICODE = 0x10FFFF; 4522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 4532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller static { 4542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller buildData(); 4552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 4572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller /* 4582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private static void buildData() { 4592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("Getting Safe Start"); 4612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 4622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 4632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int cc = UCharacter.getCombiningClass(cp); 4642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (cc == 0) SAFE_START.add(cp); 4652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // will fix to be really safe below 4662ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println(); 4682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println("Getting Containment"); 4702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 4712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 4722ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue; 4742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //String istr = UTF16.valueOf(cp); 4762ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller String decomp = Normalizer.normalize(cp, Normalizer.NFD); 4772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller //if (decomp.equals(istr)) continue; 4782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4792ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // add each character in the decomposition to canBeIn 4802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller int component; 4822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) { 4832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller component = UTF16.charAt(decomp, i); 4842ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (i == 0) { 4852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller AT_START.add(component, cp); 4862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } else if (UCharacter.getCombiningClass(component) == 0) { 4872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller SAFE_START.remove(component); 4882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4912ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (PROGRESS) System.out.println(); 4922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 4932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // the following is just for a map from characters to a set of characters 4942ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 4952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private static class CharMap { 4962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller Map storage = new HashMap(); 4972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller MutableInt probe = new MutableInt(); 4982ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller boolean converted = false; 4992ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 5002ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public void add(int cp, int whatItIsIn) { 5012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp)); 5022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller if (result == null) { 5032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result = new UnicodeSet(); 5042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller storage.put(probe, result); 5052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 5062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller result.add(whatItIsIn); 5072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 5082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 5092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public UnicodeSet get(int cp) { 5102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return (UnicodeSet) storage.get(probe.set(cp)); 5112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 5122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 5132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 5142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller private static class MutableInt { 5152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public int contents; 5162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public int hashCode() { return contents; } 5172ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public boolean equals(Object other) { 5182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return ((MutableInt)other).contents == contents; 5192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 5202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller // allows chaining 5212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller public MutableInt set(int contents) { 5222ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller this.contents = contents; 5232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller return this; 5242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 5252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller } 5262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */ 5272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller 5282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller} 529