12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 1996-2014, International Business Machines Corporation and * 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved. * 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.ArrayList; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.HashSet; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Iterator; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.List; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Set; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Norm2AllModes; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Utility; 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacter; 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This class allows one to iterate through all the strings that are canonically equivalent to a given 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * string. For example, here are some sample results: 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre> 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA} 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE} 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA} 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE} 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *</pre> 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * since it has not been optimized for that situation. 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @author M. Davis 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.4 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class CanonicalIterator { 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Construct a CanonicalIterator object 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source string to get results for 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.4 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CanonicalIterator(String source) { 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Norm2AllModes allModes = Norm2AllModes.getNFCInstance(); 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfd = allModes.decomp; 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl = allModes.impl.ensureCanonIterData(); 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setSource(source); 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the NFD form of the current source we are iterating over. 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return gets the source: NOTE: it is the NFD form of the source originally passed in 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.4 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String getSource() { 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return source; 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Resets the iterator so that one can start again from the beginning. 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.4 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void reset() { 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert done = false; 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < current.length; ++i) { 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert current[i] = 0; 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Get the next canonically equivalent string. 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the next string that is canonically equivalent. The value null is returned when 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the iteration is done. 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.4 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String next() { 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (done) return null; 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // construct return value 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.setLength(0); // delete old contents 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < pieces.length; ++i) { 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(pieces[i][current[i]]); 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String result = buffer.toString(); 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // find next value for next time 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = current.length - 1; ; --i) { 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (i < 0) { 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert done = true; 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert current[i]++; 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (current[i] < pieces[i].length) break; // got sequence 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert current[i] = 0; 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set a new source for this iterator. Allows object reuse. 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newSource the source string to iterate against. This allows the same iterator to be used 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * while changing the source string, saving object creation. 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.4 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setSource(String newSource) { 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert source = nfd.normalize(newSource); 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert done = false; 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // catch degenerate case 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (newSource.length() == 0) { 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pieces = new String[1][]; 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert current = new int[1]; 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pieces[0] = new String[]{""}; 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // find the segments 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert List<String> segmentList = new ArrayList<String>(); 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int start = 0; 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // i should be the end of the first code point 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // break up the string into segements 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i = UTF16.findOffsetFromCodePoint(source, 1); 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (; i < source.length(); i += Character.charCount(cp)) { 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = source.codePointAt(i); 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (nfcImpl.isCanonSegmentStarter(cp)) { 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentList.add(source.substring(start, i)); // add up to i 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i; 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentList.add(source.substring(start, i)); // add last one 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // allocate the arrays, and find the strings that are CE to each segment 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pieces = new String[segmentList.size()][]; 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert current = new int[segmentList.size()]; 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (i = 0; i < pieces.length; ++i) { 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("SEGMENT"); 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pieces[i] = getEquivalents(segmentList.get(i)); 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Simple implementation of permutation. 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source the string to find permutations for 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param skipZeros set to true to skip characters with canonical combining class zero 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param output the set to add the results to 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @internal 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated This API is ICU internal only. 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static void permute(String source, boolean skipZeros, Set<String> output) { 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: optimize 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //if (PROGRESS) System.out.println("Permute: " + source); 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // optimization: 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if zero or one character, just return a set with it 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we check for length < 2 to keep from counting code points all the time 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) { 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert output.add(source); 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise iterate through the string, and recursively permute all the other characters 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set<String> subpermute = new HashSet<String>(); 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = UTF16.charAt(source, i); 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // optimization: 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if the character is canonical combining class zero, 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // don't permute it 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) { 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // see what the permutations of the characters before and after this one are 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert subpermute.clear(); 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert permute(source.substring(0,i) 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute); 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // prefix this character to all of them 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String chStr = UTF16.valueOf(source, i); 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (String s : subpermute) { 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String piece = chStr + s; 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //if (PROGRESS) System.out.println(" Piece: " + piece); 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert output.add(piece); 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FOR TESTING 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition. 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static UnicodeSet getSafeStart() { 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) SAFE_START.clone(); 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *@return the set of characters whose decompositions start with the given character 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static UnicodeSet getStarts(int cp) { 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSet result = AT_START.get(cp); 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result == null) result = EMPTY; 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) result.clone(); 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ===================== PRIVATES ============================== 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // debug 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean PROGRESS = false; // debug progress 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null; 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean SKIP_ZEROS = true; 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fields 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final Normalizer2 nfd; 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final Normalizer2Impl nfcImpl; 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String source; 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean done; 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String[][] pieces; 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int[] current; 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: C will need two more fields, since arrays there don't have lengths 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // int pieces_length; 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // int[] pieces_lengths; 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // transient fields 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private transient StringBuilder buffer = new StringBuilder(); 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String[] getEquivalents(String segment) { 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set<String> result = new HashSet<String>(); 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set<String> basic = getEquivalents2(segment); 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set<String> permutations = new HashSet<String>(); 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // now get all the permutations 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // add only the ones that are canonically equivalent 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: optimize by not permuting any class zero. 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<String> it = basic.iterator(); 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (it.hasNext()) { 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String item = it.next(); 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert permutations.clear(); 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert permute(item, SKIP_ZEROS, permutations); 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<String> it2 = permutations.iterator(); 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (it2.hasNext()) { 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String possible = it2.next(); 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0); 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (attempt.equals(segment)) { 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Normalizer.compare(possible, segment,0)==0) { 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible)); 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.add(possible); 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible)); 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // convert into a String[] to clean up storage 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String[] finalResult = new String[result.size()]; 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.toArray(finalResult); 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return finalResult; 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Set<String> getEquivalents2(String segment) { 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set<String> result = new HashSet<String>(); 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment)); 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.add(segment); 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer workingBuffer = new StringBuffer(); 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSet starts = new UnicodeSet(); 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // cycle through all the characters 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < segment.length(); i += Character.charCount(cp)) { 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // see if any character is at the start of some decomposition 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = segment.codePointAt(i); 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!nfcImpl.getCanonStartSet(cp, starts)) { 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if so, see which decompositions match 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) { 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp2 = iter.codepoint; 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set<String> remainder = extract(cp2, segment, i, workingBuffer); 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (remainder == null) { 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // there were some matches, so add all the possibilities to the set. 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String prefix= segment.substring(0,i); 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prefix += UTF16.valueOf(cp2); 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (String item : remainder) { 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.add(prefix + item); 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set result = new HashSet(); 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment)); 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.add(segment); 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuffer workingBuffer = new StringBuffer(); 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // cycle through all the characters 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) { 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // see if any character is at the start of some decomposition 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = UTF16.charAt(segment, i); 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert NormalizerImpl.getCanonStartSet(c,fillSet) 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSet starts = AT_START.get(cp); 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (starts == null) continue; 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSetIterator usi = new UnicodeSetIterator(starts); 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if so, see which decompositions match 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (usi.next()) { 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp2 = usi.codepoint; 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we know that there are no strings in it 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // so we don't have to check CharacterIterator.IS_STRING 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Set remainder = extract(cp2, segment, i, workingBuffer); 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (remainder == null) continue; 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // there were some matches, so add all the possibilities to the set. 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String prefix = segment.substring(0, i) + UTF16.valueOf(cp2); 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator it = remainder.iterator(); 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (it.hasNext()) { 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String item = (String) it.next(); 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item)); 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.add(prefix + item); 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * See if the decomposition of cp2 is at segment starting at segmentPos 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (with canonical rearrangment!) 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If so, take the remainder, and return the equivalents 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) { 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp)) 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert + ", " + Utility.hex(segment.substring(segmentPos))); 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String decomp = nfcImpl.getDecomposition(comp); 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (decomp == null) { 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decomp = UTF16.valueOf(comp); 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // See if it matches the start of segment (at segmentPos) 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean ok = false; 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cp; 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int decompPos = 0; 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int decompCp = UTF16.charAt(decomp,0); 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //int decompClass = getClass(decompCp); 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buf.setLength(0); // initialize working buffer, shared among callees 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) { 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp = UTF16.charAt(segment, i); 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cp == decompCp) { // if equal, eat another cp from decomp 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp))); 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (decompPos == decomp.length()) { // done, have all decomp characters! 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ok = true; 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompCp = UTF16.charAt(decomp, decompPos); 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompPos += UTF16.getCharCount(decompCp); 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //decompClass = getClass(decompCp); 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp))); 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // brute force approach 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16.append(buf, cp); 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* TODO: optimize 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // since we know that the classes are monotonically increasing, after zero 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // e.g. 0 5 7 9 0 3 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we can do an optimization 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // there are only a few cases that work: zero, less, same, greater 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if both classes are the same, we fail 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if the decomp class < the segment class, we fail 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segClass = getClass(cp); 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (decompClass <= segClass) return null; 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!ok) return null; // we failed, characters left over 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("Matches"); 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String remainder = buf.toString(); 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // brute force approach 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to check to make sure result is canonically equivalent 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0); 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null; 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null; 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // get the remaining combinations 4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return getEquivalents2(remainder); 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: fix once we have a codepoint interface to get the canonical combining class 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Need public access to canonical combining class in UCharacter! 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int getClass(int cp) { 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Normalizer.getClass((char)cp); 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ================= BUILDER ========================= 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Flatten this data so it doesn't have to be reconstructed each time! 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static { 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert SET_WITH_NULL_STRING.add(""); 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private static UnicodeSet SAFE_START = new UnicodeSet(); 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private static CharMap AT_START = new CharMap(); 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!; 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded. 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private static int LAST_UNICODE = 0x10FFFF; 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static { 4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buildData(); 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static void buildData() { 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("Getting Safe Start"); 4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cc = UCharacter.getCombiningClass(cp); 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cc == 0) SAFE_START.add(cp); 4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // will fix to be really safe below 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println(); 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println("Getting Containment"); 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue; 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //String istr = UTF16.valueOf(cp); 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String decomp = Normalizer.normalize(cp, Normalizer.NFD); 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //if (decomp.equals(istr)) continue; 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // add each character in the decomposition to canBeIn 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int component; 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) { 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert component = UTF16.charAt(decomp, i); 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (i == 0) { 4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert AT_START.add(component, cp); 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (UCharacter.getCombiningClass(component) == 0) { 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert SAFE_START.remove(component); 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (PROGRESS) System.out.println(); 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the following is just for a map from characters to a set of characters 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static class CharMap { 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Map storage = new HashMap(); 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert MutableInt probe = new MutableInt(); 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean converted = false; 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void add(int cp, int whatItIsIn) { 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp)); 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result == null) { 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = new UnicodeSet(); 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert storage.put(probe, result); 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result.add(whatItIsIn); 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UnicodeSet get(int cp) { 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (UnicodeSet) storage.get(probe.set(cp)); 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static class MutableInt { 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int contents; 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int hashCode() { return contents; } 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean equals(Object other) { 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((MutableInt)other).contents == contents; 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // allows chaining 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public MutableInt set(int contents) { 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.contents = contents; 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 533