// CaseIterator.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
/**
 *******************************************************************************
 * Copyright (C) 1996-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

// copied from the Transliterator demo

package com.ibm.icu.dev.util;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * Incrementally returns the set of all strings that case-fold to the same value.
 *
 * <p>Usage: call {@link #reset(String)} with a source string, then call {@link #next()}
 * repeatedly until it returns {@code null}.
 *
 * <p>The lookup tables are built once, in the static initializer, by folding every assigned,
 * non-private-use code point and merging in the hand-pasted {@code exceptionList}.
 *
 * <p>An instance keeps mutable iteration state (the current source pieces, the odometer
 * positions and a reusable buffer) with no synchronization.
 */
public class CaseIterator {

    // testing stuff
    private static final Transliterator toName = Transliterator.getInstance("[:^ascii:] Any-Name");
    private static final Transliterator toHex = Transliterator.getInstance("[:^ascii:] Any-Hex");
    private static final Transliterator toHex2 = Transliterator.getInstance("[[^\u0021-\u007F]-[,]] Any-Hex");

    // global tables (could be precompiled)

    /** Maps a case-folded string to every string known to fold to it (the folded string included). */
    private static final Map<String, String[]> fromCaseFold = new HashMap<String, String[]>();

    /** Maps a single code point, or a folded string, to its case-folded form. */
    private static final Map<String, String> toCaseFold = new HashMap<String, String>();

    /** Longest case-fold result seen, in UTF-16 code units; bounds the longest-match search. */
    private static int maxLength = 0;

    // This exception list is generated on the console by turning on the GENERATE flag,
    // which MUST be false for normal operation.
    // Once the list is generated, it is pasted in here.
    // A bit of a kludge, but this bootstrapping is the easiest way
    // to get around certain complications in the data.

    private static final boolean GENERATE = false;

    private static final boolean DUMP = false;

    // Each row: element 0 is the case-folded key, the remaining elements are its equivalents.
    private static final String[][] exceptionList = {
        // a\N{MODIFIER LETTER RIGHT HALF RING}
        {"a\u02BE","A\u02BE","a\u02BE",},
        // ff
        {"ff","FF","Ff","fF","ff",},
        // ffi
        {"ffi","FFI","FFi","FfI","Ffi","F\uFB01","fFI","fFi","ffI","ffi","f\uFB01","\uFB00I","\uFB00i",},
        // ffl
        {"ffl","FFL","FFl","FfL","Ffl","F\uFB02","fFL","fFl","ffL","ffl","f\uFB02","\uFB00L","\uFB00l",},
        // fi
        {"fi","FI","Fi","fI","fi",},
        // fl
        {"fl","FL","Fl","fL","fl",},
        // h\N{COMBINING MACRON BELOW}
        {"h\u0331","H\u0331","h\u0331",},
        // i\N{COMBINING DOT ABOVE}
        {"i\u0307","I\u0307","i\u0307",},
        // j\N{COMBINING CARON}
        {"j\u030C","J\u030C","j\u030C",},
        // ss
        {"ss","SS","Ss","S\u017F","sS","ss","s\u017F","\u017FS","\u017Fs","\u017F\u017F",},
        // st
        {"st","ST","St","sT","st","\u017FT","\u017Ft",},
        // t\N{COMBINING DIAERESIS}
        {"t\u0308","T\u0308","t\u0308",},
        // w\N{COMBINING RING ABOVE}
        {"w\u030A","W\u030A","w\u030A",},
        // y\N{COMBINING RING ABOVE}
        {"y\u030A","Y\u030A","y\u030A",},
        // \N{MODIFIER LETTER APOSTROPHE}n
        {"\u02BCn","\u02BCN","\u02BCn",},
        // \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
        {"\u03AC\u03B9","\u0386\u0345","\u0386\u0399","\u0386\u03B9","\u0386\u1FBE","\u03AC\u0345","\u03AC\u0399","\u03AC\u03B9","\u03AC\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
        {"\u03AE\u03B9","\u0389\u0345","\u0389\u0399","\u0389\u03B9","\u0389\u1FBE","\u03AE\u0345","\u03AE\u0399","\u03AE\u03B9","\u03AE\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}
        {"\u03B1\u0342","\u0391\u0342","\u03B1\u0342",},
        // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u03B1\u0342\u03B9","\u0391\u0342\u0345","\u0391\u0342\u0399","\u0391\u0342\u03B9","\u0391\u0342\u1FBE",
            "\u03B1\u0342\u0345","\u03B1\u0342\u0399","\u03B1\u0342\u03B9","\u03B1\u0342\u1FBE","\u1FB6\u0345",
            "\u1FB6\u0399","\u1FB6\u03B9","\u1FB6\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}
        {"\u03B1\u03B9","\u0391\u0345","\u0391\u0399","\u0391\u03B9","\u0391\u1FBE","\u03B1\u0345","\u03B1\u0399","\u03B1\u03B9","\u03B1\u1FBE",},
        // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}
        {"\u03B7\u0342","\u0397\u0342","\u03B7\u0342",},
        // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u03B7\u0342\u03B9","\u0397\u0342\u0345","\u0397\u0342\u0399","\u0397\u0342\u03B9","\u0397\u0342\u1FBE",
            "\u03B7\u0342\u0345","\u03B7\u0342\u0399","\u03B7\u0342\u03B9","\u03B7\u0342\u1FBE","\u1FC6\u0345","\u1FC6\u0399",
            "\u1FC6\u03B9","\u1FC6\u1FBE",},
        // \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}
        {"\u03B7\u03B9","\u0397\u0345","\u0397\u0399","\u0397\u03B9","\u0397\u1FBE","\u03B7\u0345","\u03B7\u0399","\u03B7\u03B9","\u03B7\u1FBE",},
        // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
        {"\u03B9\u0308\u0300","\u0345\u0308\u0300","\u0399\u0308\u0300","\u03B9\u0308\u0300","\u1FBE\u0308\u0300",},
        // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
        {"\u03B9\u0308\u0301","\u0345\u0308\u0301","\u0399\u0308\u0301","\u03B9\u0308\u0301","\u1FBE\u0308\u0301",},
        // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
        {"\u03B9\u0308\u0342","\u0345\u0308\u0342","\u0399\u0308\u0342","\u03B9\u0308\u0342","\u1FBE\u0308\u0342",},
        // \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}
        {"\u03B9\u0342","\u0345\u0342","\u0399\u0342","\u03B9\u0342","\u1FBE\u0342",},
        // \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}
        {"\u03C1\u0313","\u03A1\u0313","\u03C1\u0313","\u03F1\u0313",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
        {"\u03C5\u0308\u0300","\u03A5\u0308\u0300","\u03C5\u0308\u0300",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
        {"\u03C5\u0308\u0301","\u03A5\u0308\u0301","\u03C5\u0308\u0301",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
        {"\u03C5\u0308\u0342","\u03A5\u0308\u0342","\u03C5\u0308\u0342",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}
        {"\u03C5\u0313","\u03A5\u0313","\u03C5\u0313",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}
        {"\u03C5\u0313\u0300","\u03A5\u0313\u0300","\u03C5\u0313\u0300","\u1F50\u0300",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}
        {"\u03C5\u0313\u0301","\u03A5\u0313\u0301","\u03C5\u0313\u0301","\u1F50\u0301",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}
        {"\u03C5\u0313\u0342","\u03A5\u0313\u0342","\u03C5\u0313\u0342","\u1F50\u0342",},
        // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}
        {"\u03C5\u0342","\u03A5\u0342","\u03C5\u0342",},
        // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}
        {"\u03C9\u0342","\u03A9\u0342","\u03C9\u0342","\u2126\u0342",},
        // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u03C9\u0342\u03B9","\u03A9\u0342\u0345","\u03A9\u0342\u0399","\u03A9\u0342\u03B9","\u03A9\u0342\u1FBE","\u03C9\u0342\u0345","\u03C9\u0342\u0399","\u03C9\u0342\u03B9","\u03C9\u0342\u1FBE","\u1FF6\u0345",
            "\u1FF6\u0399","\u1FF6\u03B9","\u1FF6\u1FBE","\u2126\u0342\u0345","\u2126\u0342\u0399","\u2126\u0342\u03B9","\u2126\u0342\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}
        {"\u03C9\u03B9","\u03A9\u0345","\u03A9\u0399","\u03A9\u03B9","\u03A9\u1FBE","\u03C9\u0345","\u03C9\u0399","\u03C9\u03B9","\u03C9\u1FBE","\u2126\u0345","\u2126\u0399","\u2126\u03B9","\u2126\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
        {"\u03CE\u03B9","\u038F\u0345","\u038F\u0399","\u038F\u03B9","\u038F\u1FBE","\u03CE\u0345","\u03CE\u0399","\u03CE\u03B9","\u03CE\u1FBE",},
        // \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}
        {"\u0565\u0582","\u0535\u0552","\u0535\u0582","\u0565\u0552","\u0565\u0582",},
        // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}
        {"\u0574\u0565","\u0544\u0535","\u0544\u0565","\u0574\u0535","\u0574\u0565",},
        // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}
        {"\u0574\u056B","\u0544\u053B","\u0544\u056B","\u0574\u053B","\u0574\u056B",},
        // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}
        {"\u0574\u056D","\u0544\u053D","\u0544\u056D","\u0574\u053D","\u0574\u056D",},
        // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}
        {"\u0574\u0576","\u0544\u0546","\u0544\u0576","\u0574\u0546","\u0574\u0576",},
        // \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}
        {"\u057E\u0576","\u054E\u0546","\u054E\u0576","\u057E\u0546","\u057E\u0576",},
        // \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F00\u03B9","\u1F00\u0345","\u1F00\u0399","\u1F00\u03B9","\u1F00\u1FBE","\u1F08\u0345","\u1F08\u0399","\u1F08\u03B9","\u1F08\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F01\u03B9","\u1F01\u0345","\u1F01\u0399","\u1F01\u03B9","\u1F01\u1FBE","\u1F09\u0345","\u1F09\u0399","\u1F09\u03B9","\u1F09\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F02\u03B9","\u1F02\u0345","\u1F02\u0399","\u1F02\u03B9","\u1F02\u1FBE","\u1F0A\u0345","\u1F0A\u0399","\u1F0A\u03B9","\u1F0A\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F03\u03B9","\u1F03\u0345","\u1F03\u0399","\u1F03\u03B9","\u1F03\u1FBE","\u1F0B\u0345","\u1F0B\u0399","\u1F0B\u03B9","\u1F0B\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F04\u03B9","\u1F04\u0345","\u1F04\u0399","\u1F04\u03B9","\u1F04\u1FBE","\u1F0C\u0345","\u1F0C\u0399","\u1F0C\u03B9","\u1F0C\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F05\u03B9","\u1F05\u0345","\u1F05\u0399","\u1F05\u03B9","\u1F05\u1FBE","\u1F0D\u0345","\u1F0D\u0399","\u1F0D\u03B9","\u1F0D\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F06\u03B9","\u1F06\u0345","\u1F06\u0399","\u1F06\u03B9","\u1F06\u1FBE","\u1F0E\u0345","\u1F0E\u0399","\u1F0E\u03B9","\u1F0E\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F07\u03B9","\u1F07\u0345","\u1F07\u0399","\u1F07\u03B9","\u1F07\u1FBE","\u1F0F\u0345","\u1F0F\u0399","\u1F0F\u03B9","\u1F0F\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F20\u03B9","\u1F20\u0345","\u1F20\u0399","\u1F20\u03B9","\u1F20\u1FBE","\u1F28\u0345","\u1F28\u0399","\u1F28\u03B9","\u1F28\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F21\u03B9","\u1F21\u0345","\u1F21\u0399","\u1F21\u03B9","\u1F21\u1FBE","\u1F29\u0345","\u1F29\u0399","\u1F29\u03B9","\u1F29\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F22\u03B9","\u1F22\u0345","\u1F22\u0399","\u1F22\u03B9","\u1F22\u1FBE","\u1F2A\u0345","\u1F2A\u0399","\u1F2A\u03B9","\u1F2A\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F23\u03B9","\u1F23\u0345","\u1F23\u0399","\u1F23\u03B9","\u1F23\u1FBE","\u1F2B\u0345","\u1F2B\u0399","\u1F2B\u03B9","\u1F2B\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F24\u03B9","\u1F24\u0345","\u1F24\u0399","\u1F24\u03B9","\u1F24\u1FBE","\u1F2C\u0345","\u1F2C\u0399","\u1F2C\u03B9","\u1F2C\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F25\u03B9","\u1F25\u0345","\u1F25\u0399","\u1F25\u03B9","\u1F25\u1FBE","\u1F2D\u0345","\u1F2D\u0399","\u1F2D\u03B9","\u1F2D\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F26\u03B9","\u1F26\u0345","\u1F26\u0399","\u1F26\u03B9","\u1F26\u1FBE","\u1F2E\u0345","\u1F2E\u0399","\u1F2E\u03B9","\u1F2E\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F27\u03B9","\u1F27\u0345","\u1F27\u0399","\u1F27\u03B9","\u1F27\u1FBE","\u1F2F\u0345","\u1F2F\u0399","\u1F2F\u03B9","\u1F2F\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F60\u03B9","\u1F60\u0345","\u1F60\u0399","\u1F60\u03B9","\u1F60\u1FBE","\u1F68\u0345","\u1F68\u0399","\u1F68\u03B9","\u1F68\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F61\u03B9","\u1F61\u0345","\u1F61\u0399","\u1F61\u03B9","\u1F61\u1FBE","\u1F69\u0345","\u1F69\u0399","\u1F69\u03B9","\u1F69\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F62\u03B9","\u1F62\u0345","\u1F62\u0399","\u1F62\u03B9","\u1F62\u1FBE","\u1F6A\u0345","\u1F6A\u0399","\u1F6A\u03B9","\u1F6A\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F63\u03B9","\u1F63\u0345","\u1F63\u0399","\u1F63\u03B9","\u1F63\u1FBE","\u1F6B\u0345","\u1F6B\u0399","\u1F6B\u03B9","\u1F6B\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F64\u03B9","\u1F64\u0345","\u1F64\u0399","\u1F64\u03B9","\u1F64\u1FBE","\u1F6C\u0345","\u1F6C\u0399","\u1F6C\u03B9","\u1F6C\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F65\u03B9","\u1F65\u0345","\u1F65\u0399","\u1F65\u03B9","\u1F65\u1FBE","\u1F6D\u0345","\u1F6D\u0399","\u1F6D\u03B9","\u1F6D\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F66\u03B9","\u1F66\u0345","\u1F66\u0399","\u1F66\u03B9","\u1F66\u1FBE","\u1F6E\u0345","\u1F6E\u0399","\u1F6E\u03B9","\u1F6E\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
        {"\u1F67\u03B9","\u1F67\u0345","\u1F67\u0399","\u1F67\u03B9","\u1F67\u1FBE","\u1F6F\u0345","\u1F6F\u0399","\u1F6F\u03B9","\u1F6F\u1FBE",},
        // \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F70\u03B9","\u1F70\u0345","\u1F70\u0399","\u1F70\u03B9","\u1F70\u1FBE","\u1FBA\u0345","\u1FBA\u0399","\u1FBA\u03B9","\u1FBA\u1FBE",},
        // \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F74\u03B9","\u1F74\u0345","\u1F74\u0399","\u1F74\u03B9","\u1F74\u1FBE","\u1FCA\u0345","\u1FCA\u0399","\u1FCA\u03B9","\u1FCA\u1FBE",},
        // \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
        {"\u1F7C\u03B9","\u1F7C\u0345","\u1F7C\u0399","\u1F7C\u03B9","\u1F7C\u1FBE","\u1FFA\u0345","\u1FFA\u0399","\u1FFA\u03B9","\u1FFA\u1FBE",},
    };

    // this initializes the data used to generate the case-equivalents

    static {
        // Working table: folded string -> set of equivalents. It is converted to arrays
        // (and stored in fromCaseFold) only once every source has contributed.
        Map<String, Set<String>> closureSets = new HashMap<String, Set<String>>();

        if (!GENERATE) {
            loadExceptions(closureSets);
        }
        collectFoldings(closureSets);
        if (DUMP) {
            dumpTables(closureSets);
        }
        freezeClosures(closureSets);

        // We have processed everything, so the iterator will now work.
        // The following is normally OFF.
        // It is here to generate (under the GENERATE flag) the static exception list.
        // It must be at the very end of initialization, so that the iterator is functional.
        if (GENERATE) {
            printExceptionList();
        }
    }

    /** Seeds {@code closureSets} with the pasted-in exception rows, keyed by each row's first element. */
    private static void loadExceptions(Map<String, Set<String>> closureSets) {
        for (String[] exception : exceptionList) {
            Set<String> equivalents = new HashSet<String>();
            for (String item : exception) {
                equivalents.add(item);
            }
            closureSets.put(exception[0], equivalents);
        }
    }

    /**
     * Walks through all the characters and, at every case-fold result, records the set of
     * characters that map to that result. Also fills {@code toCaseFold} and {@code maxLength}.
     */
    private static void collectFoldings(Map<String, Set<String>> closureSets) {
        final boolean defaultMapping = true; // false for turkish
        for (int i = 0; i <= 0x10FFFF; ++i) {
            // NOTE(review): compares UCharacter.getType() against java.lang.Character constants;
            // presumably the two numbering schemes agree for these categories -- confirm.
            int cat = UCharacter.getType(i);
            if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE) {
                continue;
            }

            String cp = UTF16.valueOf(i);
            String mapped = UCharacter.foldCase(cp, defaultMapping);
            if (mapped.equals(cp)) {
                continue;
            }

            if (maxLength < mapped.length()) {
                maxLength = mapped.length();
            }

            // at this point, have different case folding

            Set<String> equivalents = closureSets.get(mapped);
            if (equivalents == null) {
                equivalents = new HashSet<String>();
                equivalents.add(mapped); // add the case fold result itself
                closureSets.put(mapped, equivalents);
            }
            equivalents.add(cp);
            toCaseFold.put(cp, mapped);
            toCaseFold.put(mapped, mapped); // add mapping to self
        }
    }

    /** Prints {@code maxLength} and both tables to the console (only used under the DUMP flag). */
    private static void dumpTables(Map<String, Set<String>> closureSets) {
        System.out.println("maxLength = " + maxLength);

        System.out.println("\nfromCaseFold:");
        for (Map.Entry<String, Set<String>> entry : closureSets.entrySet()) {
            System.out.print(" " + toHex2.transliterate(entry.getKey()) + ": ");
            boolean first = true;
            for (String item : entry.getValue()) {
                if (first) {
                    first = false;
                } else {
                    System.out.print(", ");
                }
                System.out.print(toHex2.transliterate(item));
            }
            System.out.println("");
        }

        System.out.println("\ntoCaseFold:");
        for (Map.Entry<String, String> entry : toCaseFold.entrySet()) {
            System.out.println(" " + toHex2.transliterate(entry.getKey())
                    + ": " + toHex2.transliterate(entry.getValue()));
        }
    }

    /**
     * Converts every set in {@code closureSets} into a linear array and stores it in
     * {@code fromCaseFold}.
     */
    private static void freezeClosures(Map<String, Set<String>> closureSets) {
        // Note: This could be transformed into a single array, with offsets into it.
        // Might be best choice in C.
        for (Map.Entry<String, Set<String>> entry : closureSets.entrySet()) {
            Set<String> equivalents = entry.getValue();
            fromCaseFold.put(entry.getKey(), equivalents.toArray(new String[equivalents.size()]));
        }
    }

    /**
     * Prints, in source form, the exception rows for every multi-code-point fold result so that
     * they can be cut and pasted back into {@code exceptionList}. Only used under the GENERATE
     * flag, and only after {@code fromCaseFold} has been fully populated.
     */
    private static void printExceptionList() {

        // first get small set of items that have multiple characters

        Set<String> multichars = new TreeSet<String>();
        for (String key : fromCaseFold.keySet()) {
            if (UTF16.countCodePoint(key) >= 2) {
                multichars.add(key);
            }
        }

        // now we will go through each of them.

        CaseIterator ci = new CaseIterator();
        for (String key : multichars) {

            // here is a nasty complication. Take 'ffi' ligature. We
            // can't just close it, since we would miss the combination
            // that includes the 'fi' => "fi" ligature
            // so first do a pass through, and add substring combinations
            // we call this a 'partial closure'

            Set<String> partialClosure = new TreeSet<String>();
            partialClosure.add(key);

            if (UTF16.countCodePoint(key) > 2) {
                for (String otherKey : multichars) {
                    if (otherKey.length() >= key.length()) {
                        continue;
                    }
                    // The following is not completely general
                    // but works for the actual cased stuff,
                    // and should work for future characters, since we won't have
                    // more ligatures & other oddities.
                    for (int pos = key.indexOf(otherKey); pos >= 0; pos = key.indexOf(otherKey, pos + 1)) {
                        int endPos = pos + otherKey.length();
                        // we know we have a proper substring,
                        // so get the combinations
                        String[] choices = fromCaseFold.get(otherKey);
                        for (String choice : choices) {
                            partialClosure.add(key.substring(0, pos) + choice + key.substring(endPos));
                        }
                    }
                }
            }

            // now, for each thing in the partial closure, get its
            // case closure and add it to the final result.

            Set<String> closure = new TreeSet<String>(); // this will be the real closure
            for (String key2 : partialClosure) {
                ci.reset(key2);
                for (String temp = ci.next(); temp != null; temp = ci.next()) {
                    closure.add(temp);
                }
            }

            // print it out, so that it can be cut and pasted back into this document.

            System.out.println("\t// " + toName.transliterate(key));
            System.out.print("\t{\"" + toHex.transliterate(key) + "\",");
            for (String item : closure) {
                System.out.print("\"" + toHex.transliterate(item) + "\",");
            }
            System.out.println("},");
        }
    }

    // ============ PRIVATE CLASS DATA ============

    // pieces that we will put together
    // is not changed during iteration
    private int count = 0;
    private String[][] variants;

    // state information, changes during iteration
    private boolean done = false;
    private int[] counts;

    // internal buffer for efficiency
    private final StringBuilder nextBuffer = new StringBuilder();

    // ========================

    /**
     * Reset to different source. Once reset, the iteration starts from the beginning.
     *
     * @param source The string to get case variants for
     * @throws NullPointerException if {@code source} is null
     */
    public void reset(String source) {
        if (source == null) {
            throw new NullPointerException("source");
        }

        // allocate arrays to store pieces
        // using length might be slightly too long, but we don't care much

        final int length = source.length();
        counts = new int[length];
        variants = new String[length][];

        // walk through the source, and break up into pieces
        // each piece becomes an array of equivalent values
        // TODO: could optimize this later to coalesce all single string pieces

        count = 0;
        int i = 0;
        while (i < length) {
            String piece = null;
            String caseFold = null;

            if (GENERATE) {
                // do exactly one CP
                piece = UTF16.valueOf(source, i);
                caseFold = toCaseFold.get(piece);
            } else {
                // find *longest* matching piece
                int max = Math.min(i + maxLength, length);
                for (int j = max; j > i; --j) {
                    piece = source.substring(i, j);
                    caseFold = toCaseFold.get(piece);
                    if (caseFold != null) {
                        break;
                    }
                }
            }

            // if we fail, pick one code point
            if (caseFold == null) {
                piece = UTF16.valueOf(source, i);
                variants[count++] = new String[] {piece}; // single item string
            } else {
                variants[count++] = fromCaseFold.get(caseFold);
            }
            i += piece.length();
        }
        reset();
    }

    /**
     * Restart the iteration from the beginning, but with same source
     */
    public void reset() {
        done = false;
        for (int i = 0; i < count; ++i) {
            counts[i] = 0;
        }
    }

    /**
     * Iterates through the case variants.
     *
     * @return next case variant. Each variant will case-fold to the same value as the source will.
     * When the iteration is done, null is returned.
     */
    public String next() {
        if (done) {
            return null;
        }

        // TODO Optimize so we keep the piece before and after the current position
        // so we don't have so much concatenation

        // get the result, a concatenation

        nextBuffer.setLength(0);
        for (int i = 0; i < count; ++i) {
            nextBuffer.append(variants[i][counts[i]]);
        }

        // find the next right set of pieces to concatenate:
        // advance like an odometer, rightmost position first

        int pos = count - 1;
        while (pos >= 0) {
            counts[pos]++;
            if (counts[pos] < variants[pos].length) {
                break;
            }
            counts[pos] = 0;
            --pos;
        }

        // if we go too far (every position wrapped around), bail

        if (pos < 0) {
            done = true;
        }

        return nextBuffer.toString();
    }

    /**
     * Temporary test, just to see how the stuff works.
     */
    public static void main(String[] args) {
        String[] testCases = {"fiss", "h\u03a3"};
        CaseIterator ci = new CaseIterator();

        for (String item : testCases) {
            System.out.println();
            System.out.println("Testing: " + toName.transliterate(item));
            System.out.println();
            ci.reset(item);
            int count = 0;
            for (String temp = ci.next(); temp != null; temp = ci.next()) {
                System.out.println(toName.transliterate(temp));
                count++;
            }
            System.out.println("Total: " + count);
        }

        // generate a list of all caseless characters -- characters whose
        // case closure is themselves.

        UnicodeSet caseless = new UnicodeSet();

        for (int i = 0; i <= 0x10FFFF; ++i) {
            String cp = UTF16.valueOf(i);
            ci.reset(cp);
            int count = 0;
            String fold = null;
            // only need to know whether there is exactly one variant, so stop at the second
            for (String temp = ci.next(); temp != null; temp = ci.next()) {
                fold = temp;
                if (++count > 1) {
                    break;
                }
            }
            if (count == 1 && fold.equals(cp)) {
                caseless.add(i);
            }
        }

        System.out.println("caseless = " + caseless.toPattern(true));

        UnicodeSet not_lc = new UnicodeSet("[:^lc:]");

        UnicodeSet a = new UnicodeSet();
        a.set(not_lc);
        a.removeAll(caseless);
        System.out.println("[:^lc:] - caseless = " + a.toPattern(true));

        a.set(caseless);
        a.removeAll(not_lc);
        System.out.println("caseless - [:^lc:] = " + a.toPattern(true));
    }
}