1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4********************************************************************** 5* Copyright (c) 2003-2012, International Business Machines 6* Corporation and others. All Rights Reserved. 7********************************************************************** 8* Author: Alan Liu 9* Created: February 11 2003 10* Since: ICU 2.6 11********************************************************************** 12*/ 13package com.ibm.icu.dev.tool.translit; 14 15import java.io.FileOutputStream; 16import java.io.IOException; 17import java.io.PrintStream; 18import java.util.Collections; 19import java.util.Comparator; 20import java.util.HashMap; 21import java.util.Iterator; 22import java.util.Locale; 23import java.util.Map; 24import java.util.Set; 25import java.util.TreeSet; 26import java.util.Vector; 27 28import com.ibm.icu.impl.Utility; 29import com.ibm.icu.lang.UCharacter; 30import com.ibm.icu.text.BreakIterator; 31import com.ibm.icu.text.UTF16; 32import com.ibm.icu.text.UnicodeSet; 33 34/** 35 * This class produces the data tables used by the closeOver() method 36 * of UnicodeSet. 37 * 38 * Whenever the Unicode database changes, this tool must be re-run 39 * (AFTER the data file(s) underlying ICU4J are udpated). 40 * 41 * The output of this tool should then be pasted into the appropriate 42 * files: 43 * 44 * ICU4J: com.ibm.icu.text.UnicodeSet.java 45 * ICU4C: /icu/source/common/uniset.cpp 46 */ 47class UnicodeSetCloseOver { 48 49 // Our output files 50 static final String JAVA_OUT = "to_UnicodeSet.java"; 51 static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java"; 52 static final String C_SET_OUT = "to_uniset.cpp"; 53 static final String C_UCHAR_OUT = "to_uchar.c"; 54 55 // Source code "do not edit" warning 56 static final String WARNING = "MACHINE-GENERATED; Unicode version " + 57 UCharacter.getUnicodeVersion() + 58 "; DO NOT EDIT; See " + 59 UnicodeSetCloseOver.class.getName(); 60 61 // Case folding options flag. This must correspond to the options 62 // used in UnicodeSet.closeOver() in Java and C++. 63 static final boolean DEFAULT_CASE_MAP = true; // false for Turkish 64 65 public static void main(String[] args) throws IOException { 66 System.out.println("This tool will generate several output files. Each is named according"); 67 System.out.println("the target file. For example, the contents of to_UnicodeSet.java should"); 68 System.out.println("be pasted into UnicodeSet.java."); 69 System.out.println(); 70 71 generateCaseData(); 72 } 73 74 /** 75 * Create a map of String => Set. The String in this case is a 76 * folded string for which 77 * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded). 78 * The Set contains all single-character strings x for which 79 * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as 80 * well as folded itself. 81 */ 82 static Map createCaseFoldEquivalencyClasses() { 83 Map equivClasses = new HashMap(); 84 for (int i = 0; i <= 0x10FFFF; ++i) { 85 int cat = UCharacter.getType(i); 86 if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE) 87 continue; 88 89 String cp = UTF16.valueOf(i); 90 String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); 91 if (folded.equals(cp)) continue; 92 93 // At this point, have different case folding. Add 94 // the code point and its folded equivalent into the 95 // equivalency class. 96 TreeSet s = (TreeSet) equivClasses.get(folded); 97 if (s == null) { 98 s = new TreeSet(); 99 s.add(folded); // add the case fold result itself 100 equivClasses.put(folded, s); 101 } 102 s.add(cp); 103 } 104 return equivClasses; 105 } 106 107 /** 108 * Analyze the case fold equivalency classes. Break them into two 109 * groups: 'pairs', and 'nonpairs'. Create a tally of the length 110 * configurations of the nonpairs. 111 * 112 * Length configurations of equivalency classes, as of Unicode 113 * 3.2. Most of the classes (83%) have two single codepoints. 114 * Here "112:28" means there are 28 equivalency classes with 2 115 * single codepoints and one string of length 2. 116 * 117 * 11:656 118 * 111:16 119 * 1111:3 120 * 112:28 121 * 113:2 122 * 12:31 123 * 13:12 124 * 22:38 125 * 126 * Note: This method does not count the frequencies of the 127 * different length configurations (as shown above after ':'); it 128 * merely records which configurations occur. 129 * 130 * @param pairs Accumulate equivalency classes that consist of 131 * exactly two codepoints here. This is 83+% of the classes. 132 * E.g., {"a", "A"}. 133 * @param nonpairs Accumulate other equivalency classes here, as 134 * lists of strings. E,g, {"st", "\uFB05", "\uFB06"}. 135 * @param lengths Accumulate a list of unique length structures, 136 * not including pairs. Each length structure is represented by a 137 * string of digits. The digit string "12" means the equivalency 138 * class contains a single code point and a string of length 2. 139 * Typical contents of 'lengths': { "111", "1111", "112", 140 * "113", "12", "13", "22" }. Note the absence of "11". 141 */ 142 static void analyzeCaseData(Map equivClasses, 143 StringBuffer pairs, 144 Vector nonpairs, 145 Vector lengths) { 146 Iterator i = new TreeSet(equivClasses.keySet()).iterator(); 147 StringBuffer buf = new StringBuffer(); 148 while (i.hasNext()) { 149 Object key = i.next(); 150 Vector v = new Vector((Set) equivClasses.get(key)); 151 if (v.size() == 2) { 152 String a = (String) v.elementAt(0); 153 String b = (String) v.elementAt(1); 154 if (a.length() == 1 && b.length() == 1) { 155 pairs.append(a).append(b); 156 continue; 157 // Note that pairs are included in 'lengths' 158 } 159 } 160 String[] a = new String[v.size()]; 161 v.toArray(a); 162 nonpairs.add(a); 163 //int singleCount = 0; 164 //int stringCount = 0; 165 // Make a string of the lengths, e.g., "111" means 3 166 // single code points; "13" means a single code point 167 // and a string of length 3. 168 v.clear(); 169 for (int j=0; j<a.length; ++j) { 170 v.add(new Integer(a[j].length())); 171 } 172 Collections.sort(v); 173 buf.setLength(0); 174 for (int j=0; j<v.size(); ++j) { 175 buf.append(String.valueOf(v.elementAt(j))); 176 } 177 if (!lengths.contains(buf.toString())) { 178 lengths.add(buf.toString()); 179 } 180 } 181 } 182 183 @SuppressWarnings("resource") 184 static void generateCaseData() throws IOException { 185 186 Map equivClasses = createCaseFoldEquivalencyClasses(); 187 188 // Accumulate equivalency classes that consist of exactly 189 // two codepoints here. This is 83+% of the classes. 190 // E.g., {"a", "A"}. 191 StringBuffer pairs = new StringBuffer(); 192 193 // Accumulate other equivalency classes here, as lists 194 // of strings. E,g, {"st", "\uFB05", "\uFB06"}. 195 Vector nonpairs = new Vector(); // contains String[] 196 Vector lengths = new Vector(); // "111", "12", "22", etc. 197 198 analyzeCaseData(equivClasses, pairs, nonpairs, lengths); 199 200 //------------------------------------------------------------- 201 // Emit Java source 202 PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT)); 203 System.out.println("Writing " + JAVA_OUT); 204 205 out.println(" // " + WARNING); 206 out.println(" private static final String CASE_PAIRS ="); 207 out.println(Utility.formatForSource(pairs.toString()) + ";"); 208 out.println(); 209 out.println(" // " + WARNING); 210 out.println(" private static final String[][] CASE_NONPAIRS = {"); 211 for (int j=0; j<nonpairs.size(); ++j) { 212 String[] a = (String[]) nonpairs.elementAt(j); 213 out.print(" {"); 214 for (int k=0; k<a.length; ++k) { 215 if (k != 0) out.print(", "); 216 out.print(Utility.format1ForSource(a[k])); 217 } 218 out.println("},"); 219 } 220 out.println(" };"); 221 222 //------------------------------------------------------------- 223 // Emit C++ source 224 225 // In C++, the pairs are again emitted in an array, but this 226 // array is the final representation form -- it will not be 227 // reprocessed into a hash. It will be binary searched by 228 // looking at the even elements [0], [2], [4], etc., and 229 // ignoring the odd elements. The even elements must contain 230 // the folded members of the pairs. That is, in the pair 231 // {'A', 'a'}, the even element must be 'a', not 'A'. Then a 232 // code point to be located is first folded ('Y' => 'y') then 233 // it binary searched against [0]='A', [2]='B', etc. When a 234 // match is found at k, the pair is [k], [k+1]. 235 236 out = new PrintStream(new FileOutputStream(C_SET_OUT)); 237 System.out.println("Writing " + C_SET_OUT); 238 239 // Sort the pairs. They must be ordered by the folded element. 240 // Store these as two-character strings, with charAt(0) being 241 // the folded member of the pair. 242 TreeSet sortPairs = new TreeSet(new Comparator() { 243 public int compare(Object a, Object b) { 244 return ((int) ((String) a).charAt(0)) - 245 ((int) ((String) b).charAt(0)); 246 } 247 public boolean equals(Object obj) { 248 return false; 249 } 250 }); 251 for (int i=0; i<pairs.length(); i+=2) { 252 String a = String.valueOf(pairs.charAt(i)); 253 String b = String.valueOf(pairs.charAt(i+1)); 254 String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP); 255 if (a.equals(folded)) { 256 sortPairs.add(a + b); 257 } else { 258 sortPairs.add(b + a); 259 } 260 } 261 262 // Emit the pairs 263 out.println("// " + WARNING); 264 out.println("static const UChar CASE_PAIRS[] = {"); 265 Iterator it = sortPairs.iterator(); 266 while (it.hasNext()) { 267 out.print(" "); 268 int n = 0; 269 while (n++ < 5 && it.hasNext()) { 270 String s = (String) it.next(); 271 //out.print((int) s.charAt(0) + "," + 272 // (int) s.charAt(1) + ","); 273 out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" + 274 Utility.hex(s.charAt(1)) + ","); 275 } 276 out.println(); 277 } 278 out.println("};"); 279 out.println(); 280 281 // The non-pairs are encoded in the following way. All the 282 // single codepoints in each class are grouped together 283 // followed by a zero. Then each multi-character string is 284 // added, followed by a zero. Finally, another zero is added. 285 // Some examples: 286 // {"iQ", "R"} => [ 'R', 0, 'i', 'Q', 0, 0 ] 287 // {"S", "D", "F", "G"} => [ 'S', 'D', 'F', 'G', 0, 0 ] 288 // {"jW", "jY"} => [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ] 289 // The end-result is a short, flat array of UChar values that 290 // can be used to initialize a UChar[] array in C. 291 292 int maxLen = 0; // Maximum encoded length of any class, including zeros 293 out.println("// " + WARNING); 294 out.println("static const CaseEquivClass CASE_NONPAIRS[] = {"); 295 for (int j=0; j<nonpairs.size(); ++j) { 296 int len = 0; 297 String[] a = (String[]) nonpairs.elementAt(j); 298 out.print(" {"); 299 // Emit single code points 300 for (int k=0; k<a.length; ++k) { 301 if (a[k].length() != 1) continue; 302 //out.print((int) a[k].charAt(0) + ","); 303 out.print("0x"+Utility.hex(a[k].charAt(0)) + ","); 304 ++len; 305 } 306 out.print("0, "); // End of single code points 307 ++len; 308 // Emit multi-character strings 309 for (int k=0; k<a.length; ++k) { 310 if (a[k].length() == 1) continue; 311 for (int m=0; m<a[k].length(); ++m) { 312 //out.print((int) a[k].charAt(m) + ","); 313 out.print("0x"+Utility.hex(a[k].charAt(m)) + ","); 314 ++len; 315 } 316 out.print("0, "); // End of string 317 ++len; 318 } 319 out.println("0},"); // End of equivalency class 320 ++len; 321 if (len > maxLen) maxLen = len; 322 } 323 out.println("};"); 324 325 // Make sure the CaseEquivClass data can fit. 326 if (maxLen > 8) { 327 throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars"); 328 } 329 330 // Also make sure that we can map into this array using a 331 // CompactByteArray. We could do this check above, but we 332 // keep it here, adjacent to the maxLen check. We use one 333 // value (-1 == 255) to indicate "no value." 334 if (nonpairs.size() > 255) { 335 throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray"); 336 } 337 338 //------------------------------------------------------------- 339 // Case-unique set: All characters c for which closeOver(c)==c. 340 341 // UPDATE: Instead of using this, we're using the related 342 // notion of Case_Sensitive. See below. Note that 343 // Case_Sensitive != ^Case_Unique. 344 345 if (false) { 346 UnicodeSet caseUnique = new UnicodeSet(); 347 for (int i = 0; i <= 0x10FFFF; ++i) { 348 String cp = UTF16.valueOf(i); 349 if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) { 350 caseUnique.add(i); 351 } 352 } 353 // out.println("caseUnique = " + caseUnique.toPattern(true)); 354 } 355 356 UnicodeSet caseSensitive = getCaseSensitive(); 357 //System.out.println("caseSensitive = " + caseSensitive.toPattern(true)); 358 359 // Now for C, emit an array of ranges 360 out = new PrintStream(new FileOutputStream(C_UCHAR_OUT)); 361 System.out.println("Writing " + C_UCHAR_OUT); 362 363 out.println("/* " + WARNING + " */"); 364 emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES"); 365 366 // For Java, emit a string with the ranges (each pair of chars 367 // in the string is a range). 368 out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT)); 369 System.out.println("Writing " + JAVA_CHARPROP_OUT); 370 out.println(" // " + WARNING); 371 emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES"); 372 } 373 374 /** 375 * Create the set of case-sensitive characters. These are characters 376 * that participate in any case mapping operation as a source or 377 * as a member of a target string. 378 */ 379 static UnicodeSet getCaseSensitive() { 380 UnicodeSet caseSensitive = new UnicodeSet(); 381 Locale loc = Locale.US; 382 BreakIterator bi = BreakIterator.getTitleInstance(loc); 383 for (int c = 0; c <= 0x10FFFF; ++c) { 384 String cp = UTF16.valueOf(c); 385 for (int j=0; j<4; ++j) { 386 String s = null; 387 switch (j) { 388 case 0: s = UCharacter.toUpperCase(loc, cp); break; 389 case 1: s = UCharacter.toLowerCase(loc, cp); break; 390 case 2: s = UCharacter.toTitleCase(loc, cp, bi); break; 391 case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break; 392 } 393 if (!s.equals(cp)) { 394 int cc; 395 for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) { 396 cc = UTF16.charAt(s, k); 397 caseSensitive.add(cc); 398 } 399 for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) { 400 cc = UTF16.charAt(cp, k); 401 caseSensitive.add(cc); 402 } 403 } 404 } 405 // Also do the single-codepoint API. This shouldn't add any 406 // code points, but we do it for completeness. 407 for (int j=0; j<4; ++j) { 408 int d = 0; 409 switch (j) { 410 case 0: d = UCharacter.toUpperCase(c); break; 411 case 1: d = UCharacter.toLowerCase(c); break; 412 case 2: d = UCharacter.toTitleCase(c); break; 413 case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break; 414 } 415 if (d != c) { 416 if (!caseSensitive.contains(c) || 417 !caseSensitive.contains(d)) { 418 System.out.println("Warning: code point " + c + 419 " => " + d + " created NEW MAPPING"+ 420 " for Case_Sensitive"); 421 } 422 caseSensitive.add(c); 423 caseSensitive.add(d); 424 } 425 } 426 } 427 return caseSensitive; 428 } 429 430 /** 431 * Given a UnicodeSet, emit it as an array of UChar pairs. Each 432 * pair will be the start/end of a range. Code points >= U+10000 433 * will be represented as surrogate pairs. 434 */ 435 static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) { 436 // Store the pairs in a StringBuffer. This handles surrogate 437 // representation. 438 StringBuffer buf = new StringBuffer(); 439 for (int i=0; i<set.getRangeCount(); ++i) { 440 UTF16.append(buf, set.getRangeStart(i)); 441 UTF16.append(buf, set.getRangeEnd(i)); 442 } 443 // Emit the pairs 444 out.println("static const UChar " + id + "[] = {"); 445 for (int i=0; i<buf.length(); ) { 446 out.print(" "); 447 for (int n=0; n++<10 && i<buf.length(); ++i) { 448 out.print("0x" + Utility.hex(buf.charAt(i), 4) + ','); 449 } 450 out.println(); 451 } 452 out.println("};"); 453 out.println("#define " + id + "_LENGTH (sizeof(" + id + 454 ")/sizeof(" + id + "[0]))"); 455 } 456 457 /** 458 * Given a UnicodeSet, emit it as a Java string. The most economical 459 * format is not the pattern, but instead a pairs list, with each 460 * range pair represented as two adjacent characters. 461 */ 462 static void emitRangesString(PrintStream out, UnicodeSet set, String id) { 463 // Store the pairs in a StringBuffer. This handles surrogate 464 // representation. 465 StringBuffer buf = new StringBuffer(); 466 for (int i=0; i<set.getRangeCount(); ++i) { 467 UTF16.append(buf, set.getRangeStart(i)); 468 UTF16.append(buf, set.getRangeEnd(i)); 469 } 470 // Emit the pairs 471 out.println(" private static final String " + id + " ="); 472 out.println(Utility.formatForSource(buf.toString()) + ";"); 473 } 474} 475