1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 1996-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10package com.ibm.icu.dev.test.translit; 11import java.io.File; 12import java.io.FileOutputStream; 13import java.io.IOException; 14import java.io.OutputStreamWriter; 15import java.io.PrintWriter; 16import java.util.Enumeration; 17import java.util.Iterator; 18import java.util.Map; 19import java.util.Set; 20import java.util.TreeMap; 21import java.util.TreeSet; 22 23import com.ibm.icu.lang.UCharacter; 24import com.ibm.icu.lang.UScript; 25import com.ibm.icu.text.Normalizer; 26import com.ibm.icu.text.Transliterator; 27import com.ibm.icu.text.UTF16; 28import com.ibm.icu.text.UnicodeSet; 29import com.ibm.icu.text.UnicodeSetIterator; 30 31public class WriteCharts { 32 public static void main(String[] args) throws IOException { 33 if (false) { 34 printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]"); 35 } 36 String testSet = ""; 37 if (args.length == 0) args = getAllScripts(); 38 for (int i = 0; i < args.length; ++i) { 39 // Enumeration enum = Transliterator.getAvailableIDs(); 40 if (args[i].startsWith("[")) { 41 testSet = args[i]; 42 } else { 43 print(testSet, args[i]); 44 testSet = ""; 45 } 46 } 47 } 48 49 public static void printSet(String source) { 50 UnicodeSet s = new UnicodeSet(source); 51 System.out.println("Printout for '" + source + "'"); 52 int count = s.getRangeCount(); 53 for (int i = 0; i < count; ++i) { 54 int start = s.getRangeStart(i); 55 int end = s.getRangeEnd(i); 56 System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16)); 57 } 58 } 59 60 public static String[] getAllScripts() { 61 Set set = new TreeSet(); 62 int scripts[]; 63 Enumeration sources = Transliterator.getAvailableSources(); 64 while(sources.hasMoreElements()) { 65 String source = (String) sources.nextElement(); 66 scripts = UScript.getCode(source); 67 if (scripts == null) { 68 System.out.println("[Skipping " + source + "]"); 69 continue; 70 } 71 int sourceScript = scripts[0]; 72 System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts)); 73 Enumeration targets = Transliterator.getAvailableTargets(source); 74 while(targets.hasMoreElements()) { 75 String target = (String) targets.nextElement(); 76 scripts = UScript.getCode(target); 77 if (scripts == null 78 || priority(scripts[0]) < priority(sourceScript)) { 79 // skip doing both directions 80 System.out.println("[Skipping '" + source + "-" + target + "']"); 81 continue; 82 } 83 System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts)); 84 Enumeration variants = Transliterator.getAvailableVariants(source, target); 85 while(variants.hasMoreElements()) { 86 String variant = (String) variants.nextElement(); 87 String id = source + "-" + target; 88 if (variant.length() != 0) { 89 id += "/" + variant; 90 if (false) { 91 System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id); 92 continue; 93 } 94 } 95 System.out.println("\t\t\t\tAdding: '" + id + "'"); 96 set.add(id); 97 } 98 } 99 } 100 String[] results = new String[set.size()]; 101 set.toArray(results); 102 return results; 103 } 104 105 static public int priority(int script) { 106 if (script == UScript.LATIN) return -2; 107 return script; 108 } 109 110 public static String showScripts(int[] scripts) { 111 StringBuffer results = new StringBuffer(); 112 for (int i = 0; i < scripts.length; ++i) { 113 if (i != 0) results.append(", "); 114 results.append(UScript.getName(scripts[i])); 115 } 116 return results.toString(); 117 } 118 119 public static void print(String testSet, String rawId) throws IOException { 120 System.out.println("Processing " + rawId); 121 Transliterator t = Transliterator.getInstance(rawId); 122 String id = t.getID(); 123 124 // clean up IDs. Ought to be API for getting source, target, variant 125 int minusPos = id.indexOf('-'); 126 String source = id.substring(0,minusPos); 127 String target = id.substring(minusPos+1); 128 int slashPos = target.indexOf('/'); 129 if (slashPos >= 0) target = target.substring(0,slashPos); 130 131 // check that the source is a script 132 if (testSet.equals("")) { 133 int[] scripts = UScript.getCode(source); 134 if (scripts == null) { 135 System.out.println("FAILED: " 136 + Transliterator.getDisplayName(id) 137 + " does not have a script as the source"); 138 return; 139 } else { 140 testSet = "[:" + source + ":]"; 141 if (source.equalsIgnoreCase("katakana")) { 142 testSet = "[" + testSet + "\u30FC]"; 143 printSet(testSet); 144 } 145 } 146 } 147 UnicodeSet sourceSet = new UnicodeSet(testSet); 148 149 // check that the target is a script 150 int[] scripts = UScript.getCode(target); 151 if (scripts == null) { 152 target = "[:Latin:]"; 153 } else { 154 target = "[:" + target + ":]"; 155 } 156 UnicodeSet targetSet = new UnicodeSet(target); 157 158 Transliterator inverse = t.getInverse(); 159 160 //Transliterator hex = Transliterator.getInstance("Any-Hex"); 161 162 163 // iterate through script 164 System.out.println("Transliterating " + sourceSet.toPattern(true) 165 + " with " + Transliterator.getDisplayName(id)); 166 167 UnicodeSet leftOverSet = new UnicodeSet(targetSet); 168 UnicodeSet privateUse = new UnicodeSet("[:private use:]"); 169 170 Map map = new TreeMap(); 171 172 UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet); 173 targetSetPlusAnyways.addAll(okAnyway); 174 175 UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet); 176 sourceSetPlusAnyways.addAll(okAnyway); 177 178 UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet); 179 180 while (usi.next()) { 181 int j = usi.codepoint; 182 /* 183 int count = sourceSet.getRangeCount(); 184 for (int i = 0; i < count; ++i) { 185 int end = sourceSet.getRangeEnd(i); 186 for (int j = sourceSet.getRangeStart(i); j <= end; ++j) { 187 */ 188 // String flag = ""; 189 String ss = UTF16.valueOf(j); 190 String ts = t.transliterate(ss); 191 char group = 0; 192 if (!targetSetPlusAnyways.containsAll(ts)) { 193 group |= 1; 194 } 195 if (UTF16.countCodePoint(ts) == 1) { 196 leftOverSet.remove(UTF16.charAt(ts,0)); 197 } 198 String rt = inverse.transliterate(ts); 199 if (!sourceSetPlusAnyways.containsAll(rt)) { 200 group |= 2; 201 } else if (!ss.equals(rt)) { 202 group |= 4; 203 } 204 205 if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) { 206 group |= 16; 207 } 208 209 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD)) 210 + "\u0000" + ss, 211 "<td class='s'>" + ss + "<br><tt>" + hex(ss) 212 + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts) 213 + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" ); 214 215 // Check Duals 216 /* 217 int maxDual = 200; 218 dual: 219 for (int i2 = 0; i2 < count; ++i2) { 220 int end2 = sourceSet.getRangeEnd(i2); 221 for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) { 222 String ss2 = UTF16.valueOf(j2); 223 String ts2 = t.transliterate(ss2); 224 String rt2 = inverse.transliterate(ts2); 225 226 String ss12 = ss + ss2; 227 String ts12 = t.transliterate(ss + ss12); 228 String rt12 = inverse.transliterate(ts12); 229 if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue; 230 if (--maxDual < 0) break dual; 231 232 // transliteration of whole differs from that of parts 233 group = 0x100; 234 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0)) 235 + "\u0000" + ss12, 236 "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12) 237 + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12) 238 + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" ); 239 } 240 } 241 */ 242 //} 243 } 244 245 246 leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA 247 248 /*int count = leftOverSet.getRangeCount(); 249 for (int i = 0; i < count; ++i) { 250 int end = leftOverSet.getRangeEnd(i); 251 for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) { 252 */ 253 254 usi.reset(leftOverSet); 255 while (usi.next()) { 256 int j = usi.codepoint; 257 258 String ts = UTF16.valueOf(j); 259 // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0); 260 // if (!decomp.equals(ts)) continue; 261 262 String rt = inverse.transliterate(ts); 263 // String flag = ""; 264 char group = 0x80; 265 266 if (!sourceSetPlusAnyways.containsAll(rt)) { 267 group |= 8; 268 } 269 if (!privateUse.containsNone(rt)) { 270 group |= 16; 271 } 272 273 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts, 274 "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts) 275 + "</tt></td><td class='r'>" 276 + rt + "<br><tt>" + hex(rt) + "</tt></td>"); 277 //} 278 } 279 280 // make file name and open 281 File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html"); 282 String filename = f.getCanonicalFile().toString(); 283 PrintWriter out = new PrintWriter( 284 new OutputStreamWriter( 285 new FileOutputStream(filename), "UTF-8")); 286 //out.print('\uFEFF'); // BOM 287 288 System.out.println("Writing " + filename); 289 290 try { 291 out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">"); 292 out.println("<HTML><HEAD>"); 293 out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>"); 294 out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>"); 295 296 out.println("<BODY>"); 297 out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>"); 298 out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + "."); 299 out.println("The samples are mechanically generated, and only include single characters"); 300 out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration"); 301 out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the"); 302 out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>"); 303 304 // set up the headers 305 int columnCount = 3; 306 String headerBase = "<th>Source</th><th>Target</th><th>Return</th>"; 307 String headers = headerBase; 308 for (int i = columnCount - 1; i > 0; --i) { 309 if (i != columnCount - 1) headers += "<th> </th>"; 310 headers += headerBase; 311 } 312 313 String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>"; 314 String tableFooter = "</table></p>"; 315 out.println("<h2>Round Trip</h2>"); 316 out.println(tableHeader); 317 318 Iterator it = map.keySet().iterator(); 319 char lastGroup = 0; 320 int count = 0; 321 int column = 0; 322 while (it.hasNext()) { 323 String key = (String) it.next(); 324 char group = key.charAt(0); 325 if (group != lastGroup || count++ > 50) { 326 lastGroup = group; 327 count = 0; 328 if (column != 0) { 329 out.println("</tr>"); 330 column = 0; 331 } 332 out.println(tableFooter); 333 334 // String title = ""; 335 if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>"); 336 else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>"); 337 else out.println("<hr><h2>Round Trip</h2>"); 338 if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>"); 339 if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>"); 340 if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>"); 341 if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>"); 342 if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>"); 343 344 out.println(tableHeader); 345 column = 0; 346 } 347 String value = (String) map.get(key); 348 if (column++ == 0) out.print("<tr>"); 349 else out.print("<th> </th>"); 350 out.println(value); 351 if (column == 3) { 352 out.println("</tr>"); 353 column = 0; 354 } 355 } 356 if (column != 0) { 357 out.println("</tr>"); 358 column = 0; 359 } 360 out.println(tableFooter + "</BODY></HTML>"); 361 362 } finally { 363 out.close(); 364 } 365 } 366 367 public static String hex(String s) { 368 int cp; 369 StringBuffer results = new StringBuffer(); 370 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 371 cp = UTF16.charAt(s, i); 372 if (i != 0) results.append(' '); 373 results.append(Integer.toHexString(cp)); 374 } 375 return results.toString().toUpperCase(); 376 } 377 378 static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]"); 379 380 /* 381 // tests whether a string is in a set. Also checks for Common and Inherited 382 public static boolean isIn(String s, UnicodeSet set) { 383 int cp; 384 for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) { 385 cp = UTF16.charAt(s, i); 386 if (set.contains(cp)) continue; 387 if (okAnyway.contains(cp)) continue; 388 return false; 389 } 390 return true; 391 } 392 */ 393 394} 395