/*
 *******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.dev.test.translit;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;

/**
 * Command-line tool that writes one HTML chart per transliterator into the
 * <code>transliteration/</code> directory (relative to the working directory).
 *
 * <p>Each chart lists, for every single code point of a source set, the
 * transliterated form and the result of transliterating that form back with the
 * inverse transliterator. Rows are grouped by the kind of problem detected (see
 * the <code>GROUP_*</code> constants).
 *
 * <p>Arguments are transliterator IDs. An argument starting with <code>[</code> is
 * taken as a UnicodeSet pattern that overrides the source set of the next ID only.
 * With no arguments, every ID returned by {@link #getAllScripts()} is charted.
 */
public class WriteCharts {

    /** Number of Source/Target/Return column groups laid out side by side in a table row. */
    private static final int COLUMN_COUNT = 3;

    /** Maximum value of the per-section row counter before a fresh table is started. */
    private static final int MAX_ENTRIES_PER_TABLE = 50;

    // Bit flags stored in the first char of every map key. Because the keys are
    // sorted, entries with the same flags end up adjacent and form one chart section.
    /** Round trip: the target text contains letters outside the target set. */
    private static final int GROUP_TARGET_NOT_IN_TARGET = 1;
    /** Round trip: the returned text contains letters outside the source set. */
    private static final int GROUP_RETURN_NOT_IN_SOURCE = 2;
    /** Round trip: the returned text is in the source set but differs from the source. */
    private static final int GROUP_ONE_WAY = 4;
    /** Completeness: the returned text contains letters outside the source set. */
    private static final int GROUP_POSSIBLE_RETURN_NOT_IN_SOURCE = 8;
    /** Target or returned text contains private use characters. */
    private static final int GROUP_PRIVATE_USE = 16;
    /** Completeness section: target characters that no single source character produced. */
    private static final int GROUP_COMPLETENESS = 0x80;
    /** Duals section. Nothing in this class currently generates entries with this flag. */
    private static final int GROUP_DUALS = 0x100;

    /**
     * Entry point.
     *
     * @param args transliterator IDs, optionally preceded by a UnicodeSet pattern
     *             (starting with <code>[</code>) that applies to the next ID only;
     *             when empty, {@link #getAllScripts()} supplies the IDs
     * @throws IOException if a chart file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String testSet = "";
        if (args.length == 0) args = getAllScripts();
        for (int i = 0; i < args.length; ++i) {
            if (args[i].startsWith("[")) {
                // remember the explicit source set for the next ID
                testSet = args[i];
            } else {
                print(testSet, args[i]);
                testSet = "";
            }
        }
    }

    /**
     * Prints the ranges of a UnicodeSet to standard output, one
     * <code>start..end</code> pair (lowercase hex) per line.
     *
     * @param source the UnicodeSet pattern to parse and print
     */
    public static void printSet(String source) {
        UnicodeSet s = new UnicodeSet(source);
        System.out.println("Printout for '" + source + "'");
        int count = s.getRangeCount();
        for (int i = 0; i < count; ++i) {
            int start = s.getRangeStart(i);
            int end = s.getRangeEnd(i);
            System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16));
        }
    }

    /**
     * Collects the IDs (<code>source-target</code> or <code>source-target/variant</code>)
     * of the available transliterators whose source and target both resolve to script
     * codes. A pair is skipped when the target's first script has a lower
     * {@link #priority(int)} than the source's first script, so that only one direction
     * of a pair is charted. Progress is logged to standard output.
     *
     * @return the collected IDs in sorted order
     */
    public static String[] getAllScripts() {
        Set set = new TreeSet();
        Enumeration sources = Transliterator.getAvailableSources();
        while (sources.hasMoreElements()) {
            String source = (String) sources.nextElement();
            int[] sourceScripts = UScript.getCode(source);
            if (sourceScripts == null) {
                System.out.println("[Skipping " + source + "]");
                continue;
            }
            int sourceScript = sourceScripts[0];
            System.out.println("Source: " + source + ";\tScripts: " + showScripts(sourceScripts));
            Enumeration targets = Transliterator.getAvailableTargets(source);
            while (targets.hasMoreElements()) {
                String target = (String) targets.nextElement();
                int[] targetScripts = UScript.getCode(target);
                if (targetScripts == null
                        || priority(targetScripts[0]) < priority(sourceScript)) {
                    // skip doing both directions
                    System.out.println("[Skipping '" + source + "-" + target + "']");
                    continue;
                }
                System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(targetScripts));
                Enumeration variants = Transliterator.getAvailableVariants(source, target);
                while (variants.hasMoreElements()) {
                    String variant = (String) variants.nextElement();
                    String id = source + "-" + target;
                    if (variant.length() != 0) {
                        id += "/" + variant;
                    }
                    System.out.println("\t\t\t\tAdding: '" + id + "'");
                    set.add(id);
                }
            }
        }
        return (String[]) set.toArray(new String[set.size()]);
    }

    /**
     * Returns the ordering value used by {@link #getAllScripts()} to pick a direction.
     *
     * @param script a UScript code
     * @return -2 for Latin, otherwise the script code itself
     */
    static public int priority(int script) {
        if (script == UScript.LATIN) return -2;
        return script;
    }

    /**
     * Formats script codes as a comma-separated list of script names.
     *
     * @param scripts UScript codes
     * @return the names joined with ", "
     */
    public static String showScripts(int[] scripts) {
        StringBuffer results = new StringBuffer();
        for (int i = 0; i < scripts.length; ++i) {
            if (i != 0) results.append(", ");
            results.append(UScript.getName(scripts[i]));
        }
        return results.toString();
    }

    /**
     * Builds the chart for one transliterator and writes it to
     * <code>transliteration/chart_&lt;id&gt;.html</code>, with any '/' in the ID
     * replaced by '_'.
     *
     * <p>If the ID has no <code>source-target</code> form, or if no test set is given
     * and the source is not a script, a FAILED message is printed and nothing is written.
     *
     * @param testSet UnicodeSet pattern of the characters to transliterate, or the empty
     *                string to use the script named by the source part of the ID
     * @param rawId   the transliterator ID to instantiate
     * @throws IOException if the chart file cannot be created or written
     */
    public static void print(String testSet, String rawId) throws IOException {
        System.out.println("Processing " + rawId);
        Transliterator t = Transliterator.getInstance(rawId);
        String id = t.getID();

        // clean up IDs. Ought to be API for getting source, target, variant
        int minusPos = id.indexOf('-');
        if (minusPos < 0) {
            // without this guard the substring calls below throw StringIndexOutOfBoundsException
            System.out.println("FAILED: '" + id + "' is not of the form source-target");
            return;
        }
        String source = id.substring(0,minusPos);
        String target = id.substring(minusPos+1);
        int slashPos = target.indexOf('/');
        if (slashPos >= 0) target = target.substring(0,slashPos);

        // check that the source is a script
        if (testSet.equals("")) {
            if (UScript.getCode(source) == null) {
                System.out.println("FAILED: "
                    + Transliterator.getDisplayName(id)
                    + " does not have a script as the source");
                return;
            }
            testSet = "[:" + source + ":]";
            if (source.equalsIgnoreCase("katakana")) {
                // also chart U+30FC, which is explicitly added to the Katakana set here
                testSet = "[" + testSet + "\u30FC]";
                printSet(testSet);
            }
        }
        UnicodeSet sourceSet = new UnicodeSet(testSet);

        // check that the target is a script; fall back to Latin otherwise
        if (UScript.getCode(target) == null) {
            target = "[:Latin:]";
        } else {
            target = "[:" + target + ":]";
        }
        UnicodeSet targetSet = new UnicodeSet(target);

        Transliterator inverse = t.getInverse();

        // iterate through script
        System.out.println("Transliterating " + sourceSet.toPattern(true)
            + " with " + Transliterator.getDisplayName(id));

        // target characters not yet produced by any single source character
        UnicodeSet leftOverSet = new UnicodeSet(targetSet);
        UnicodeSet privateUse = new UnicodeSet("[:private use:]");

        // sort key (group char + folded text) -> HTML cells for one chart entry
        Map map = new TreeMap();

        // non-letters are accepted in the output regardless of script
        UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);
        targetSetPlusAnyways.addAll(okAnyway);

        UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);
        sourceSetPlusAnyways.addAll(okAnyway);

        // Round trip: source -> target -> return for every source code point
        UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);
        while (usi.next()) {
            String ss = UTF16.valueOf(usi.codepoint);
            String ts = t.transliterate(ss);
            char group = 0;
            if (!targetSetPlusAnyways.containsAll(ts)) {
                group |= GROUP_TARGET_NOT_IN_TARGET;
            }
            if (UTF16.countCodePoint(ts) == 1) {
                leftOverSet.remove(UTF16.charAt(ts,0));
            }
            String rt = inverse.transliterate(ts);
            if (!sourceSetPlusAnyways.containsAll(rt)) {
                group |= GROUP_RETURN_NOT_IN_SOURCE;
            } else if (!ss.equals(rt)) {
                group |= GROUP_ONE_WAY;
            }

            if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) {
                group |= GROUP_PRIVATE_USE;
            }

            // group is a char, so it is concatenated as a character, not as a number
            map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))
                    + "\u0000" + ss,
                cell("s", ss) + cell("t", ts) + cell("r", rt));
        }

        leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA

        // Completeness: what the inverse does with target characters never produced above
        usi.reset(leftOverSet);
        while (usi.next()) {
            String ts = UTF16.valueOf(usi.codepoint);
            String rt = inverse.transliterate(ts);
            char group = GROUP_COMPLETENESS;

            if (!sourceSetPlusAnyways.containsAll(rt)) {
                group |= GROUP_POSSIBLE_RETURN_NOT_IN_SOURCE;
            }
            if (!privateUse.containsNone(rt)) {
                group |= GROUP_PRIVATE_USE;
            }

            map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts,
                "<td class='s'>-</td>" + cell("t", ts) + cell("r", rt));
        }

        writeChart(id, map);
    }

    /**
     * Writes the collected chart entries as an HTML page.
     *
     * @param id  the transliterator ID, used for the file name and the page heading
     * @param map chart entries: String keys whose first char is the group flag set,
     *            mapped to the String of table cells for that entry
     * @throws IOException if the output directory or file cannot be created, or if
     *                     writing fails
     */
    private static void writeChart(String id, Map map) throws IOException {
        // make file name and open
        File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
        String filename = f.getCanonicalFile().toString();
        File directory = new File(filename).getParentFile();
        if (directory != null && !directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Cannot create directory " + directory);
        }
        PrintWriter out = new PrintWriter(
            new OutputStreamWriter(
                new FileOutputStream(filename), "UTF-8"));

        System.out.println("Writing " + filename);

        try {
            String displayName = escapeHtml(Transliterator.getDisplayName(id));

            out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
            out.println("<HTML><HEAD>");
            out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type>");
            // the stylesheet link has to be inside HEAD
            out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'></HEAD>");

            out.println("<BODY>");
            out.println("<h1>Transliteration Samples for '" + displayName + "'</h1>");
            out.println("<p>This file illustrates the transliterations of " + displayName + ".");
            out.println("The samples are mechanically generated, and only include single characters");
            out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
            out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
            out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>");

            // set up the headers: one header group per column group, with a spacer
            // cell between groups so the header lines up with the data rows
            String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
            StringBuffer headers = new StringBuffer(headerBase);
            for (int i = 1; i < COLUMN_COUNT; ++i) {
                headers.append("<th> </th>").append(headerBase);
            }

            String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
            String tableFooter = "</table></p>";
            out.println("<h2>Round Trip</h2>");
            out.println(tableHeader);

            char lastGroup = 0;
            int count = 0;
            int column = 0;
            for (Iterator it = map.entrySet().iterator(); it.hasNext();) {
                Map.Entry entry = (Map.Entry) it.next();
                String key = (String) entry.getKey();
                char group = key.charAt(0);
                // start a new table when the section changes or the current one is long
                if (group != lastGroup || count++ > MAX_ENTRIES_PER_TABLE) {
                    lastGroup = group;
                    count = 0;
                    if (column != 0) {
                        out.println("</tr>");
                        column = 0;
                    }
                    out.println(tableFooter);

                    if ((group & GROUP_DUALS) != 0) out.println("<hr><h2>Duals</h2>");
                    else if ((group & GROUP_COMPLETENESS) != 0) out.println("<hr><h2>Completeness</h2>");
                    else out.println("<hr><h2>Round Trip</h2>");
                    if ((group & GROUP_PRIVATE_USE) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
                    if ((group & GROUP_POSSIBLE_RETURN_NOT_IN_SOURCE) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
                    if ((group & GROUP_ONE_WAY) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
                    if ((group & GROUP_RETURN_NOT_IN_SOURCE) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
                    if ((group & GROUP_TARGET_NOT_IN_TARGET) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");

                    out.println(tableHeader);
                    column = 0;
                }
                String value = (String) entry.getValue();
                if (column++ == 0) out.print("<tr>");
                else out.print("<th> </th>");
                out.println(value);
                if (column == COLUMN_COUNT) {
                    out.println("</tr>");
                    column = 0;
                }
            }
            if (column != 0) {
                out.println("</tr>");
            }
            out.println(tableFooter + "</BODY></HTML>");

            // PrintWriter never throws on write failures; surface them here
            if (out.checkError()) {
                throw new IOException("Error writing " + filename);
            }
        } finally {
            out.close();
        }
    }

    /**
     * Builds one table cell showing a string and, below it, its code points in hex.
     *
     * @param cssClass the value of the cell's class attribute
     * @param text     the string to display
     * @return the <code>td</code> element
     */
    private static String cell(String cssClass, String text) {
        return "<td class='" + cssClass + "'>" + escapeHtml(text)
            + "<br><tt>" + hex(text) + "</tt></td>";
    }

    /**
     * Escapes the characters that are markup-significant in HTML text content.
     *
     * @param s the raw text
     * @return the text with '&amp;', '&lt;' and '&gt;' replaced by character references
     */
    private static String escapeHtml(String s) {
        StringBuffer results = new StringBuffer(s.length());
        for (int i = 0; i < s.length(); ++i) {
            char c = s.charAt(i);
            switch (c) {
            case '&': results.append("&amp;"); break;
            case '<': results.append("&lt;"); break;
            case '>': results.append("&gt;"); break;
            default: results.append(c); break;
            }
        }
        return results.toString();
    }

    /**
     * Formats the code points of a string as uppercase hex numbers separated by spaces.
     *
     * @param s the string to format
     * @return for example "41 1D400"; the empty string for empty input
     */
    public static String hex(String s) {
        int cp;
        StringBuffer results = new StringBuffer();
        // advance by the number of UTF-16 code units of the code point just read
        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(s, i);
            if (i != 0) results.append(' ');
            results.append(Integer.toHexString(cp));
        }
        return results.toString().toUpperCase();
    }

    /** Characters accepted in transliteration output whatever their script: all non-letters. */
    static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");

}