/**
 *******************************************************************************
 * Copyright (C) 2001-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.dev.demo.translit;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;

/**
 * Command-line tool that writes <code>transChart.html</code>, an HTML chart of
 * characters that round-trip through the inter-Indic transliterators.
 *
 * <p>For every pair of scripts in a fixed list, each letter/mark of the source
 * script is transliterated forward and back. Characters that survive the round
 * trip are grouped into equivalence classes, and each class becomes one row of
 * the chart. Characters that fail normalization or the Latin round trip are
 * listed after the table.
 */
public class TransliterationChart {

    /**
     * Pairs of {two-character sequence, single replacement character} applied
     * by {@link #fix(String)}. The first three rows map to the short O vowel
     * signs, the remaining rows to the long O vowel signs.
     */
    private static final String[][] FIXUPS = {
        {"\u0946\u093E", "\u094A"},
        {"\u0C46\u0C3E", "\u0C4A"},
        {"\u0CC6\u0CBE", "\u0CCA"},

        {"\u0947\u093E", "\u094B"},
        {"\u0A47\u0A3E", "\u0A4B"},
        {"\u0AC7\u0ABE", "\u0ACB"},
        {"\u0C47\u0C3E", "\u0C4B"},
        {"\u0CC7\u0CBE", "\u0CCB"},
    };

    /**
     * Builds the chart and writes it to <code>transChart.html</code> in the
     * current working directory. Progress is echoed to standard output.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        System.out.println("Start");
        // Characters skipped entirely when walking a script's repertoire.
        UnicodeSet lengthMarks = new UnicodeSet("[\u09D7\u0B56-\u0B57\u0BD7\u0C56\u0CD5-\u0CD6\u0D57\u0C55\u0CD5]");
        int[] indicScripts = {
            UScript.LATIN,
            UScript.DEVANAGARI,
            UScript.BENGALI,
            UScript.GURMUKHI,
            UScript.GUJARATI,
            UScript.ORIYA,
            UScript.TAMIL,
            UScript.TELUGU,
            UScript.KANNADA,
            UScript.MALAYALAM,
        };
        String[] names = new String[indicScripts.length];
        UnicodeSet[] sets = new UnicodeSet[indicScripts.length];
        Transliterator[] fallbacks = new Transliterator[indicScripts.length];
        for (int i = 0; i < indicScripts.length; ++i) {
            names[i] = UScript.getName(indicScripts[i]);
            sets[i] = new UnicodeSet("[[:" + names[i] + ":]&[[:L:][:M:]]&[:age=3.1:]]");
            fallbacks[i] = Transliterator.getInstance("any-" + names[i]);
        }
        EquivClass eq = new EquivClass(new ReverseComparator());
        PrintWriter pw = openPrintWriter("transChart.html");
        try {
            writeHtmlHeader(pw);

            Transliterator anyToLatin = Transliterator.getInstance("any-latin");

            UnicodeSet failNorm = new UnicodeSet();
            Set latinFail = new TreeSet();

            for (int i = 0; i < indicScripts.length; ++i) {
                if (indicScripts[i] == UScript.LATIN) {
                    continue; // Latin is only ever a target, never a source
                }
                String source = names[i];
                System.out.println(source);
                UnicodeSet sourceChars = sets[i];

                for (int j = 0; j < indicScripts.length; ++j) {
                    if (i == j) {
                        continue;
                    }
                    String target = names[j];
                    Transliterator forward = Transliterator.getInstance(source + '-' + target);
                    Transliterator backward = forward.getInverse();
                    boolean targetIsLatin = indicScripts[j] == UScript.LATIN;

                    UnicodeSetIterator it = new UnicodeSetIterator(sourceChars);
                    while (it.next()) {
                        if (lengthMarks.contains(it.codepoint)) {
                            continue;
                        }
                        String s = Normalizer.normalize(it.codepoint, Normalizer.NFC, 0);
                        // Remember characters whose NFC and NFD forms differ.
                        if (!s.equals(Normalizer.normalize(s, Normalizer.NFD, 0))) {
                            failNorm.add(it.codepoint);
                        }
                        String t = fix(forward.transliterate(s));
                        String r = fix(backward.transliterate(t));
                        if (Normalizer.compare(s, r, 0) == 0) {
                            // Round trip succeeded: s and t are equivalent.
                            if (!targetIsLatin) {
                                eq.add(s, t);
                            }
                        } else if (targetIsLatin) {
                            latinFail.add(s + " - " + t + " - " + r);
                        }
                    }
                }
            }

            writeEquivalenceTable(pw, eq, names, sets, fallbacks, anyToLatin);
            writeFailureReport(pw, failNorm, latinFail);

            // The table was already closed by writeEquivalenceTable.
            pw.println("</body></html>");

            // PrintWriter never throws on write failure; surface it explicitly.
            if (pw.checkError()) {
                throw new IOException("Error while writing transChart.html");
            }
        } finally {
            pw.close();
        }
        System.out.println("Done");
    }

    /** Writes the HTML prologue (head, title, styles, opening body tag). */
    private static void writeHtmlHeader(PrintWriter pw) {
        pw.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        pw.println("<title>Indic Transliteration Chart</title><style>");
        pw.println("td { text-align: Center; font-size: 200% }");
        pw.println("tt { font-size: 50% }");
        pw.println("td.miss { background-color: #CCCCFF }");
        pw.println("</style></head><body bgcolor='#FFFFFF'>");
    }

    /**
     * Writes the chart table: one header row of abbreviated script names, then
     * one row per equivalence class with one cell per script.
     */
    private static void writeEquivalenceTable(PrintWriter pw, EquivClass eq, String[] names,
            UnicodeSet[] sets, Transliterator[] fallbacks, Transliterator anyToLatin) {
        pw.println("<table border='1' cellspacing='0'><tr>");
        for (int i = 0; i < names.length; ++i) {
            pw.print("<th width='10%'>" + names[i].substring(0, 3) + "</th>");
        }
        pw.println("</tr>");

        Iterator rit = eq.getSetIterator(new MyComparator());
        while (rit.hasNext()) {
            Set equivs = (Set) rit.next();
            pw.print("<tr>");

            // First column: Latin rendering of the first member of the class.
            String source = (String) equivs.iterator().next();
            String latin = anyToLatin.transliterate(source);
            if (latin.equals("") || source.equals(latin)) {
                latin = "&nbsp;"; // keeps the otherwise empty cell rendered
            }
            pw.print("<td>" + latin + "</td>");

            // Remaining columns: index 0 (Latin) was handled above.
            for (int i = 1; i < names.length; ++i) {
                String item = "";
                Iterator sit = equivs.iterator();
                while (sit.hasNext()) {
                    String trial = (String) sit.next();
                    if (sets[i].containsAll(trial)) {
                        item = trial;
                        break;
                    }
                }
                String classString = "";
                if (item.equals("")) {
                    // No member in this script: mark the cell and try the
                    // generic any-to-script transliterator as a fallback.
                    classString = " class='miss'";
                    String temp = fallbacks[i].transliterate(source);
                    if (!temp.equals("") && !temp.equals(source)) {
                        item = temp;
                    }
                }
                String shown = item.equals("") ? "&nbsp;" : item;
                pw.print("<td" + classString + " title='" + getName(item, "; ") + "'>"
                        + shown + "<br><tt>" + Utility.hex(item) + "</tt></td>");
            }
            pw.println("</tr>");
        }
        pw.println("</table>");
    }

    /**
     * Writes the two diagnostic sections that follow the table: characters
     * whose NFC and NFD forms differ, and Latin round-trip failures.
     */
    private static void writeFailureReport(PrintWriter pw, UnicodeSet failNorm, Set latinFail) {
        pw.println("<h2>Failed Normalization</h2>");

        UnicodeSetIterator it = new UnicodeSetIterator(failNorm);
        UnicodeSet pieces = new UnicodeSet();
        while (it.next()) {
            String s = UTF16.valueOf(it.codepoint);
            String d = Normalizer.normalize(s, Normalizer.NFD, 0);
            pw.println("Norm:" + s + ", " + Utility.hex(s) + " " + UCharacter.getName(it.codepoint)
                    + "; " + d + ", " + Utility.hex(d) + ", ");
            if (d.length() < 2) {
                // Decomposition has no second code unit to report on.
                pw.println("<br>");
                continue;
            }
            int second = d.charAt(1);
            String secondName = UCharacter.getName(second);
            pw.println(secondName + "<br>");
            if (secondName != null && secondName.indexOf("LENGTH") >= 0) {
                pieces.add(second);
            }
        }
        pw.println(pieces);

        pw.println("<h2>Failed Round-Trip</h2>");
        Iterator cit = latinFail.iterator();
        while (cit.hasNext()) {
            pw.println(cit.next() + "<br>");
        }
    }

    /**
     * Replaces a few specific two-character vowel-sign sequences with their
     * single-character equivalents; every other string is returned unchanged.
     * Only whole-string matches are replaced.
     *
     * @param s transliterator output to post-process; must not be null
     * @return the replacement character as a string, or <code>s</code> itself
     */
    public static String fix(String s) {
        for (int i = 0; i < FIXUPS.length; ++i) {
            if (s.equals(FIXUPS[i][0])) {
                return FIXUPS[i][1];
            }
        }
        return s;
    }

    /**
     * Opens a buffered UTF-8 <code>PrintWriter</code> on the named file and
     * prints the file's absolute path to standard output.
     *
     * @param fileName path of the file to create or overwrite
     * @return a writer the caller is responsible for closing
     * @throws IOException if the file cannot be opened
     */
    public static PrintWriter openPrintWriter(String fileName) throws IOException {
        File lf = new File(fileName);
        System.out.println("Creating file: " + lf.getAbsoluteFile());

        FileOutputStream out = new FileOutputStream(fileName);
        try {
            return new PrintWriter(
                    new BufferedWriter(
                            new OutputStreamWriter(out, "UTF8"), 4 * 1024));
        } catch (IOException e) {
            // Do not leak the file handle if the writer chain cannot be built.
            out.close();
            throw e;
        }
    }

    /**
     * Returns the Unicode character names of every code point in
     * <code>s</code>, joined by <code>separator</code>.
     *
     * @param s string to describe; an empty string yields an empty result
     * @param separator text placed between consecutive names
     * @return the joined names
     */
    public static String getName(String s, String separator) {
        int cp;
        StringBuffer sb = new StringBuffer();
        // Step by code point so surrogate pairs are named once.
        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(s, i);
            if (i != 0) {
                sb.append(separator);
            }
            sb.append(UCharacter.getName(cp));
        }
        return sb.toString();
    }

    /**
     * Orders two <code>TreeSet</code>s of strings element by element in
     * iteration order; when one is a prefix of the other, the shorter sorts
     * first.
     */
    static class MyComparator implements Comparator {
        public int compare(Object o1, Object o2) {
            Iterator i1 = ((TreeSet) o1).iterator();
            Iterator i2 = ((TreeSet) o2).iterator();
            while (i1.hasNext() && i2.hasNext()) {
                String a = (String) i1.next();
                String b = (String) i2.next();
                int result = a.compareTo(b);
                if (result != 0) {
                    return result;
                }
            }
            if (i1.hasNext()) {
                return 1;
            }
            if (i2.hasNext()) {
                return -1;
            }
            return 0;
        }
    }

    /**
     * Orders objects by their string form, placing strings whose first char is
     * below U+0900 before strings whose first char is above it.
     *
     * <p>NOTE(review): for non-empty strings this yields the same sign as
     * <code>String.compareTo</code>, so despite the name nothing is reversed;
     * confirm the intended ordering.
     */
    static class ReverseComparator implements Comparator {
        public int compare(Object o1, Object o2) {
            String a = o1.toString();
            String b = o2.toString();
            // An empty string has no first char to classify.
            if (a.length() == 0 || b.length() == 0) {
                return a.compareTo(b);
            }
            char a1 = a.charAt(0);
            char b1 = b.charAt(0);
            if (a1 < 0x900 && b1 > 0x900) {
                return -1;
            }
            if (a1 > 0x900 && b1 < 0x900) {
                return +1;
            }
            return a.compareTo(b);
        }
    }

    /**
     * Groups items into equivalence classes. Each call to
     * {@link #add(Object, Object)} declares two items equivalent; classes that
     * become connected are merged.
     */
    static class EquivClass {
        /** Maps every item seen so far to the (shared) set holding its class. */
        private final HashMap itemToSet = new HashMap();
        /** Ordering used inside each class set. */
        private final Comparator comparator;

        /**
         * @param c comparator used to order the members of each class
         */
        EquivClass(Comparator c) {
            comparator = c;
        }

        /**
         * Records that <code>a</code> and <code>b</code> are equivalent.
         *
         * @param a first item
         * @param b second item
         */
        void add(Object a, Object b) {
            Set sa = (Set) itemToSet.get(a);
            Set sb = (Set) itemToSet.get(b);
            if (sa == null && sb == null) { // new set!
                Set s = new TreeSet(comparator);
                s.add(a);
                s.add(b);
                itemToSet.put(a, s);
                itemToSet.put(b, s);
            } else if (sa == null) {
                sb.add(a);
                // Register the newcomer so later lookups find its class.
                itemToSet.put(a, sb);
            } else if (sb == null) {
                sa.add(b);
                itemToSet.put(b, sa);
            } else if (sa != sb) { // merge sets, dumping sb
                sa.addAll(sb);
                Iterator it = sb.iterator();
                while (it.hasNext()) {
                    itemToSet.put(it.next(), sa);
                }
            }
        }

        /**
         * Read-only iterator over the distinct class sets, sorted by the
         * comparator supplied at construction. The sets are snapshotted into a
         * sorted collection when the iterator is created.
         */
        private class MyIterator implements Iterator {
            private final Iterator it;

            MyIterator(Comparator comp) {
                // The same set is stored once per member; the TreeSet collapses
                // entries that compare equal under comp.
                TreeSet values = new TreeSet(comp);
                values.addAll(itemToSet.values());
                it = values.iterator();
            }

            public boolean hasNext() {
                return it.hasNext();
            }

            public Object next() {
                return it.next();
            }

            public void remove() {
                throw new UnsupportedOperationException("can't remove");
            }
        }

        /**
         * Returns an iterator over the equivalence classes, each a
         * <code>Set</code>, ordered by <code>comp</code>.
         *
         * @param comp comparator applied to the class sets
         * @return a read-only iterator of sets
         */
        public Iterator getSetIterator(Comparator comp) {
            return new MyIterator(comp);
        }
    }
}