TransliterationChart.java revision bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7
1/**
2 *******************************************************************************
3 * Copyright (C) 2001-2010, International Business Machines Corporation and    *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7package com.ibm.icu.dev.demo.translit;
8import java.io.BufferedWriter;
9import java.io.File;
10import java.io.FileOutputStream;
11import java.io.IOException;
12import java.io.OutputStreamWriter;
13import java.io.PrintWriter;
14import java.util.Comparator;
15import java.util.HashMap;
16import java.util.Iterator;
17import java.util.Set;
18import java.util.TreeSet;
19
20import com.ibm.icu.impl.Utility;
21import com.ibm.icu.lang.UCharacter;
22import com.ibm.icu.lang.UScript;
23import com.ibm.icu.text.Normalizer;
24import com.ibm.icu.text.Transliterator;
25import com.ibm.icu.text.UTF16;
26import com.ibm.icu.text.UnicodeSet;
27import com.ibm.icu.text.UnicodeSetIterator;
28
29public class TransliterationChart {
30    public static void main(String[] args) throws IOException {
31        System.out.println("Start");
32        UnicodeSet lengthMarks = new UnicodeSet("[\u09D7\u0B56-\u0B57\u0BD7\u0C56\u0CD5-\u0CD6\u0D57\u0C55\u0CD5]");
33        int[] indicScripts = {
34            UScript.LATIN,
35            UScript.DEVANAGARI,
36            UScript.BENGALI,
37            UScript.GURMUKHI,
38            UScript.GUJARATI,
39            UScript.ORIYA,
40            UScript.TAMIL,
41            UScript.TELUGU,
42            UScript.KANNADA,
43            UScript.MALAYALAM,
44        };
45        String[] names = new String[indicScripts.length];
46        UnicodeSet[] sets = new UnicodeSet[indicScripts.length];
47        Transliterator[] fallbacks = new Transliterator[indicScripts.length];
48        for (int i = 0; i < indicScripts.length; ++i) {
49            names[i] = UScript.getName(indicScripts[i]);
50            sets[i] = new UnicodeSet("[[:" + names[i] + ":]&[[:L:][:M:]]&[:age=3.1:]]");
51            fallbacks[i] = Transliterator.getInstance("any-" + names[i]);
52        }
53        EquivClass eq = new EquivClass(new ReverseComparator());
54        PrintWriter pw = openPrintWriter("transChart.html");
55        pw.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
56        pw.println("<title>Indic Transliteration Chart</title><style>");
57        pw.println("td { text-align: Center; font-size: 200% }");
58        pw.println("tt { font-size: 50% }");
59        pw.println("td.miss { background-color: #CCCCFF }");
60        pw.println("</style></head><body bgcolor='#FFFFFF'>");
61
62        Transliterator anyToLatin = Transliterator.getInstance("any-latin");
63
64        String testString = "\u0946\u093E";
65
66        UnicodeSet failNorm = new UnicodeSet();
67        Set latinFail = new TreeSet();
68
69        for (int i = 0; i < indicScripts.length; ++i) {
70            if (indicScripts[i] == UScript.LATIN) continue;
71            String source = names[i];
72            System.out.println(source);
73            UnicodeSet sourceChars = sets[i];
74
75            for (int j = 0; j < indicScripts.length; ++j) {
76                if (i == j) continue;
77                String target = names[j];
78                Transliterator forward = Transliterator.getInstance(source + '-' + target);
79                Transliterator backward = forward.getInverse();
80                UnicodeSetIterator it = new UnicodeSetIterator(sourceChars);
81                while (it.next()) {
82                    if (lengthMarks.contains(it.codepoint)) continue;
83                    String s = Normalizer.normalize(it.codepoint,Normalizer.NFC,0);
84                    //if (!Normalizer.isNormalized(s,Normalizer.NFC,0)) continue;
85                    if (!s.equals(Normalizer.normalize(s,Normalizer.NFD,0))) {
86                        failNorm.add(it.codepoint);
87                    }
88                    String t = fix(forward.transliterate(s));
89                    if (t.equals(testString)) {
90                        System.out.println("debug");
91                    }
92
93                    String r = fix(backward.transliterate(t));
94                    if (Normalizer.compare(s,r,0) == 0) {
95                        if (indicScripts[j] != UScript.LATIN) eq.add(s,t);
96                    } else {
97                        if (indicScripts[j] == UScript.LATIN) {
98                            latinFail.add(s + " - " + t + " - " + r);
99                        }
100                    }
101                }
102            }
103        }
104        // collect equivalents
105        pw.println("<table border='1' cellspacing='0'><tr>");
106        for (int i = 0; i < indicScripts.length; ++i) {
107            pw.print("<th width='10%'>" + names[i].substring(0,3) + "</th>");
108        }
109        pw.println("</tr>");
110
111        Iterator rit = eq.getSetIterator(new MyComparator());
112        while(rit.hasNext()) {
113            Set equivs = (Set)rit.next();
114            pw.print("<tr>");
115            Iterator sit = equivs.iterator();
116            String source = (String)sit.next();
117            String item = anyToLatin.transliterate(source);
118            if (item.equals("") || source.equals(item)) item = "&nbsp;";
119            pw.print("<td>" + item + "</td>");
120            for (int i = 1; i < indicScripts.length; ++i) {
121                sit = equivs.iterator();
122                item = "";
123                while (sit.hasNext()) {
124                    String trial = (String)sit.next();
125                    if (!sets[i].containsAll(trial)) continue;
126                    item = trial;
127                    break;
128                }
129                String classString = "";
130                if (item.equals("")) {
131                    classString = " class='miss'";
132                    String temp = fallbacks[i].transliterate(source);
133                    if (!temp.equals("") && !temp.equals(source)) item = temp;
134                }
135                String backup = item.equals("") ? "&nbsp;" : item;
136                pw.print("<td" + classString + " title='" + getName(item, "; ") + "'>"
137                    + backup + "<br><tt>" + Utility.hex(item) + "</tt></td>");
138            }
139            /*
140            Iterator sit = equivs.iterator();
141            while (sit.hasNext()) {
142                String item = (String)sit.next();
143                pw.print("<td>" + item + "</td>");
144            }
145            */
146            pw.println("</tr>");
147        }
148        pw.println("</table>");
149        if (true) {
150            pw.println("<h2>Failed Normalization</h2>");
151
152            UnicodeSetIterator it = new UnicodeSetIterator(failNorm);
153            UnicodeSet pieces = new UnicodeSet();
154            while (it.next()) {
155                String s = UTF16.valueOf(it.codepoint);
156                String d = Normalizer.normalize(s,Normalizer.NFD,0);
157                pw.println("Norm:" + s + ", " + Utility.hex(s) + " " + UCharacter.getName(it.codepoint)
158                     + "; " + d + ", " + Utility.hex(d) + ", ");
159                pw.println(UCharacter.getName(d.charAt(1)) + "<br>");
160                if (UCharacter.getName(d.charAt(1)).indexOf("LENGTH") >= 0) pieces.add(d.charAt(1));
161            }
162            pw.println(pieces);
163
164            pw.println("<h2>Failed Round-Trip</h2>");
165            Iterator cit = latinFail.iterator();
166            while (cit.hasNext()) {
167                pw.println(cit.next() + "<br>");
168            }
169        }
170
171        pw.println("</table></body></html>");
172        pw.close();
173        System.out.println("Done");
174    }
175
176    public static String fix(String s) {
177        if (s.equals("\u0946\u093E")) return "\u094A";
178        if (s.equals("\u0C46\u0C3E")) return "\u0C4A";
179        if (s.equals("\u0CC6\u0CBE")) return "\u0CCA";
180
181        if (s.equals("\u0947\u093E")) return "\u094B";
182        if (s.equals("\u0A47\u0A3E")) return "\u0A4B";
183        if (s.equals("\u0AC7\u0ABE")) return "\u0ACB";
184        if (s.equals("\u0C47\u0C3E")) return "\u0C4B";
185        if (s.equals("\u0CC7\u0CBE")) return "\u0CCB";
186
187        //return Normalizer.normalize(s,Normalizer.NFD,0);
188        return s;
189    }
190
191    public static PrintWriter openPrintWriter(String fileName) throws IOException {
192        File lf = new File(fileName);
193        System.out.println("Creating file: " + lf.getAbsoluteFile());
194
195        return new PrintWriter(
196                new BufferedWriter(
197                    new OutputStreamWriter(
198                        new FileOutputStream(fileName), "UTF8"), 4*1024));
199    }
200
201
202    public static String getName(String s, String separator) {
203        int cp;
204        StringBuffer sb = new StringBuffer();
205        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
206            cp = UTF16.charAt(s,i);
207            if (i != 0) sb.append(separator);
208            sb.append(UCharacter.getName(cp));
209        }
210        return sb.toString();
211    }
212
213    static class MyComparator implements Comparator {
214       public int compare(Object o1, Object o2) {
215            Iterator i1 = ((TreeSet) o1).iterator();
216            Iterator i2 = ((TreeSet) o2).iterator();
217            while (i1.hasNext() && i2.hasNext()) {
218                String a = (String)i1.next();
219                String b = (String)i2.next();
220                int result = a.compareTo(b);
221                if (result != 0) return result;
222            }
223            if (i1.hasNext()) return 1;
224            if (i2.hasNext()) return -1;
225            return 0;
226        }
227
228    }
229    static class ReverseComparator implements Comparator {
230        public int compare(Object o1, Object o2) {
231            String a = o1.toString();
232            char a1 = a.charAt(0);
233            String b = o2.toString();
234            char b1 = b.charAt(0);
235            if (a1 < 0x900 && b1 > 0x900) return -1;
236            if (a1 > 0x900 && b1 < 0x900) return +1;
237            return a.compareTo(b);
238        }
239    }
240
241    static class EquivClass {
242        EquivClass(Comparator c) {
243            comparator = c;
244        }
245        private HashMap itemToSet = new HashMap();
246        private Comparator comparator;
247
248        void add(Object a, Object b) {
249            Set sa = (Set)itemToSet.get(a);
250            Set sb = (Set)itemToSet.get(b);
251            if (sa == null && sb == null) { // new set!
252                Set s = new TreeSet(comparator);
253                s.add(a);
254                s.add(b);
255                itemToSet.put(a, s);
256                itemToSet.put(b, s);
257            } else if (sa == null) {
258                sb.add(a);
259            } else if (sb == null) {
260                sa.add(b);
261            } else { // merge sets, dumping sb
262                sa.addAll(sb);
263                Iterator it = sb.iterator();
264                while (it.hasNext()) {
265                    itemToSet.put(it.next(), sa);
266                }
267            }
268        }
269
270        private class MyIterator implements Iterator {
271            private Iterator it;
272            MyIterator (Comparator comp) {
273                TreeSet values = new TreeSet(comp);
274                values.addAll(itemToSet.values());
275                it = values.iterator();
276            }
277
278            public boolean hasNext() {
279                return it.hasNext();
280            }
281            public Object next() {
282                return it.next();
283            }
284            public void remove() {
285                throw new IllegalArgumentException("can't remove");
286            }
287        }
288
289        public Iterator getSetIterator (Comparator comp) {
290            return new MyIterator(comp);
291        }
292
293    }
294}