1/*
2 *******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7
8package com.ibm.icu.dev.test.translit;
9import java.io.File;
10import java.io.FileOutputStream;
11import java.io.IOException;
12import java.io.OutputStreamWriter;
13import java.io.PrintWriter;
14import java.util.Enumeration;
15import java.util.Iterator;
16import java.util.Map;
17import java.util.Set;
18import java.util.TreeMap;
19import java.util.TreeSet;
20
21import com.ibm.icu.lang.UCharacter;
22import com.ibm.icu.lang.UScript;
23import com.ibm.icu.text.Normalizer;
24import com.ibm.icu.text.Transliterator;
25import com.ibm.icu.text.UTF16;
26import com.ibm.icu.text.UnicodeSet;
27import com.ibm.icu.text.UnicodeSetIterator;
28
29public class WriteCharts {
30    public static void main(String[] args) throws IOException {
31        if (false) {
32            printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
33        }
34        String testSet = "";
35        if (args.length == 0) args = getAllScripts();
36        for (int i = 0; i < args.length; ++i) {
37    // Enumeration enum = Transliterator.getAvailableIDs();
38            if (args[i].startsWith("[")) {
39                testSet = args[i];
40            } else {
41                print(testSet, args[i]);
42                testSet = "";
43            }
44        }
45    }
46
47    public static void printSet(String source) {
48        UnicodeSet s = new UnicodeSet(source);
49        System.out.println("Printout for '" + source + "'");
50        int count = s.getRangeCount();
51        for (int i = 0; i < count; ++i) {
52            int start = s.getRangeStart(i);
53            int end = s.getRangeEnd(i);
54            System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16));
55        }
56    }
57
58    public static String[] getAllScripts() {
59        Set set = new TreeSet();
60        int scripts[];
61        Enumeration sources = Transliterator.getAvailableSources();
62        while(sources.hasMoreElements()) {
63            String source = (String) sources.nextElement();
64            scripts = UScript.getCode(source);
65            if (scripts == null) {
66                System.out.println("[Skipping " + source + "]");
67                continue;
68            }
69            int sourceScript = scripts[0];
70            System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
71            Enumeration targets = Transliterator.getAvailableTargets(source);
72            while(targets.hasMoreElements()) {
73                String target = (String) targets.nextElement();
74                scripts = UScript.getCode(target);
75                if (scripts == null
76                        || priority(scripts[0]) < priority(sourceScript)) {
77                    // skip doing both directions
78                    System.out.println("[Skipping '" + source + "-" + target + "']");
79                    continue;
80                }
81                System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
82                Enumeration variants = Transliterator.getAvailableVariants(source, target);
83                while(variants.hasMoreElements()) {
84                    String variant = (String) variants.nextElement();
85                    String id = source + "-" + target;
86                    if (variant.length() != 0) {
87                        id += "/" + variant;
88                        if (false) {
89                            System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
90                            continue;
91                        }
92                    }
93                    System.out.println("\t\t\t\tAdding: '" + id + "'");
94                    set.add(id);
95                }
96            }
97        }
98        String[] results = new String[set.size()];
99        set.toArray(results);
100        return results;
101    }
102
103    static public int priority(int script) {
104        if (script == UScript.LATIN) return -2;
105        return script;
106    }
107
108    public static String showScripts(int[] scripts) {
109        StringBuffer results = new StringBuffer();
110        for (int i = 0; i < scripts.length; ++i) {
111            if (i != 0) results.append(", ");
112            results.append(UScript.getName(scripts[i]));
113        }
114        return results.toString();
115    }
116
117    public static void print(String testSet, String rawId) throws IOException {
118        System.out.println("Processing " + rawId);
119        Transliterator t = Transliterator.getInstance(rawId);
120        String id = t.getID();
121
122        // clean up IDs. Ought to be API for getting source, target, variant
123        int minusPos = id.indexOf('-');
124        String source = id.substring(0,minusPos);
125        String target = id.substring(minusPos+1);
126        int slashPos = target.indexOf('/');
127        if (slashPos >= 0) target = target.substring(0,slashPos);
128
129        // check that the source is a script
130        if (testSet.equals("")) {
131            int[] scripts = UScript.getCode(source);
132            if (scripts == null) {
133                System.out.println("FAILED: "
134                    + Transliterator.getDisplayName(id)
135                    + " does not have a script as the source");
136                return;
137            } else {
138                testSet = "[:" + source + ":]";
139                if (source.equalsIgnoreCase("katakana")) {
140                    testSet = "[" + testSet + "\u30FC]";
141                    printSet(testSet);
142                }
143            }
144        }
145        UnicodeSet sourceSet = new UnicodeSet(testSet);
146
147        // check that the target is a script
148        int[] scripts = UScript.getCode(target);
149        if (scripts == null) {
150            target = "[:Latin:]";
151        } else {
152            target = "[:" + target + ":]";
153        }
154        UnicodeSet targetSet = new UnicodeSet(target);
155
156        Transliterator inverse = t.getInverse();
157
158        //Transliterator hex = Transliterator.getInstance("Any-Hex");
159
160
161        // iterate through script
162        System.out.println("Transliterating " + sourceSet.toPattern(true)
163            + " with " + Transliterator.getDisplayName(id));
164
165        UnicodeSet leftOverSet = new UnicodeSet(targetSet);
166        UnicodeSet privateUse = new UnicodeSet("[:private use:]");
167
168        Map map = new TreeMap();
169
170        UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);
171        targetSetPlusAnyways.addAll(okAnyway);
172
173        UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);
174        sourceSetPlusAnyways.addAll(okAnyway);
175
176        UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);
177
178        while (usi.next()) {
179            int j = usi.codepoint;
180            /*
181        int count = sourceSet.getRangeCount();
182        for (int i = 0; i < count; ++i) {
183            int end = sourceSet.getRangeEnd(i);
184            for (int j = sourceSet.getRangeStart(i); j <= end; ++j) {
185            */
186               // String flag = "";
187                String ss = UTF16.valueOf(j);
188                String ts = t.transliterate(ss);
189                char group = 0;
190                if (!targetSetPlusAnyways.containsAll(ts)) {
191                    group |= 1;
192                }
193                if (UTF16.countCodePoint(ts) == 1) {
194                    leftOverSet.remove(UTF16.charAt(ts,0));
195                }
196                String rt = inverse.transliterate(ts);
197                if (!sourceSetPlusAnyways.containsAll(rt)) {
198                    group |= 2;
199                } else if (!ss.equals(rt)) {
200                    group |= 4;
201                }
202
203                if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) {
204                    group |= 16;
205                }
206
207                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))
208                        + "\u0000" + ss,
209                    "<td class='s'>" + ss + "<br><tt>" + hex(ss)
210                        + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
211                        + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
212
213                // Check Duals
214                /*
215                int maxDual = 200;
216              dual:
217                for (int i2 = 0; i2 < count; ++i2) {
218                    int end2 = sourceSet.getRangeEnd(i2);
219                    for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
220                        String ss2 = UTF16.valueOf(j2);
221                        String ts2 = t.transliterate(ss2);
222                        String rt2 = inverse.transliterate(ts2);
223
224                        String ss12 = ss + ss2;
225                        String ts12 = t.transliterate(ss + ss12);
226                        String rt12 = inverse.transliterate(ts12);
227                        if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;
228                        if (--maxDual < 0) break dual;
229
230                        // transliteration of whole differs from that of parts
231                        group = 0x100;
232                        map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
233                                + "\u0000" + ss12,
234                            "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
235                                + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
236                                + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
237                    }
238                }
239                */
240            //}
241        }
242
243
244        leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
245
246        /*int count = leftOverSet.getRangeCount();
247        for (int i = 0; i < count; ++i) {
248            int end = leftOverSet.getRangeEnd(i);
249            for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) {
250            */
251
252        usi.reset(leftOverSet);
253        while (usi.next()) {
254            int j = usi.codepoint;
255
256                String ts = UTF16.valueOf(j);
257                // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0);
258                // if (!decomp.equals(ts)) continue;
259
260                String rt = inverse.transliterate(ts);
261                // String flag = "";
262                char group = 0x80;
263
264                if (!sourceSetPlusAnyways.containsAll(rt)) {
265                    group |= 8;
266                }
267                if (!privateUse.containsNone(rt)) {
268                    group |= 16;
269                }
270
271                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts,
272                    "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
273                    + "</tt></td><td class='r'>"
274                    + rt + "<br><tt>" + hex(rt) + "</tt></td>");
275            //}
276        }
277
278        // make file name and open
279        File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
280        String filename = f.getCanonicalFile().toString();
281        PrintWriter out = new PrintWriter(
282            new OutputStreamWriter(
283                new FileOutputStream(filename), "UTF-8"));
284        //out.print('\uFEFF'); // BOM
285
286        System.out.println("Writing " + filename);
287
288        try {
289            out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
290            out.println("<HTML><HEAD>");
291            out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
292            out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
293
294            out.println("<BODY>");
295            out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
296            out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
297            out.println("The samples are mechanically generated, and only include single characters");
298            out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
299            out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
300            out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>");
301
302            // set up the headers
303            int columnCount = 3;
304            String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
305            String headers = headerBase;
306            for (int i = columnCount - 1; i > 0; --i) {
307                if (i != columnCount - 1) headers += "<th>&nbsp;</th>";
308                headers += headerBase;
309            }
310
311            String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
312            String tableFooter = "</table></p>";
313            out.println("<h2>Round Trip</h2>");
314            out.println(tableHeader);
315
316            Iterator it = map.keySet().iterator();
317            char lastGroup = 0;
318            int count = 0;
319            int column = 0;
320            while (it.hasNext()) {
321                String key = (String) it.next();
322                char group = key.charAt(0);
323                if (group != lastGroup || count++ > 50) {
324                    lastGroup = group;
325                    count = 0;
326                    if (column != 0) {
327                        out.println("</tr>");
328                        column = 0;
329                    }
330                    out.println(tableFooter);
331
332                    // String title = "";
333                    if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
334                    else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
335                    else out.println("<hr><h2>Round Trip</h2>");
336                    if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
337                    if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
338                    if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
339                    if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
340                    if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
341
342                    out.println(tableHeader);
343                    column = 0;
344                }
345                String value = (String) map.get(key);
346                if (column++ == 0) out.print("<tr>");
347                else out.print("<th>&nbsp;</th>");
348                out.println(value);
349                if (column == 3) {
350                    out.println("</tr>");
351                    column = 0;
352                }
353            }
354            if (column != 0) {
355                out.println("</tr>");
356                column = 0;
357            }
358            out.println(tableFooter + "</BODY></HTML>");
359
360        } finally {
361            out.close();
362        }
363    }
364
365    public static String hex(String s) {
366        int cp;
367        StringBuffer results = new StringBuffer();
368        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
369            cp = UTF16.charAt(s, i);
370            if (i != 0) results.append(' ');
371            results.append(Integer.toHexString(cp));
372        }
373        return results.toString().toUpperCase();
374    }
375
    /**
     * Every code point that is not a letter. {@code print} adds this set to
     * copies of both the source set and the target set before its containment
     * checks, so non-letter output is not flagged as lying outside those sets.
     */
    static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
377
378    /*
379    // tests whether a string is in a set. Also checks for Common and Inherited
380    public static boolean isIn(String s, UnicodeSet set) {
381        int cp;
382        for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
383            cp = UTF16.charAt(s, i);
384            if (set.contains(cp)) continue;
385            if (okAnyway.contains(cp)) continue;
386            return false;
387        }
388        return true;
389    }
390    */
391
392}
393