1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9
10package com.ibm.icu.dev.test.translit;
11import java.io.File;
12import java.io.FileOutputStream;
13import java.io.IOException;
14import java.io.OutputStreamWriter;
15import java.io.PrintWriter;
16import java.util.Enumeration;
17import java.util.Iterator;
18import java.util.Map;
19import java.util.Set;
20import java.util.TreeMap;
21import java.util.TreeSet;
22
23import com.ibm.icu.lang.UCharacter;
24import com.ibm.icu.lang.UScript;
25import com.ibm.icu.text.Normalizer;
26import com.ibm.icu.text.Transliterator;
27import com.ibm.icu.text.UTF16;
28import com.ibm.icu.text.UnicodeSet;
29import com.ibm.icu.text.UnicodeSetIterator;
30
31public class WriteCharts {
32    public static void main(String[] args) throws IOException {
33        if (false) {
34            printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
35        }
36        String testSet = "";
37        if (args.length == 0) args = getAllScripts();
38        for (int i = 0; i < args.length; ++i) {
39    // Enumeration enum = Transliterator.getAvailableIDs();
40            if (args[i].startsWith("[")) {
41                testSet = args[i];
42            } else {
43                print(testSet, args[i]);
44                testSet = "";
45            }
46        }
47    }
48
49    public static void printSet(String source) {
50        UnicodeSet s = new UnicodeSet(source);
51        System.out.println("Printout for '" + source + "'");
52        int count = s.getRangeCount();
53        for (int i = 0; i < count; ++i) {
54            int start = s.getRangeStart(i);
55            int end = s.getRangeEnd(i);
56            System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16));
57        }
58    }
59
60    public static String[] getAllScripts() {
61        Set set = new TreeSet();
62        int scripts[];
63        Enumeration sources = Transliterator.getAvailableSources();
64        while(sources.hasMoreElements()) {
65            String source = (String) sources.nextElement();
66            scripts = UScript.getCode(source);
67            if (scripts == null) {
68                System.out.println("[Skipping " + source + "]");
69                continue;
70            }
71            int sourceScript = scripts[0];
72            System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
73            Enumeration targets = Transliterator.getAvailableTargets(source);
74            while(targets.hasMoreElements()) {
75                String target = (String) targets.nextElement();
76                scripts = UScript.getCode(target);
77                if (scripts == null
78                        || priority(scripts[0]) < priority(sourceScript)) {
79                    // skip doing both directions
80                    System.out.println("[Skipping '" + source + "-" + target + "']");
81                    continue;
82                }
83                System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
84                Enumeration variants = Transliterator.getAvailableVariants(source, target);
85                while(variants.hasMoreElements()) {
86                    String variant = (String) variants.nextElement();
87                    String id = source + "-" + target;
88                    if (variant.length() != 0) {
89                        id += "/" + variant;
90                        if (false) {
91                            System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
92                            continue;
93                        }
94                    }
95                    System.out.println("\t\t\t\tAdding: '" + id + "'");
96                    set.add(id);
97                }
98            }
99        }
100        String[] results = new String[set.size()];
101        set.toArray(results);
102        return results;
103    }
104
105    static public int priority(int script) {
106        if (script == UScript.LATIN) return -2;
107        return script;
108    }
109
110    public static String showScripts(int[] scripts) {
111        StringBuffer results = new StringBuffer();
112        for (int i = 0; i < scripts.length; ++i) {
113            if (i != 0) results.append(", ");
114            results.append(UScript.getName(scripts[i]));
115        }
116        return results.toString();
117    }
118
119    public static void print(String testSet, String rawId) throws IOException {
120        System.out.println("Processing " + rawId);
121        Transliterator t = Transliterator.getInstance(rawId);
122        String id = t.getID();
123
124        // clean up IDs. Ought to be API for getting source, target, variant
125        int minusPos = id.indexOf('-');
126        String source = id.substring(0,minusPos);
127        String target = id.substring(minusPos+1);
128        int slashPos = target.indexOf('/');
129        if (slashPos >= 0) target = target.substring(0,slashPos);
130
131        // check that the source is a script
132        if (testSet.equals("")) {
133            int[] scripts = UScript.getCode(source);
134            if (scripts == null) {
135                System.out.println("FAILED: "
136                    + Transliterator.getDisplayName(id)
137                    + " does not have a script as the source");
138                return;
139            } else {
140                testSet = "[:" + source + ":]";
141                if (source.equalsIgnoreCase("katakana")) {
142                    testSet = "[" + testSet + "\u30FC]";
143                    printSet(testSet);
144                }
145            }
146        }
147        UnicodeSet sourceSet = new UnicodeSet(testSet);
148
149        // check that the target is a script
150        int[] scripts = UScript.getCode(target);
151        if (scripts == null) {
152            target = "[:Latin:]";
153        } else {
154            target = "[:" + target + ":]";
155        }
156        UnicodeSet targetSet = new UnicodeSet(target);
157
158        Transliterator inverse = t.getInverse();
159
160        //Transliterator hex = Transliterator.getInstance("Any-Hex");
161
162
163        // iterate through script
164        System.out.println("Transliterating " + sourceSet.toPattern(true)
165            + " with " + Transliterator.getDisplayName(id));
166
167        UnicodeSet leftOverSet = new UnicodeSet(targetSet);
168        UnicodeSet privateUse = new UnicodeSet("[:private use:]");
169
170        Map map = new TreeMap();
171
172        UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);
173        targetSetPlusAnyways.addAll(okAnyway);
174
175        UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);
176        sourceSetPlusAnyways.addAll(okAnyway);
177
178        UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);
179
180        while (usi.next()) {
181            int j = usi.codepoint;
182            /*
183        int count = sourceSet.getRangeCount();
184        for (int i = 0; i < count; ++i) {
185            int end = sourceSet.getRangeEnd(i);
186            for (int j = sourceSet.getRangeStart(i); j <= end; ++j) {
187            */
188               // String flag = "";
189                String ss = UTF16.valueOf(j);
190                String ts = t.transliterate(ss);
191                char group = 0;
192                if (!targetSetPlusAnyways.containsAll(ts)) {
193                    group |= 1;
194                }
195                if (UTF16.countCodePoint(ts) == 1) {
196                    leftOverSet.remove(UTF16.charAt(ts,0));
197                }
198                String rt = inverse.transliterate(ts);
199                if (!sourceSetPlusAnyways.containsAll(rt)) {
200                    group |= 2;
201                } else if (!ss.equals(rt)) {
202                    group |= 4;
203                }
204
205                if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) {
206                    group |= 16;
207                }
208
209                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))
210                        + "\u0000" + ss,
211                    "<td class='s'>" + ss + "<br><tt>" + hex(ss)
212                        + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
213                        + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
214
215                // Check Duals
216                /*
217                int maxDual = 200;
218              dual:
219                for (int i2 = 0; i2 < count; ++i2) {
220                    int end2 = sourceSet.getRangeEnd(i2);
221                    for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
222                        String ss2 = UTF16.valueOf(j2);
223                        String ts2 = t.transliterate(ss2);
224                        String rt2 = inverse.transliterate(ts2);
225
226                        String ss12 = ss + ss2;
227                        String ts12 = t.transliterate(ss + ss12);
228                        String rt12 = inverse.transliterate(ts12);
229                        if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;
230                        if (--maxDual < 0) break dual;
231
232                        // transliteration of whole differs from that of parts
233                        group = 0x100;
234                        map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
235                                + "\u0000" + ss12,
236                            "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
237                                + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
238                                + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
239                    }
240                }
241                */
242            //}
243        }
244
245
246        leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
247
248        /*int count = leftOverSet.getRangeCount();
249        for (int i = 0; i < count; ++i) {
250            int end = leftOverSet.getRangeEnd(i);
251            for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) {
252            */
253
254        usi.reset(leftOverSet);
255        while (usi.next()) {
256            int j = usi.codepoint;
257
258                String ts = UTF16.valueOf(j);
259                // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0);
260                // if (!decomp.equals(ts)) continue;
261
262                String rt = inverse.transliterate(ts);
263                // String flag = "";
264                char group = 0x80;
265
266                if (!sourceSetPlusAnyways.containsAll(rt)) {
267                    group |= 8;
268                }
269                if (!privateUse.containsNone(rt)) {
270                    group |= 16;
271                }
272
273                map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts,
274                    "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
275                    + "</tt></td><td class='r'>"
276                    + rt + "<br><tt>" + hex(rt) + "</tt></td>");
277            //}
278        }
279
280        // make file name and open
281        File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
282        String filename = f.getCanonicalFile().toString();
283        PrintWriter out = new PrintWriter(
284            new OutputStreamWriter(
285                new FileOutputStream(filename), "UTF-8"));
286        //out.print('\uFEFF'); // BOM
287
288        System.out.println("Writing " + filename);
289
290        try {
291            out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
292            out.println("<HTML><HEAD>");
293            out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
294            out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
295
296            out.println("<BODY>");
297            out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
298            out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
299            out.println("The samples are mechanically generated, and only include single characters");
300            out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
301            out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
302            out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>");
303
304            // set up the headers
305            int columnCount = 3;
306            String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
307            String headers = headerBase;
308            for (int i = columnCount - 1; i > 0; --i) {
309                if (i != columnCount - 1) headers += "<th>&nbsp;</th>";
310                headers += headerBase;
311            }
312
313            String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
314            String tableFooter = "</table></p>";
315            out.println("<h2>Round Trip</h2>");
316            out.println(tableHeader);
317
318            Iterator it = map.keySet().iterator();
319            char lastGroup = 0;
320            int count = 0;
321            int column = 0;
322            while (it.hasNext()) {
323                String key = (String) it.next();
324                char group = key.charAt(0);
325                if (group != lastGroup || count++ > 50) {
326                    lastGroup = group;
327                    count = 0;
328                    if (column != 0) {
329                        out.println("</tr>");
330                        column = 0;
331                    }
332                    out.println(tableFooter);
333
334                    // String title = "";
335                    if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
336                    else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
337                    else out.println("<hr><h2>Round Trip</h2>");
338                    if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
339                    if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
340                    if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
341                    if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
342                    if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
343
344                    out.println(tableHeader);
345                    column = 0;
346                }
347                String value = (String) map.get(key);
348                if (column++ == 0) out.print("<tr>");
349                else out.print("<th>&nbsp;</th>");
350                out.println(value);
351                if (column == 3) {
352                    out.println("</tr>");
353                    column = 0;
354                }
355            }
356            if (column != 0) {
357                out.println("</tr>");
358                column = 0;
359            }
360            out.println(tableFooter + "</BODY></HTML>");
361
362        } finally {
363            out.close();
364        }
365    }
366
367    public static String hex(String s) {
368        int cp;
369        StringBuffer results = new StringBuffer();
370        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
371            cp = UTF16.charAt(s, i);
372            if (i != 0) results.append(' ');
373            results.append(Integer.toHexString(cp));
374        }
375        return results.toString().toUpperCase();
376    }
377
378    static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
379
380    /*
381    // tests whether a string is in a set. Also checks for Common and Inherited
382    public static boolean isIn(String s, UnicodeSet set) {
383        int cp;
384        for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
385            cp = UTF16.charAt(s, i);
386            if (set.contains(cp)) continue;
387            if (okAnyway.contains(cp)) continue;
388            return false;
389        }
390        return true;
391    }
392    */
393
394}
395