// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html#License /* ******************************************************************************* * Copyright (C) 1996-2010, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.translit; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.Enumeration; import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; public class WriteCharts { public static void main(String[] args) throws IOException { if (false) { printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]"); } String testSet = ""; if (args.length == 0) args = getAllScripts(); for (int i = 0; i < args.length; ++i) { // Enumeration enum = Transliterator.getAvailableIDs(); if (args[i].startsWith("[")) { testSet = args[i]; } else { print(testSet, args[i]); testSet = ""; } } } public static void printSet(String source) { UnicodeSet s = new UnicodeSet(source); System.out.println("Printout for '" + source + "'"); int count = s.getRangeCount(); for (int i = 0; i < count; ++i) { int start = s.getRangeStart(i); int end = s.getRangeEnd(i); System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16)); } } public static String[] getAllScripts() { Set set = new TreeSet(); int scripts[]; Enumeration sources = Transliterator.getAvailableSources(); while(sources.hasMoreElements()) { String source = (String) sources.nextElement(); scripts = UScript.getCode(source); if (scripts == null) { System.out.println("[Skipping " + source + "]"); continue; } int sourceScript = scripts[0]; System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts)); Enumeration targets = Transliterator.getAvailableTargets(source); while(targets.hasMoreElements()) { String target = (String) targets.nextElement(); scripts = UScript.getCode(target); if (scripts == null || priority(scripts[0]) < priority(sourceScript)) { // skip doing both directions System.out.println("[Skipping '" + source + "-" + target + "']"); continue; } System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts)); Enumeration variants = Transliterator.getAvailableVariants(source, target); while(variants.hasMoreElements()) { String variant = (String) variants.nextElement(); String id = source + "-" + target; if (variant.length() != 0) { id += "/" + variant; if (false) { System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id); continue; } } System.out.println("\t\t\t\tAdding: '" + id + "'"); set.add(id); } } } String[] results = new String[set.size()]; set.toArray(results); return results; } static public int priority(int script) { if (script == UScript.LATIN) return -2; return script; } public static String showScripts(int[] scripts) { StringBuffer results = new StringBuffer(); for (int i = 0; i < scripts.length; ++i) { if (i != 0) results.append(", "); results.append(UScript.getName(scripts[i])); } return results.toString(); } public static void print(String testSet, String rawId) throws IOException { System.out.println("Processing " + rawId); Transliterator t = Transliterator.getInstance(rawId); String id = t.getID(); // clean up IDs. Ought to be API for getting source, target, variant int minusPos = id.indexOf('-'); String source = id.substring(0,minusPos); String target = id.substring(minusPos+1); int slashPos = target.indexOf('/'); if (slashPos >= 0) target = target.substring(0,slashPos); // check that the source is a script if (testSet.equals("")) { int[] scripts = UScript.getCode(source); if (scripts == null) { System.out.println("FAILED: " + Transliterator.getDisplayName(id) + " does not have a script as the source"); return; } else { testSet = "[:" + source + ":]"; if (source.equalsIgnoreCase("katakana")) { testSet = "[" + testSet + "\u30FC]"; printSet(testSet); } } } UnicodeSet sourceSet = new UnicodeSet(testSet); // check that the target is a script int[] scripts = UScript.getCode(target); if (scripts == null) { target = "[:Latin:]"; } else { target = "[:" + target + ":]"; } UnicodeSet targetSet = new UnicodeSet(target); Transliterator inverse = t.getInverse(); //Transliterator hex = Transliterator.getInstance("Any-Hex"); // iterate through script System.out.println("Transliterating " + sourceSet.toPattern(true) + " with " + Transliterator.getDisplayName(id)); UnicodeSet leftOverSet = new UnicodeSet(targetSet); UnicodeSet privateUse = new UnicodeSet("[:private use:]"); Map map = new TreeMap(); UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet); targetSetPlusAnyways.addAll(okAnyway); UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet); sourceSetPlusAnyways.addAll(okAnyway); UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet); while (usi.next()) { int j = usi.codepoint; /* int count = sourceSet.getRangeCount(); for (int i = 0; i < count; ++i) { int end = sourceSet.getRangeEnd(i); for (int j = sourceSet.getRangeStart(i); j <= end; ++j) { */ // String flag = ""; String ss = UTF16.valueOf(j); String ts = t.transliterate(ss); char group = 0; if (!targetSetPlusAnyways.containsAll(ts)) { group |= 1; } if (UTF16.countCodePoint(ts) == 1) { leftOverSet.remove(UTF16.charAt(ts,0)); } String rt = inverse.transliterate(ts); if (!sourceSetPlusAnyways.containsAll(rt)) { group |= 2; } else if (!ss.equals(rt)) { group |= 4; } if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) { group |= 16; } map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD)) + "\u0000" + ss, "
This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + "."); out.println("The samples are mechanically generated, and only include single characters"); out.println("from the source set. Thus it will not contain examples where the transliteration"); out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the"); out.println("Transliteration Demo