1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4**********************************************************************
5* Copyright (c) 2003-2012, International Business Machines
6* Corporation and others.  All Rights Reserved.
7**********************************************************************
8* Author: Alan Liu
9* Created: February 11 2003
10* Since: ICU 2.6
11**********************************************************************
12*/
13package com.ibm.icu.dev.tool.translit;
14
15import java.io.FileOutputStream;
16import java.io.IOException;
17import java.io.PrintStream;
18import java.util.Collections;
19import java.util.Comparator;
20import java.util.HashMap;
21import java.util.Iterator;
22import java.util.Locale;
23import java.util.Map;
24import java.util.Set;
25import java.util.TreeSet;
26import java.util.Vector;
27
28import com.ibm.icu.impl.Utility;
29import com.ibm.icu.lang.UCharacter;
30import com.ibm.icu.text.BreakIterator;
31import com.ibm.icu.text.UTF16;
32import com.ibm.icu.text.UnicodeSet;
33
34/**
35 * This class produces the data tables used by the closeOver() method
36 * of UnicodeSet.
37 *
38 * Whenever the Unicode database changes, this tool must be re-run
39 * (AFTER the data file(s) underlying ICU4J are udpated).
40 *
41 * The output of this tool should then be pasted into the appropriate
42 * files:
43 *
44 * ICU4J: com.ibm.icu.text.UnicodeSet.java
45 * ICU4C: /icu/source/common/uniset.cpp
46 */
47class UnicodeSetCloseOver {
48
49    // Our output files
50    static final String JAVA_OUT          = "to_UnicodeSet.java";
51    static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
52    static final String C_SET_OUT         = "to_uniset.cpp";
53    static final String C_UCHAR_OUT       = "to_uchar.c";
54
55    // Source code "do not edit" warning
56    static final String WARNING = "MACHINE-GENERATED; Unicode version " +
57        UCharacter.getUnicodeVersion() +
58        "; DO NOT EDIT; See " +
59        UnicodeSetCloseOver.class.getName();
60
61    // Case folding options flag.  This must correspond to the options
62    // used in UnicodeSet.closeOver() in Java and C++.
63    static final boolean DEFAULT_CASE_MAP = true; // false for Turkish
64
65    public static void main(String[] args) throws IOException {
66        System.out.println("This tool will generate several output files.  Each is named according");
67        System.out.println("the target file.  For example, the contents of to_UnicodeSet.java should");
68        System.out.println("be pasted into UnicodeSet.java.");
69        System.out.println();
70
71        generateCaseData();
72    }
73
74    /**
75     * Create a map of String => Set.  The String in this case is a
76     * folded string for which
77     * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
78     * The Set contains all single-character strings x for which
79     * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
80     * well as folded itself.
81     */
82    static Map createCaseFoldEquivalencyClasses() {
83        Map equivClasses = new HashMap();
84        for (int i = 0; i <= 0x10FFFF; ++i) {
85            int cat = UCharacter.getType(i);
86            if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE)
87                continue;
88
89            String cp = UTF16.valueOf(i);
90            String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
91            if (folded.equals(cp)) continue;
92
93            // At this point, have different case folding.  Add
94            // the code point and its folded equivalent into the
95            // equivalency class.
96            TreeSet s = (TreeSet) equivClasses.get(folded);
97            if (s == null) {
98                s = new TreeSet();
99                s.add(folded); // add the case fold result itself
100                equivClasses.put(folded, s);
101            }
102            s.add(cp);
103        }
104        return equivClasses;
105    }
106
107    /**
108     * Analyze the case fold equivalency classes.  Break them into two
109     * groups: 'pairs', and 'nonpairs'.  Create a tally of the length
110     * configurations of the nonpairs.
111     *
112     * Length configurations of equivalency classes, as of Unicode
113     * 3.2.  Most of the classes (83%) have two single codepoints.
114     * Here "112:28" means there are 28 equivalency classes with 2
115     * single codepoints and one string of length 2.
116     *
117     * 11:656
118     * 111:16
119     * 1111:3
120     * 112:28
121     * 113:2
122     * 12:31
123     * 13:12
124     * 22:38
125     *
126     * Note: This method does not count the frequencies of the
127     * different length configurations (as shown above after ':'); it
128     * merely records which configurations occur.
129     *
130     * @param pairs Accumulate equivalency classes that consist of
131     * exactly two codepoints here.  This is 83+% of the classes.
132     * E.g., {"a", "A"}.
133     * @param nonpairs Accumulate other equivalency classes here, as
134     * lists of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
135     * @param lengths Accumulate a list of unique length structures,
136     * not including pairs.  Each length structure is represented by a
137     * string of digits.  The digit string "12" means the equivalency
138     * class contains a single code point and a string of length 2.
139     * Typical contents of 'lengths': { "111", "1111", "112",
140     * "113", "12", "13", "22" }.  Note the absence of "11".
141     */
142    static void analyzeCaseData(Map equivClasses,
143                                StringBuffer pairs,
144                                Vector nonpairs,
145                                Vector lengths) {
146        Iterator i = new TreeSet(equivClasses.keySet()).iterator();
147        StringBuffer buf = new StringBuffer();
148        while (i.hasNext()) {
149            Object key = i.next();
150            Vector v = new Vector((Set) equivClasses.get(key));
151            if (v.size() == 2) {
152                String a = (String) v.elementAt(0);
153                String b = (String) v.elementAt(1);
154                if (a.length() == 1 && b.length() == 1) {
155                    pairs.append(a).append(b);
156                    continue;
157                    // Note that pairs are included in 'lengths'
158                }
159            }
160            String[] a = new String[v.size()];
161            v.toArray(a);
162            nonpairs.add(a);
163            //int singleCount = 0;
164            //int stringCount = 0;
165            // Make a string of the lengths, e.g., "111" means 3
166            // single code points; "13" means a single code point
167            // and a string of length 3.
168            v.clear();
169            for (int j=0; j<a.length; ++j) {
170                v.add(new Integer(a[j].length()));
171            }
172            Collections.sort(v);
173            buf.setLength(0);
174            for (int j=0; j<v.size(); ++j) {
175                buf.append(String.valueOf(v.elementAt(j)));
176            }
177            if (!lengths.contains(buf.toString())) {
178                lengths.add(buf.toString());
179            }
180        }
181    }
182
183    @SuppressWarnings("resource")
184    static void generateCaseData() throws IOException {
185
186        Map equivClasses = createCaseFoldEquivalencyClasses();
187
188        // Accumulate equivalency classes that consist of exactly
189        // two codepoints here.  This is 83+% of the classes.
190        // E.g., {"a", "A"}.
191        StringBuffer pairs = new StringBuffer();
192
193        // Accumulate other equivalency classes here, as lists
194        // of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
195        Vector nonpairs = new Vector(); // contains String[]
196        Vector lengths = new Vector(); // "111", "12", "22", etc.
197
198        analyzeCaseData(equivClasses, pairs, nonpairs, lengths);
199
200        //-------------------------------------------------------------
201        // Emit Java source
202        PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT));
203        System.out.println("Writing " + JAVA_OUT);
204
205        out.println("    // " + WARNING);
206        out.println("    private static final String CASE_PAIRS =");
207        out.println(Utility.formatForSource(pairs.toString()) + ";");
208        out.println();
209        out.println("    // " + WARNING);
210        out.println("    private static final String[][] CASE_NONPAIRS = {");
211        for (int j=0; j<nonpairs.size(); ++j) {
212            String[] a = (String[]) nonpairs.elementAt(j);
213            out.print("        {");
214            for (int k=0; k<a.length; ++k) {
215                if (k != 0) out.print(", ");
216                out.print(Utility.format1ForSource(a[k]));
217            }
218            out.println("},");
219        }
220        out.println("    };");
221
222        //-------------------------------------------------------------
223        // Emit C++ source
224
225        // In C++, the pairs are again emitted in an array, but this
226        // array is the final representation form -- it will not be
227        // reprocessed into a hash.  It will be binary searched by
228        // looking at the even elements [0], [2], [4], etc., and
229        // ignoring the odd elements.  The even elements must contain
230        // the folded members of the pairs.  That is, in the pair
231        // {'A', 'a'}, the even element must be 'a', not 'A'.  Then a
232        // code point to be located is first folded ('Y' => 'y') then
233        // it binary searched against [0]='A', [2]='B', etc.  When a
234        // match is found at k, the pair is [k], [k+1].
235
236        out = new PrintStream(new FileOutputStream(C_SET_OUT));
237        System.out.println("Writing " + C_SET_OUT);
238
239        // Sort the pairs.  They must be ordered by the folded element.
240        // Store these as two-character strings, with charAt(0) being
241        // the folded member of the pair.
242        TreeSet sortPairs = new TreeSet(new Comparator() {
243            public int compare(Object a, Object b) {
244                return ((int) ((String) a).charAt(0)) -
245                       ((int) ((String) b).charAt(0));
246            }
247            public boolean equals(Object obj) {
248                return false;
249            }
250        });
251        for (int i=0; i<pairs.length(); i+=2) {
252            String a = String.valueOf(pairs.charAt(i));
253            String b = String.valueOf(pairs.charAt(i+1));
254            String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
255            if (a.equals(folded)) {
256                sortPairs.add(a + b);
257            } else {
258                sortPairs.add(b + a);
259            }
260        }
261
262        // Emit the pairs
263        out.println("// " + WARNING);
264        out.println("static const UChar CASE_PAIRS[] = {");
265        Iterator it = sortPairs.iterator();
266        while (it.hasNext()) {
267            out.print("    ");
268            int n = 0;
269            while (n++ < 5 && it.hasNext()) {
270                String s = (String) it.next();
271                //out.print((int) s.charAt(0) + "," +
272                //                 (int) s.charAt(1) + ",");
273                out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" +
274                                 Utility.hex(s.charAt(1)) + ",");
275            }
276            out.println();
277        }
278        out.println("};");
279        out.println();
280
281        // The non-pairs are encoded in the following way.  All the
282        // single codepoints in each class are grouped together
283        // followed by a zero.  Then each multi-character string is
284        // added, followed by a zero.  Finally, another zero is added.
285        // Some examples:
286        //  {"iQ", "R"}           =>  [ 'R', 0, 'i', 'Q', 0, 0 ]
287        //  {"S", "D", "F", "G"}  =>  [ 'S', 'D', 'F', 'G', 0, 0 ]
288        //  {"jW", "jY"}          =>  [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
289        // The end-result is a short, flat array of UChar values that
290        // can be used to initialize a UChar[] array in C.
291
292        int maxLen = 0; // Maximum encoded length of any class, including zeros
293        out.println("// " + WARNING);
294        out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
295        for (int j=0; j<nonpairs.size(); ++j) {
296            int len = 0;
297            String[] a = (String[]) nonpairs.elementAt(j);
298            out.print("    {");
299            // Emit single code points
300            for (int k=0; k<a.length; ++k) {
301                if (a[k].length() != 1) continue;
302                //out.print((int) a[k].charAt(0) + ",");
303                out.print("0x"+Utility.hex(a[k].charAt(0)) + ",");
304                ++len;
305            }
306            out.print("0,  "); // End of single code points
307            ++len;
308            // Emit multi-character strings
309            for (int k=0; k<a.length; ++k) {
310                if (a[k].length() == 1) continue;
311                for (int m=0; m<a[k].length(); ++m) {
312                    //out.print((int) a[k].charAt(m) + ",");
313                    out.print("0x"+Utility.hex(a[k].charAt(m)) + ",");
314                    ++len;
315                }
316                out.print("0, "); // End of string
317                ++len;
318            }
319            out.println("0},"); // End of equivalency class
320            ++len;
321            if (len > maxLen) maxLen = len;
322        }
323        out.println("};");
324
325        // Make sure the CaseEquivClass data can fit.
326        if (maxLen > 8) {
327            throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars");
328        }
329
330        // Also make sure that we can map into this array using a
331        // CompactByteArray.  We could do this check above, but we
332        // keep it here, adjacent to the maxLen check.  We use one
333        // value (-1 == 255) to indicate "no value."
334        if (nonpairs.size() > 255) {
335            throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
336        }
337
338        //-------------------------------------------------------------
339        // Case-unique set:  All characters c for which closeOver(c)==c.
340
341        // UPDATE: Instead of using this, we're using the related
342        // notion of Case_Sensitive.  See below.  Note that
343        // Case_Sensitive != ^Case_Unique.
344
345        if (false) {
346            UnicodeSet caseUnique = new UnicodeSet();
347            for (int i = 0; i <= 0x10FFFF; ++i) {
348                String cp = UTF16.valueOf(i);
349                if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
350                    caseUnique.add(i);
351                }
352            }
353            // out.println("caseUnique = " + caseUnique.toPattern(true));
354        }
355
356        UnicodeSet caseSensitive = getCaseSensitive();
357        //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));
358
359        // Now for C, emit an array of ranges
360        out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
361        System.out.println("Writing " + C_UCHAR_OUT);
362
363        out.println("/* " + WARNING + " */");
364        emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");
365
366        // For Java, emit a string with the ranges (each pair of chars
367        // in the string is a range).
368        out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
369        System.out.println("Writing " + JAVA_CHARPROP_OUT);
370        out.println("    // " + WARNING);
371        emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
372    }
373
374    /**
375     * Create the set of case-sensitive characters.  These are characters
376     * that participate in any case mapping operation as a source or
377     * as a member of a target string.
378     */
379    static UnicodeSet getCaseSensitive() {
380        UnicodeSet caseSensitive = new UnicodeSet();
381        Locale loc = Locale.US;
382        BreakIterator bi = BreakIterator.getTitleInstance(loc);
383        for (int c = 0; c <= 0x10FFFF; ++c) {
384            String cp = UTF16.valueOf(c);
385            for (int j=0; j<4; ++j) {
386                String s = null;
387                switch (j) {
388                case 0: s = UCharacter.toUpperCase(loc, cp); break;
389                case 1: s = UCharacter.toLowerCase(loc, cp); break;
390                case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
391                case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
392                }
393                if (!s.equals(cp)) {
394                    int cc;
395                    for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) {
396                        cc = UTF16.charAt(s, k);
397                        caseSensitive.add(cc);
398                    }
399                    for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) {
400                        cc = UTF16.charAt(cp, k);
401                        caseSensitive.add(cc);
402                    }
403                }
404            }
405            // Also do the single-codepoint API.  This shouldn't add any
406            // code points, but we do it for completeness.
407            for (int j=0; j<4; ++j) {
408                int d = 0;
409                switch (j) {
410                case 0: d = UCharacter.toUpperCase(c); break;
411                case 1: d = UCharacter.toLowerCase(c); break;
412                case 2: d = UCharacter.toTitleCase(c); break;
413                case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break;
414                }
415                if (d != c) {
416                    if (!caseSensitive.contains(c) ||
417                        !caseSensitive.contains(d)) {
418                        System.out.println("Warning: code point " + c +
419                                           " => " + d + " created NEW MAPPING"+
420                                           " for Case_Sensitive");
421                    }
422                    caseSensitive.add(c);
423                    caseSensitive.add(d);
424                }
425            }
426        }
427        return caseSensitive;
428    }
429
430    /**
431     * Given a UnicodeSet, emit it as an array of UChar pairs.  Each
432     * pair will be the start/end of a range.  Code points >= U+10000
433     * will be represented as surrogate pairs.
434     */
435    static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
436        // Store the pairs in a StringBuffer.  This handles surrogate
437        // representation.
438        StringBuffer buf = new StringBuffer();
439        for (int i=0; i<set.getRangeCount(); ++i) {
440            UTF16.append(buf, set.getRangeStart(i));
441            UTF16.append(buf, set.getRangeEnd(i));
442        }
443        // Emit the pairs
444        out.println("static const UChar " + id + "[] = {");
445        for (int i=0; i<buf.length(); ) {
446            out.print("    ");
447            for (int n=0; n++<10 && i<buf.length(); ++i) {
448                out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
449            }
450            out.println();
451        }
452        out.println("};");
453        out.println("#define " + id + "_LENGTH (sizeof(" + id +
454                    ")/sizeof(" + id + "[0]))");
455    }
456
457    /**
458     * Given a UnicodeSet, emit it as a Java string.  The most economical
459     * format is not the pattern, but instead a pairs list, with each
460     * range pair represented as two adjacent characters.
461     */
462    static void emitRangesString(PrintStream out, UnicodeSet set, String id) {
463        // Store the pairs in a StringBuffer.  This handles surrogate
464        // representation.
465        StringBuffer buf = new StringBuffer();
466        for (int i=0; i<set.getRangeCount(); ++i) {
467            UTF16.append(buf, set.getRangeStart(i));
468            UTF16.append(buf, set.getRangeEnd(i));
469        }
470        // Emit the pairs
471        out.println("    private static final String " + id + " =");
472        out.println(Utility.formatForSource(buf.toString()) + ";");
473    }
474}
475