1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/**
4 *******************************************************************************
5 * Copyright (C) 2000-2010, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.dev.test.translit;
10
11import com.ibm.icu.text.UTF16;
12import com.ibm.icu.text.UnicodeSet;
13
14public final class TestUtility {
15
16    public static String hex(char ch) {
17        String foo = Integer.toString(ch,16).toUpperCase();
18        return "0000".substring(0,4-foo.length()) + foo;
19    }
20
21    public static String hex(int ch) {
22        String foo = Integer.toString(ch,16).toUpperCase();
23        return "00000000".substring(0,4-foo.length()) + foo;
24    }
25
26    public static String hex(String s) {
27      return hex(s,",");
28    }
29
30    public static String hex(String s, String sep) {
31      if (s.length() == 0) return "";
32      String result = hex(s.charAt(0));
33      for (int i = 1; i < s.length(); ++i) {
34        result += sep;
35        result += hex(s.charAt(i));
36      }
37      return result;
38    }
39
40    public static String replace(String source, String toBeReplaced, String replacement) {
41        StringBuffer results = new StringBuffer();
42        int len = toBeReplaced.length();
43        for (int i = 0; i < source.length(); ++i) {
44            if (source.regionMatches(false, i, toBeReplaced, 0, len)) {
45                results.append(replacement);
46                i += len - 1; // minus one, since we will increment
47            } else {
48                results.append(source.charAt(i));
49            }
50        }
51        return results.toString();
52    }
53
54    public static String replaceAll(String source, UnicodeSet set, String replacement) {
55        StringBuffer results = new StringBuffer();
56        int cp;
57        for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
58            cp = UTF16.charAt(source,i);
59            if (set.contains(cp)) {
60                results.append(replacement);
61            } else {
62                UTF16.append(results, cp);
63            }
64        }
65        return results.toString();
66    }
67
68    // COMMENTED OUT ALL THE OLD SCRIPT STUFF
69    /*
70    public static byte getScript(char c) {
71      return getScript(getBlock(c));
72    }
73
74    public static byte getScript(byte block) {
75      return blockToScript[block];
76    }
77
78    public static byte getBlock(char c) {
79      int index = c >> 7;
80      byte block = charToBlock[index];
81      while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
82          int[] tuple = split[-block-1];
83          if (c < tuple[0]) block = (byte)tuple[1];
84          else block = (byte)tuple[2];
85      }
86      return block;
87    }
88
89    // returns next letter of script, or 0xFFFF if done
90
91    public static char getNextLetter(char c, byte script) {
92        while (c < 0xFFFF) {
93            ++c;
94            if (getScript(c) == script && Character.isLetter(c)) {
95                return c;
96            }
97        }
98        return c;
99    }
100
101    // Supplements to Character methods; these methods go through
102    // UCharacter if possible.  If not, they fall back to Character.
103
104    public static boolean isUnassigned(char c) {
105        try {
106            return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
107        } catch (NullPointerException e) {
108            System.out.print("");
109        }
110        return Character.getType(c) == Character.UNASSIGNED;
111    }
112
113    public static boolean isLetter(char c) {
114        try {
115            return UCharacter.isLetter(c);
116        } catch (NullPointerException e) {
117            System.out.print("");
118        }
119        return Character.isLetter(c);
120    }
121
122  public static void main(String[] args) {
123    System.out.println("Blocks: ");
124    byte lastblock = -128;
125    for (char cc = 0; cc < 0xFFFF; ++cc) {
126      byte block = TestUtility.getBlock(cc);
127      if (block != lastblock) {
128        System.out.println(TestUtility.hex(cc) + "\t" + block);
129        lastblock = block;
130      }
131    }
132    System.out.println();
133    System.out.println("Scripts: ");
134    byte lastScript = -128;
135    for (char cc = 0; cc < 0xFFFF; ++cc) {
136      byte script = TestUtility.getScript(cc);
137      if (script != lastScript) {
138        System.out.println(TestUtility.hex(cc) + "\t" + script);
139        lastScript = script;
140      }
141    }
142  }
143
144
145
146    public static final byte // SCRIPT CODE
147        COMMON_SCRIPT = 0,
148        LATIN_SCRIPT = 1,
149        GREEK_SCRIPT = 2,
150        CYRILLIC_SCRIPT = 3,
151        ARMENIAN_SCRIPT = 4,
152        HEBREW_SCRIPT = 5,
153        ARABIC_SCRIPT = 6,
154        SYRIAC_SCRIPT = 7,
155        THAANA_SCRIPT = 8,
156        DEVANAGARI_SCRIPT = 9,
157        BENGALI_SCRIPT = 10,
158        GURMUKHI_SCRIPT = 11,
159        GUJARATI_SCRIPT = 12,
160        ORIYA_SCRIPT = 13,
161        TAMIL_SCRIPT = 14,
162        TELUGU_SCRIPT = 15,
163        KANNADA_SCRIPT = 16,
164        MALAYALAM_SCRIPT = 17,
165        SINHALA_SCRIPT = 18,
166        THAI_SCRIPT = 19,
167        LAO_SCRIPT = 20,
168        TIBETAN_SCRIPT = 21,
169        MYANMAR_SCRIPT = 22,
170        GEORGIAN_SCRIPT = 23,
171        JAMO_SCRIPT = 24,
172        HANGUL_SCRIPT = 25,
173        ETHIOPIC_SCRIPT = 26,
174        CHEROKEE_SCRIPT = 27,
175        ABORIGINAL_SCRIPT = 28,
176        OGHAM_SCRIPT = 29,
177        RUNIC_SCRIPT = 30,
178        KHMER_SCRIPT = 31,
179        MONGOLIAN_SCRIPT = 32,
180        HIRAGANA_SCRIPT = 33,
181        KATAKANA_SCRIPT = 34,
182        BOPOMOFO_SCRIPT = 35,
183        HAN_SCRIPT = 36,
184        YI_SCRIPT = 37;
185
186    public static final byte // block code
187        RESERVED_BLOCK = 0,
188        BASIC_LATIN = 1,
189        LATIN_1_SUPPLEMENT = 2,
190        LATIN_EXTENDED_A = 3,
191        LATIN_EXTENDED_B = 4,
192        IPA_EXTENSIONS = 5,
193        SPACING_MODIFIER_LETTERS = 6,
194        COMBINING_DIACRITICAL_MARKS = 7,
195        GREEK = 8,
196        CYRILLIC = 9,
197        ARMENIAN = 10,
198        HEBREW = 11,
199        ARABIC = 12,
200        SYRIAC = 13,
201        THAANA = 14,
202        DEVANAGARI = 15,
203        BENGALI = 16,
204        GURMUKHI = 17,
205        GUJARATI = 18,
206        ORIYA = 19,
207        TAMIL = 20,
208        TELUGU = 21,
209        KANNADA = 22,
210        MALAYALAM = 23,
211        SINHALA = 24,
212        THAI = 25,
213        LAO = 26,
214        TIBETAN = 27,
215        MYANMAR = 28,
216        GEORGIAN = 29,
217        HANGUL_JAMO = 30,
218        ETHIOPIC = 31,
219        CHEROKEE = 32,
220        UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
221        OGHAM = 34,
222        RUNIC = 35,
223        KHMER = 36,
224        MONGOLIAN = 37,
225        LATIN_EXTENDED_ADDITIONAL = 38,
226        GREEK_EXTENDED = 39,
227        GENERAL_PUNCTUATION = 40,
228        SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
229        CURRENCY_SYMBOLS = 42,
230        COMBINING_MARKS_FOR_SYMBOLS = 43,
231        LETTERLIKE_SYMBOLS = 44,
232        NUMBER_FORMS = 45,
233        ARROWS = 46,
234        MATHEMATICAL_OPERATORS = 47,
235        MISCELLANEOUS_TECHNICAL = 48,
236        CONTROL_PICTURES = 49,
237        OPTICAL_CHARACTER_RECOGNITION = 50,
238        ENCLOSED_ALPHANUMERICS = 51,
239        BOX_DRAWING = 52,
240        BLOCK_ELEMENTS = 53,
241        GEOMETRIC_SHAPES = 54,
242        MISCELLANEOUS_SYMBOLS = 55,
243        DINGBATS = 56,
244        BRAILLE_PATTERNS = 57,
245        CJK_RADICALS_SUPPLEMENT = 58,
246        KANGXI_RADICALS = 59,
247        IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
248        CJK_SYMBOLS_AND_PUNCTUATION = 61,
249        HIRAGANA = 62,
250        KATAKANA = 63,
251        BOPOMOFO = 64,
252        HANGUL_COMPATIBILITY_JAMO = 65,
253        KANBUN = 66,
254        BOPOMOFO_EXTENDED = 67,
255        ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
256        CJK_COMPATIBILITY = 69,
257        CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
258        CJK_UNIFIED_IDEOGRAPHS = 71,
259        YI_SYLLABLES = 72,
260        YI_RADICALS = 73,
261        HANGUL_SYLLABLES = 74,
262        HIGH_SURROGATES = 75,
263        HIGH_PRIVATE_USE_SURROGATES = 76,
264        LOW_SURROGATES = 77,
265        PRIVATE_USE = 78,
266        CJK_COMPATIBILITY_IDEOGRAPHS = 79,
267        ALPHABETIC_PRESENTATION_FORMS = 80,
268        ARABIC_PRESENTATION_FORMS_A = 81,
269        COMBINING_HALF_MARKS = 82,
270        CJK_COMPATIBILITY_FORMS = 83,
271        SMALL_FORM_VARIANTS = 84,
272        ARABIC_PRESENTATION_FORMS_B = 85,
273        SPECIALS = 86,
274        HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
275
276    static final byte[] blockToScript = {
277        COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
278        LATIN_SCRIPT, // 1, BASIC_LATIN
279        LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
280        LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
281        LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
282        LATIN_SCRIPT, // 5, IPA_EXTENSIONS
283        COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
284        COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
285        GREEK_SCRIPT, // 8, GREEK
286        CYRILLIC_SCRIPT, // 9, CYRILLIC
287        ARMENIAN_SCRIPT, // 10, ARMENIAN
288        HEBREW_SCRIPT, // 11, HEBREW
289        ARABIC_SCRIPT, // 12, ARABIC
290        SYRIAC_SCRIPT, // 13, SYRIAC
291        THAANA_SCRIPT, // 14, THAANA
292        DEVANAGARI_SCRIPT, // 15, DEVANAGARI
293        BENGALI_SCRIPT, // 16, BENGALI
294        GURMUKHI_SCRIPT, // 17, GURMUKHI
295        GUJARATI_SCRIPT, // 18, GUJARATI
296        ORIYA_SCRIPT, // 19, ORIYA
297        TAMIL_SCRIPT, // 20, TAMIL
298        TELUGU_SCRIPT, // 21, TELUGU
299        KANNADA_SCRIPT, // 22, KANNADA
300        MALAYALAM_SCRIPT, // 23, MALAYALAM
301        SINHALA_SCRIPT, // 24, SINHALA
302        THAI_SCRIPT, // 25, THAI
303        LAO_SCRIPT, // 26, LAO
304        TIBETAN_SCRIPT, // 27, TIBETAN
305        MYANMAR_SCRIPT, // 28, MYANMAR
306        GEORGIAN_SCRIPT, // 29, GEORGIAN
307        JAMO_SCRIPT, // 30, HANGUL_JAMO
308        ETHIOPIC_SCRIPT, // 31, ETHIOPIC
309        CHEROKEE_SCRIPT, // 32, CHEROKEE
310        ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
311        OGHAM_SCRIPT, // 34, OGHAM
312        RUNIC_SCRIPT, // 35, RUNIC
313        KHMER_SCRIPT, // 36, KHMER
314        MONGOLIAN_SCRIPT, // 37, MONGOLIAN
315        LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
316        GREEK_SCRIPT, // 39, GREEK_EXTENDED
317        COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
318        COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
319        COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
320        COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
321        COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
322        COMMON_SCRIPT, // 45, NUMBER_FORMS
323        COMMON_SCRIPT, // 46, ARROWS
324        COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
325        COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
326        COMMON_SCRIPT, // 49, CONTROL_PICTURES
327        COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
328        COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
329        COMMON_SCRIPT, // 52, BOX_DRAWING
330        COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
331        COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
332        COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
333        COMMON_SCRIPT, // 56, DINGBATS
334        COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
335        HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
336        HAN_SCRIPT, // 59, KANGXI_RADICALS
337        HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
338        COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
339        HIRAGANA_SCRIPT, // 62, HIRAGANA
340        KATAKANA_SCRIPT, // 63, KATAKANA
341        BOPOMOFO_SCRIPT, // 64, BOPOMOFO
342        JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
343        HAN_SCRIPT, // 66, KANBUN
344        BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
345        COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
346        COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
347        HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
348        HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
349        YI_SCRIPT, // 72, YI_SYLLABLES
350        YI_SCRIPT, // 73, YI_RADICALS
351        HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
352        COMMON_SCRIPT, // 75, HIGH_SURROGATES
353        COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
354        COMMON_SCRIPT, // 77, LOW_SURROGATES
355        COMMON_SCRIPT, // 78, PRIVATE_USE
356        HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
357        COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
358        ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
359        COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
360        COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
361        COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
362        ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
363        COMMON_SCRIPT, // 86, SPECIALS
364        COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
365        COMMON_SCRIPT, // 88, SPECIALS
366    };
367
368    // could be further reduced to a byte array, but I didn't bother.
369    static final int[][] split = {
370        {0x0250, 4, 5}, // -1
371        {0x02B0, 5, 6}, // -2
372        {0x0370, 7, 8}, // -3
373        {0x0530, 0, 10}, // -4
374        {0x0590, 10, 11}, // -5
375        {0x0750, 13, 0}, // -6
376        {0x07C0, 14, 0}, // -7
377        {0x10A0, 28, 29}, // -8
378        {0x13A0, 0, 32}, // -9
379        {0x16A0, 34, 35}, // -10
380        {0x18B0, 37, 0}, // -11
381        {0x2070, 40, 41}, // -12
382        {0x20A0, 41, -31}, // -13
383        {0x2150, 44, 45}, // -14
384        {0x2190, 45, 46}, // -15
385        {0x2440, 49, -32}, // -16
386        {0x25A0, 53, 54}, // -17
387        {0x27C0, 56, 0}, // -18
388        {0x2FE0, 59, -33}, // -19
389        {0x3040, 61, 62}, // -20
390        {0x30A0, 62, 63}, // -21
391        {0x3130, 64, 65}, // -22
392        {0x3190, 65, -34}, // -23
393        {0x4DB6, 70, 0}, // -24
394        {0xA490, 72, -35}, // -25
395        {0xD7A4, 74, 0}, // -26
396        {0xFB50, 80, 81}, // -27
397        {0xFE20, 0, -36}, // -28
398        {0xFEFF, 85, 86}, // -29
399        {0xFFF0, 87, -37}, // -30
400        {0x20D0, 42, 43}, // -31
401        {0x2460, 50, 51}, // -32
402        {0x2FF0, 0, 60}, // -33
403        {0x31A0, 66, -38}, // -34
404        {0xA4D0, 73, 0}, //-35
405        {0xFE30, 82, -39}, //-36
406        {0xFFFE, 88, 0}, //-37
407        {0x31C0, 67, 0}, // -38
408        {0xFE50, 83, -40}, //-39
409        {0xFE70, 84, 85} // -40
410    };
411
412    static final byte[] charToBlock = {
413      1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
414      0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
415      28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
416      37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
417      -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
418      57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
419      -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
420      70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
421      70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
422      70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
423      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
424      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
425      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
426      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
427      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
428      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
429      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
430      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
431      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
432      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
433      72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
434      0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
435      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
436      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
437      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
438      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
439      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
440      75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
441      78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
442      78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
443      78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
444      78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
445    };
446    */
447}
448