1/**
2 *******************************************************************************
3 * Copyright (C) 2000-2010, International Business Machines Corporation and    *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7package com.ibm.icu.dev.test.translit;
8
9import com.ibm.icu.text.UTF16;
10import com.ibm.icu.text.UnicodeSet;
11
12public final class TestUtility {
13
14    public static String hex(char ch) {
15        String foo = Integer.toString(ch,16).toUpperCase();
16        return "0000".substring(0,4-foo.length()) + foo;
17    }
18
19    public static String hex(int ch) {
20        String foo = Integer.toString(ch,16).toUpperCase();
21        return "00000000".substring(0,4-foo.length()) + foo;
22    }
23
24    public static String hex(String s) {
25      return hex(s,",");
26    }
27
28    public static String hex(String s, String sep) {
29      if (s.length() == 0) return "";
30      String result = hex(s.charAt(0));
31      for (int i = 1; i < s.length(); ++i) {
32        result += sep;
33        result += hex(s.charAt(i));
34      }
35      return result;
36    }
37
38    public static String replace(String source, String toBeReplaced, String replacement) {
39        StringBuffer results = new StringBuffer();
40        int len = toBeReplaced.length();
41        for (int i = 0; i < source.length(); ++i) {
42            if (source.regionMatches(false, i, toBeReplaced, 0, len)) {
43                results.append(replacement);
44                i += len - 1; // minus one, since we will increment
45            } else {
46                results.append(source.charAt(i));
47            }
48        }
49        return results.toString();
50    }
51
52    public static String replaceAll(String source, UnicodeSet set, String replacement) {
53        StringBuffer results = new StringBuffer();
54        int cp;
55        for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
56            cp = UTF16.charAt(source,i);
57            if (set.contains(cp)) {
58                results.append(replacement);
59            } else {
60                UTF16.append(results, cp);
61            }
62        }
63        return results.toString();
64    }
65
66    // COMMENTED OUT ALL THE OLD SCRIPT STUFF
67    /*
68    public static byte getScript(char c) {
69      return getScript(getBlock(c));
70    }
71
72    public static byte getScript(byte block) {
73      return blockToScript[block];
74    }
75
76    public static byte getBlock(char c) {
77      int index = c >> 7;
78      byte block = charToBlock[index];
79      while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
80          int[] tuple = split[-block-1];
81          if (c < tuple[0]) block = (byte)tuple[1];
82          else block = (byte)tuple[2];
83      }
84      return block;
85    }
86
87    // returns next letter of script, or 0xFFFF if done
88
89    public static char getNextLetter(char c, byte script) {
90        while (c < 0xFFFF) {
91            ++c;
92            if (getScript(c) == script && Character.isLetter(c)) {
93                return c;
94            }
95        }
96        return c;
97    }
98
99    // Supplements to Character methods; these methods go through
100    // UCharacter if possible.  If not, they fall back to Character.
101
102    public static boolean isUnassigned(char c) {
103        try {
104            return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
105        } catch (NullPointerException e) {
106            System.out.print("");
107        }
108        return Character.getType(c) == Character.UNASSIGNED;
109    }
110
111    public static boolean isLetter(char c) {
112        try {
113            return UCharacter.isLetter(c);
114        } catch (NullPointerException e) {
115            System.out.print("");
116        }
117        return Character.isLetter(c);
118    }
119
120  public static void main(String[] args) {
121    System.out.println("Blocks: ");
122    byte lastblock = -128;
123    for (char cc = 0; cc < 0xFFFF; ++cc) {
124      byte block = TestUtility.getBlock(cc);
125      if (block != lastblock) {
126        System.out.println(TestUtility.hex(cc) + "\t" + block);
127        lastblock = block;
128      }
129    }
130    System.out.println();
131    System.out.println("Scripts: ");
132    byte lastScript = -128;
133    for (char cc = 0; cc < 0xFFFF; ++cc) {
134      byte script = TestUtility.getScript(cc);
135      if (script != lastScript) {
136        System.out.println(TestUtility.hex(cc) + "\t" + script);
137        lastScript = script;
138      }
139    }
140  }
141
142
143
144    public static final byte // SCRIPT CODE
145        COMMON_SCRIPT = 0,
146        LATIN_SCRIPT = 1,
147        GREEK_SCRIPT = 2,
148        CYRILLIC_SCRIPT = 3,
149        ARMENIAN_SCRIPT = 4,
150        HEBREW_SCRIPT = 5,
151        ARABIC_SCRIPT = 6,
152        SYRIAC_SCRIPT = 7,
153        THAANA_SCRIPT = 8,
154        DEVANAGARI_SCRIPT = 9,
155        BENGALI_SCRIPT = 10,
156        GURMUKHI_SCRIPT = 11,
157        GUJARATI_SCRIPT = 12,
158        ORIYA_SCRIPT = 13,
159        TAMIL_SCRIPT = 14,
160        TELUGU_SCRIPT = 15,
161        KANNADA_SCRIPT = 16,
162        MALAYALAM_SCRIPT = 17,
163        SINHALA_SCRIPT = 18,
164        THAI_SCRIPT = 19,
165        LAO_SCRIPT = 20,
166        TIBETAN_SCRIPT = 21,
167        MYANMAR_SCRIPT = 22,
168        GEORGIAN_SCRIPT = 23,
169        JAMO_SCRIPT = 24,
170        HANGUL_SCRIPT = 25,
171        ETHIOPIC_SCRIPT = 26,
172        CHEROKEE_SCRIPT = 27,
173        ABORIGINAL_SCRIPT = 28,
174        OGHAM_SCRIPT = 29,
175        RUNIC_SCRIPT = 30,
176        KHMER_SCRIPT = 31,
177        MONGOLIAN_SCRIPT = 32,
178        HIRAGANA_SCRIPT = 33,
179        KATAKANA_SCRIPT = 34,
180        BOPOMOFO_SCRIPT = 35,
181        HAN_SCRIPT = 36,
182        YI_SCRIPT = 37;
183
184    public static final byte // block code
185        RESERVED_BLOCK = 0,
186        BASIC_LATIN = 1,
187        LATIN_1_SUPPLEMENT = 2,
188        LATIN_EXTENDED_A = 3,
189        LATIN_EXTENDED_B = 4,
190        IPA_EXTENSIONS = 5,
191        SPACING_MODIFIER_LETTERS = 6,
192        COMBINING_DIACRITICAL_MARKS = 7,
193        GREEK = 8,
194        CYRILLIC = 9,
195        ARMENIAN = 10,
196        HEBREW = 11,
197        ARABIC = 12,
198        SYRIAC = 13,
199        THAANA = 14,
200        DEVANAGARI = 15,
201        BENGALI = 16,
202        GURMUKHI = 17,
203        GUJARATI = 18,
204        ORIYA = 19,
205        TAMIL = 20,
206        TELUGU = 21,
207        KANNADA = 22,
208        MALAYALAM = 23,
209        SINHALA = 24,
210        THAI = 25,
211        LAO = 26,
212        TIBETAN = 27,
213        MYANMAR = 28,
214        GEORGIAN = 29,
215        HANGUL_JAMO = 30,
216        ETHIOPIC = 31,
217        CHEROKEE = 32,
218        UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
219        OGHAM = 34,
220        RUNIC = 35,
221        KHMER = 36,
222        MONGOLIAN = 37,
223        LATIN_EXTENDED_ADDITIONAL = 38,
224        GREEK_EXTENDED = 39,
225        GENERAL_PUNCTUATION = 40,
226        SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
227        CURRENCY_SYMBOLS = 42,
228        COMBINING_MARKS_FOR_SYMBOLS = 43,
229        LETTERLIKE_SYMBOLS = 44,
230        NUMBER_FORMS = 45,
231        ARROWS = 46,
232        MATHEMATICAL_OPERATORS = 47,
233        MISCELLANEOUS_TECHNICAL = 48,
234        CONTROL_PICTURES = 49,
235        OPTICAL_CHARACTER_RECOGNITION = 50,
236        ENCLOSED_ALPHANUMERICS = 51,
237        BOX_DRAWING = 52,
238        BLOCK_ELEMENTS = 53,
239        GEOMETRIC_SHAPES = 54,
240        MISCELLANEOUS_SYMBOLS = 55,
241        DINGBATS = 56,
242        BRAILLE_PATTERNS = 57,
243        CJK_RADICALS_SUPPLEMENT = 58,
244        KANGXI_RADICALS = 59,
245        IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
246        CJK_SYMBOLS_AND_PUNCTUATION = 61,
247        HIRAGANA = 62,
248        KATAKANA = 63,
249        BOPOMOFO = 64,
250        HANGUL_COMPATIBILITY_JAMO = 65,
251        KANBUN = 66,
252        BOPOMOFO_EXTENDED = 67,
253        ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
254        CJK_COMPATIBILITY = 69,
255        CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
256        CJK_UNIFIED_IDEOGRAPHS = 71,
257        YI_SYLLABLES = 72,
258        YI_RADICALS = 73,
259        HANGUL_SYLLABLES = 74,
260        HIGH_SURROGATES = 75,
261        HIGH_PRIVATE_USE_SURROGATES = 76,
262        LOW_SURROGATES = 77,
263        PRIVATE_USE = 78,
264        CJK_COMPATIBILITY_IDEOGRAPHS = 79,
265        ALPHABETIC_PRESENTATION_FORMS = 80,
266        ARABIC_PRESENTATION_FORMS_A = 81,
267        COMBINING_HALF_MARKS = 82,
268        CJK_COMPATIBILITY_FORMS = 83,
269        SMALL_FORM_VARIANTS = 84,
270        ARABIC_PRESENTATION_FORMS_B = 85,
271        SPECIALS = 86,
272        HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
273
274    static final byte[] blockToScript = {
275        COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
276        LATIN_SCRIPT, // 1, BASIC_LATIN
277        LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
278        LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
279        LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
280        LATIN_SCRIPT, // 5, IPA_EXTENSIONS
281        COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
282        COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
283        GREEK_SCRIPT, // 8, GREEK
284        CYRILLIC_SCRIPT, // 9, CYRILLIC
285        ARMENIAN_SCRIPT, // 10, ARMENIAN
286        HEBREW_SCRIPT, // 11, HEBREW
287        ARABIC_SCRIPT, // 12, ARABIC
288        SYRIAC_SCRIPT, // 13, SYRIAC
289        THAANA_SCRIPT, // 14, THAANA
290        DEVANAGARI_SCRIPT, // 15, DEVANAGARI
291        BENGALI_SCRIPT, // 16, BENGALI
292        GURMUKHI_SCRIPT, // 17, GURMUKHI
293        GUJARATI_SCRIPT, // 18, GUJARATI
294        ORIYA_SCRIPT, // 19, ORIYA
295        TAMIL_SCRIPT, // 20, TAMIL
296        TELUGU_SCRIPT, // 21, TELUGU
297        KANNADA_SCRIPT, // 22, KANNADA
298        MALAYALAM_SCRIPT, // 23, MALAYALAM
299        SINHALA_SCRIPT, // 24, SINHALA
300        THAI_SCRIPT, // 25, THAI
301        LAO_SCRIPT, // 26, LAO
302        TIBETAN_SCRIPT, // 27, TIBETAN
303        MYANMAR_SCRIPT, // 28, MYANMAR
304        GEORGIAN_SCRIPT, // 29, GEORGIAN
305        JAMO_SCRIPT, // 30, HANGUL_JAMO
306        ETHIOPIC_SCRIPT, // 31, ETHIOPIC
307        CHEROKEE_SCRIPT, // 32, CHEROKEE
308        ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
309        OGHAM_SCRIPT, // 34, OGHAM
310        RUNIC_SCRIPT, // 35, RUNIC
311        KHMER_SCRIPT, // 36, KHMER
312        MONGOLIAN_SCRIPT, // 37, MONGOLIAN
313        LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
314        GREEK_SCRIPT, // 39, GREEK_EXTENDED
315        COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
316        COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
317        COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
318        COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
319        COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
320        COMMON_SCRIPT, // 45, NUMBER_FORMS
321        COMMON_SCRIPT, // 46, ARROWS
322        COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
323        COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
324        COMMON_SCRIPT, // 49, CONTROL_PICTURES
325        COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
326        COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
327        COMMON_SCRIPT, // 52, BOX_DRAWING
328        COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
329        COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
330        COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
331        COMMON_SCRIPT, // 56, DINGBATS
332        COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
333        HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
334        HAN_SCRIPT, // 59, KANGXI_RADICALS
335        HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
336        COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
337        HIRAGANA_SCRIPT, // 62, HIRAGANA
338        KATAKANA_SCRIPT, // 63, KATAKANA
339        BOPOMOFO_SCRIPT, // 64, BOPOMOFO
340        JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
341        HAN_SCRIPT, // 66, KANBUN
342        BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
343        COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
344        COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
345        HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
346        HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
347        YI_SCRIPT, // 72, YI_SYLLABLES
348        YI_SCRIPT, // 73, YI_RADICALS
349        HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
350        COMMON_SCRIPT, // 75, HIGH_SURROGATES
351        COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
352        COMMON_SCRIPT, // 77, LOW_SURROGATES
353        COMMON_SCRIPT, // 78, PRIVATE_USE
354        HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
355        COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
356        ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
357        COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
358        COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
359        COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
360        ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
361        COMMON_SCRIPT, // 86, SPECIALS
362        COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
363        COMMON_SCRIPT, // 88, SPECIALS
364    };
365
366    // could be further reduced to a byte array, but I didn't bother.
367    static final int[][] split = {
368        {0x0250, 4, 5}, // -1
369        {0x02B0, 5, 6}, // -2
370        {0x0370, 7, 8}, // -3
371        {0x0530, 0, 10}, // -4
372        {0x0590, 10, 11}, // -5
373        {0x0750, 13, 0}, // -6
374        {0x07C0, 14, 0}, // -7
375        {0x10A0, 28, 29}, // -8
376        {0x13A0, 0, 32}, // -9
377        {0x16A0, 34, 35}, // -10
378        {0x18B0, 37, 0}, // -11
379        {0x2070, 40, 41}, // -12
380        {0x20A0, 41, -31}, // -13
381        {0x2150, 44, 45}, // -14
382        {0x2190, 45, 46}, // -15
383        {0x2440, 49, -32}, // -16
384        {0x25A0, 53, 54}, // -17
385        {0x27C0, 56, 0}, // -18
386        {0x2FE0, 59, -33}, // -19
387        {0x3040, 61, 62}, // -20
388        {0x30A0, 62, 63}, // -21
389        {0x3130, 64, 65}, // -22
390        {0x3190, 65, -34}, // -23
391        {0x4DB6, 70, 0}, // -24
392        {0xA490, 72, -35}, // -25
393        {0xD7A4, 74, 0}, // -26
394        {0xFB50, 80, 81}, // -27
395        {0xFE20, 0, -36}, // -28
396        {0xFEFF, 85, 86}, // -29
397        {0xFFF0, 87, -37}, // -30
398        {0x20D0, 42, 43}, // -31
399        {0x2460, 50, 51}, // -32
400        {0x2FF0, 0, 60}, // -33
401        {0x31A0, 66, -38}, // -34
402        {0xA4D0, 73, 0}, //-35
403        {0xFE30, 82, -39}, //-36
404        {0xFFFE, 88, 0}, //-37
405        {0x31C0, 67, 0}, // -38
406        {0xFE50, 83, -40}, //-39
407        {0xFE70, 84, 85} // -40
408    };
409
410    static final byte[] charToBlock = {
411      1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
412      0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
413      28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
414      37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
415      -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
416      57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
417      -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
418      70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
419      70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
420      70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
421      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
422      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
423      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
424      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
425      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
426      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
427      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
428      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
429      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
430      71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
431      72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
432      0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
433      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
434      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
435      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
436      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
437      74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
438      75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
439      78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
440      78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
441      78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
442      78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
443    };
444    */
445}
446