1/** 2 ******************************************************************************* 3 * Copyright (C) 2000-2010, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7package com.ibm.icu.dev.test.translit; 8 9import com.ibm.icu.text.UTF16; 10import com.ibm.icu.text.UnicodeSet; 11 12public final class TestUtility { 13 14 public static String hex(char ch) { 15 String foo = Integer.toString(ch,16).toUpperCase(); 16 return "0000".substring(0,4-foo.length()) + foo; 17 } 18 19 public static String hex(int ch) { 20 String foo = Integer.toString(ch,16).toUpperCase(); 21 return "00000000".substring(0,4-foo.length()) + foo; 22 } 23 24 public static String hex(String s) { 25 return hex(s,","); 26 } 27 28 public static String hex(String s, String sep) { 29 if (s.length() == 0) return ""; 30 String result = hex(s.charAt(0)); 31 for (int i = 1; i < s.length(); ++i) { 32 result += sep; 33 result += hex(s.charAt(i)); 34 } 35 return result; 36 } 37 38 public static String replace(String source, String toBeReplaced, String replacement) { 39 StringBuffer results = new StringBuffer(); 40 int len = toBeReplaced.length(); 41 for (int i = 0; i < source.length(); ++i) { 42 if (source.regionMatches(false, i, toBeReplaced, 0, len)) { 43 results.append(replacement); 44 i += len - 1; // minus one, since we will increment 45 } else { 46 results.append(source.charAt(i)); 47 } 48 } 49 return results.toString(); 50 } 51 52 public static String replaceAll(String source, UnicodeSet set, String replacement) { 53 StringBuffer results = new StringBuffer(); 54 int cp; 55 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 56 cp = UTF16.charAt(source,i); 57 if (set.contains(cp)) { 58 results.append(replacement); 59 } else { 60 UTF16.append(results, cp); 61 } 62 } 63 return results.toString(); 64 } 65 66 // COMMENTED OUT ALL THE OLD SCRIPT STUFF 67 /* 68 public static byte getScript(char c) { 69 return getScript(getBlock(c)); 70 } 71 72 public static byte getScript(byte block) { 73 return blockToScript[block]; 74 } 75 76 public static byte getBlock(char c) { 77 int index = c >> 7; 78 byte block = charToBlock[index]; 79 while (block < 0) { // take care of exceptions, blocks split across 128 boundaries 80 int[] tuple = split[-block-1]; 81 if (c < tuple[0]) block = (byte)tuple[1]; 82 else block = (byte)tuple[2]; 83 } 84 return block; 85 } 86 87 // returns next letter of script, or 0xFFFF if done 88 89 public static char getNextLetter(char c, byte script) { 90 while (c < 0xFFFF) { 91 ++c; 92 if (getScript(c) == script && Character.isLetter(c)) { 93 return c; 94 } 95 } 96 return c; 97 } 98 99 // Supplements to Character methods; these methods go through 100 // UCharacter if possible. If not, they fall back to Character. 101 102 public static boolean isUnassigned(char c) { 103 try { 104 return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED; 105 } catch (NullPointerException e) { 106 System.out.print(""); 107 } 108 return Character.getType(c) == Character.UNASSIGNED; 109 } 110 111 public static boolean isLetter(char c) { 112 try { 113 return UCharacter.isLetter(c); 114 } catch (NullPointerException e) { 115 System.out.print(""); 116 } 117 return Character.isLetter(c); 118 } 119 120 public static void main(String[] args) { 121 System.out.println("Blocks: "); 122 byte lastblock = -128; 123 for (char cc = 0; cc < 0xFFFF; ++cc) { 124 byte block = TestUtility.getBlock(cc); 125 if (block != lastblock) { 126 System.out.println(TestUtility.hex(cc) + "\t" + block); 127 lastblock = block; 128 } 129 } 130 System.out.println(); 131 System.out.println("Scripts: "); 132 byte lastScript = -128; 133 for (char cc = 0; cc < 0xFFFF; ++cc) { 134 byte script = TestUtility.getScript(cc); 135 if (script != lastScript) { 136 System.out.println(TestUtility.hex(cc) + "\t" + script); 137 lastScript = script; 138 } 139 } 140 } 141 142 143 144 public static final byte // SCRIPT CODE 145 COMMON_SCRIPT = 0, 146 LATIN_SCRIPT = 1, 147 GREEK_SCRIPT = 2, 148 CYRILLIC_SCRIPT = 3, 149 ARMENIAN_SCRIPT = 4, 150 HEBREW_SCRIPT = 5, 151 ARABIC_SCRIPT = 6, 152 SYRIAC_SCRIPT = 7, 153 THAANA_SCRIPT = 8, 154 DEVANAGARI_SCRIPT = 9, 155 BENGALI_SCRIPT = 10, 156 GURMUKHI_SCRIPT = 11, 157 GUJARATI_SCRIPT = 12, 158 ORIYA_SCRIPT = 13, 159 TAMIL_SCRIPT = 14, 160 TELUGU_SCRIPT = 15, 161 KANNADA_SCRIPT = 16, 162 MALAYALAM_SCRIPT = 17, 163 SINHALA_SCRIPT = 18, 164 THAI_SCRIPT = 19, 165 LAO_SCRIPT = 20, 166 TIBETAN_SCRIPT = 21, 167 MYANMAR_SCRIPT = 22, 168 GEORGIAN_SCRIPT = 23, 169 JAMO_SCRIPT = 24, 170 HANGUL_SCRIPT = 25, 171 ETHIOPIC_SCRIPT = 26, 172 CHEROKEE_SCRIPT = 27, 173 ABORIGINAL_SCRIPT = 28, 174 OGHAM_SCRIPT = 29, 175 RUNIC_SCRIPT = 30, 176 KHMER_SCRIPT = 31, 177 MONGOLIAN_SCRIPT = 32, 178 HIRAGANA_SCRIPT = 33, 179 KATAKANA_SCRIPT = 34, 180 BOPOMOFO_SCRIPT = 35, 181 HAN_SCRIPT = 36, 182 YI_SCRIPT = 37; 183 184 public static final byte // block code 185 RESERVED_BLOCK = 0, 186 BASIC_LATIN = 1, 187 LATIN_1_SUPPLEMENT = 2, 188 LATIN_EXTENDED_A = 3, 189 LATIN_EXTENDED_B = 4, 190 IPA_EXTENSIONS = 5, 191 SPACING_MODIFIER_LETTERS = 6, 192 COMBINING_DIACRITICAL_MARKS = 7, 193 GREEK = 8, 194 CYRILLIC = 9, 195 ARMENIAN = 10, 196 HEBREW = 11, 197 ARABIC = 12, 198 SYRIAC = 13, 199 THAANA = 14, 200 DEVANAGARI = 15, 201 BENGALI = 16, 202 GURMUKHI = 17, 203 GUJARATI = 18, 204 ORIYA = 19, 205 TAMIL = 20, 206 TELUGU = 21, 207 KANNADA = 22, 208 MALAYALAM = 23, 209 SINHALA = 24, 210 THAI = 25, 211 LAO = 26, 212 TIBETAN = 27, 213 MYANMAR = 28, 214 GEORGIAN = 29, 215 HANGUL_JAMO = 30, 216 ETHIOPIC = 31, 217 CHEROKEE = 32, 218 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, 219 OGHAM = 34, 220 RUNIC = 35, 221 KHMER = 36, 222 MONGOLIAN = 37, 223 LATIN_EXTENDED_ADDITIONAL = 38, 224 GREEK_EXTENDED = 39, 225 GENERAL_PUNCTUATION = 40, 226 SUPERSCRIPTS_AND_SUBSCRIPTS = 41, 227 CURRENCY_SYMBOLS = 42, 228 COMBINING_MARKS_FOR_SYMBOLS = 43, 229 LETTERLIKE_SYMBOLS = 44, 230 NUMBER_FORMS = 45, 231 ARROWS = 46, 232 MATHEMATICAL_OPERATORS = 47, 233 MISCELLANEOUS_TECHNICAL = 48, 234 CONTROL_PICTURES = 49, 235 OPTICAL_CHARACTER_RECOGNITION = 50, 236 ENCLOSED_ALPHANUMERICS = 51, 237 BOX_DRAWING = 52, 238 BLOCK_ELEMENTS = 53, 239 GEOMETRIC_SHAPES = 54, 240 MISCELLANEOUS_SYMBOLS = 55, 241 DINGBATS = 56, 242 BRAILLE_PATTERNS = 57, 243 CJK_RADICALS_SUPPLEMENT = 58, 244 KANGXI_RADICALS = 59, 245 IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, 246 CJK_SYMBOLS_AND_PUNCTUATION = 61, 247 HIRAGANA = 62, 248 KATAKANA = 63, 249 BOPOMOFO = 64, 250 HANGUL_COMPATIBILITY_JAMO = 65, 251 KANBUN = 66, 252 BOPOMOFO_EXTENDED = 67, 253 ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, 254 CJK_COMPATIBILITY = 69, 255 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, 256 CJK_UNIFIED_IDEOGRAPHS = 71, 257 YI_SYLLABLES = 72, 258 YI_RADICALS = 73, 259 HANGUL_SYLLABLES = 74, 260 HIGH_SURROGATES = 75, 261 HIGH_PRIVATE_USE_SURROGATES = 76, 262 LOW_SURROGATES = 77, 263 PRIVATE_USE = 78, 264 CJK_COMPATIBILITY_IDEOGRAPHS = 79, 265 ALPHABETIC_PRESENTATION_FORMS = 80, 266 ARABIC_PRESENTATION_FORMS_A = 81, 267 COMBINING_HALF_MARKS = 82, 268 CJK_COMPATIBILITY_FORMS = 83, 269 SMALL_FORM_VARIANTS = 84, 270 ARABIC_PRESENTATION_FORMS_B = 85, 271 SPECIALS = 86, 272 HALFWIDTH_AND_FULLWIDTH_FORMS = 87; 273 274 static final byte[] blockToScript = { 275 COMMON_SCRIPT, // 0, <RESERVED_BLOCK> 276 LATIN_SCRIPT, // 1, BASIC_LATIN 277 LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT 278 LATIN_SCRIPT, // 3, LATIN_EXTENDED_A 279 LATIN_SCRIPT, // 4, LATIN_EXTENDED_B 280 LATIN_SCRIPT, // 5, IPA_EXTENSIONS 281 COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS 282 COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS 283 GREEK_SCRIPT, // 8, GREEK 284 CYRILLIC_SCRIPT, // 9, CYRILLIC 285 ARMENIAN_SCRIPT, // 10, ARMENIAN 286 HEBREW_SCRIPT, // 11, HEBREW 287 ARABIC_SCRIPT, // 12, ARABIC 288 SYRIAC_SCRIPT, // 13, SYRIAC 289 THAANA_SCRIPT, // 14, THAANA 290 DEVANAGARI_SCRIPT, // 15, DEVANAGARI 291 BENGALI_SCRIPT, // 16, BENGALI 292 GURMUKHI_SCRIPT, // 17, GURMUKHI 293 GUJARATI_SCRIPT, // 18, GUJARATI 294 ORIYA_SCRIPT, // 19, ORIYA 295 TAMIL_SCRIPT, // 20, TAMIL 296 TELUGU_SCRIPT, // 21, TELUGU 297 KANNADA_SCRIPT, // 22, KANNADA 298 MALAYALAM_SCRIPT, // 23, MALAYALAM 299 SINHALA_SCRIPT, // 24, SINHALA 300 THAI_SCRIPT, // 25, THAI 301 LAO_SCRIPT, // 26, LAO 302 TIBETAN_SCRIPT, // 27, TIBETAN 303 MYANMAR_SCRIPT, // 28, MYANMAR 304 GEORGIAN_SCRIPT, // 29, GEORGIAN 305 JAMO_SCRIPT, // 30, HANGUL_JAMO 306 ETHIOPIC_SCRIPT, // 31, ETHIOPIC 307 CHEROKEE_SCRIPT, // 32, CHEROKEE 308 ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 309 OGHAM_SCRIPT, // 34, OGHAM 310 RUNIC_SCRIPT, // 35, RUNIC 311 KHMER_SCRIPT, // 36, KHMER 312 MONGOLIAN_SCRIPT, // 37, MONGOLIAN 313 LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL 314 GREEK_SCRIPT, // 39, GREEK_EXTENDED 315 COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION 316 COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS 317 COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS 318 COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS 319 COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS 320 COMMON_SCRIPT, // 45, NUMBER_FORMS 321 COMMON_SCRIPT, // 46, ARROWS 322 COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS 323 COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL 324 COMMON_SCRIPT, // 49, CONTROL_PICTURES 325 COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION 326 COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS 327 COMMON_SCRIPT, // 52, BOX_DRAWING 328 COMMON_SCRIPT, // 53, BLOCK_ELEMENTS 329 COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES 330 COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS 331 COMMON_SCRIPT, // 56, DINGBATS 332 COMMON_SCRIPT, // 57, BRAILLE_PATTERNS 333 HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT 334 HAN_SCRIPT, // 59, KANGXI_RADICALS 335 HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS 336 COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION 337 HIRAGANA_SCRIPT, // 62, HIRAGANA 338 KATAKANA_SCRIPT, // 63, KATAKANA 339 BOPOMOFO_SCRIPT, // 64, BOPOMOFO 340 JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO 341 HAN_SCRIPT, // 66, KANBUN 342 BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED 343 COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS 344 COMMON_SCRIPT, // 69, CJK_COMPATIBILITY 345 HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 346 HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS 347 YI_SCRIPT, // 72, YI_SYLLABLES 348 YI_SCRIPT, // 73, YI_RADICALS 349 HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES 350 COMMON_SCRIPT, // 75, HIGH_SURROGATES 351 COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES 352 COMMON_SCRIPT, // 77, LOW_SURROGATES 353 COMMON_SCRIPT, // 78, PRIVATE_USE 354 HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS 355 COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS 356 ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A 357 COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS 358 COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS 359 COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS 360 ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B 361 COMMON_SCRIPT, // 86, SPECIALS 362 COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS 363 COMMON_SCRIPT, // 88, SPECIALS 364 }; 365 366 // could be further reduced to a byte array, but I didn't bother. 367 static final int[][] split = { 368 {0x0250, 4, 5}, // -1 369 {0x02B0, 5, 6}, // -2 370 {0x0370, 7, 8}, // -3 371 {0x0530, 0, 10}, // -4 372 {0x0590, 10, 11}, // -5 373 {0x0750, 13, 0}, // -6 374 {0x07C0, 14, 0}, // -7 375 {0x10A0, 28, 29}, // -8 376 {0x13A0, 0, 32}, // -9 377 {0x16A0, 34, 35}, // -10 378 {0x18B0, 37, 0}, // -11 379 {0x2070, 40, 41}, // -12 380 {0x20A0, 41, -31}, // -13 381 {0x2150, 44, 45}, // -14 382 {0x2190, 45, 46}, // -15 383 {0x2440, 49, -32}, // -16 384 {0x25A0, 53, 54}, // -17 385 {0x27C0, 56, 0}, // -18 386 {0x2FE0, 59, -33}, // -19 387 {0x3040, 61, 62}, // -20 388 {0x30A0, 62, 63}, // -21 389 {0x3130, 64, 65}, // -22 390 {0x3190, 65, -34}, // -23 391 {0x4DB6, 70, 0}, // -24 392 {0xA490, 72, -35}, // -25 393 {0xD7A4, 74, 0}, // -26 394 {0xFB50, 80, 81}, // -27 395 {0xFE20, 0, -36}, // -28 396 {0xFEFF, 85, 86}, // -29 397 {0xFFF0, 87, -37}, // -30 398 {0x20D0, 42, 43}, // -31 399 {0x2460, 50, 51}, // -32 400 {0x2FF0, 0, 60}, // -33 401 {0x31A0, 66, -38}, // -34 402 {0xA4D0, 73, 0}, //-35 403 {0xFE30, 82, -39}, //-36 404 {0xFFFE, 88, 0}, //-37 405 {0x31C0, 67, 0}, // -38 406 {0xFE50, 83, -40}, //-39 407 {0xFE70, 84, 85} // -40 408 }; 409 410 static final byte[] charToBlock = { 411 1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7, 412 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27, 413 28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36, 414 37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39, 415 -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18, 416 57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19, 417 -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 418 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 419 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 420 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71, 421 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 422 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 423 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 424 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 425 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 426 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 427 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 428 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 429 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 430 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 431 72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0, 432 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74, 433 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 434 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 435 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 436 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 437 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26, 438 75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77, 439 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 440 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 441 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 442 78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30 443 }; 444 */ 445} 446