1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/** 4 ******************************************************************************* 5 * Copyright (C) 2000-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9package com.ibm.icu.dev.test.translit; 10 11import com.ibm.icu.text.UTF16; 12import com.ibm.icu.text.UnicodeSet; 13 14public final class TestUtility { 15 16 public static String hex(char ch) { 17 String foo = Integer.toString(ch,16).toUpperCase(); 18 return "0000".substring(0,4-foo.length()) + foo; 19 } 20 21 public static String hex(int ch) { 22 String foo = Integer.toString(ch,16).toUpperCase(); 23 return "00000000".substring(0,4-foo.length()) + foo; 24 } 25 26 public static String hex(String s) { 27 return hex(s,","); 28 } 29 30 public static String hex(String s, String sep) { 31 if (s.length() == 0) return ""; 32 String result = hex(s.charAt(0)); 33 for (int i = 1; i < s.length(); ++i) { 34 result += sep; 35 result += hex(s.charAt(i)); 36 } 37 return result; 38 } 39 40 public static String replace(String source, String toBeReplaced, String replacement) { 41 StringBuffer results = new StringBuffer(); 42 int len = toBeReplaced.length(); 43 for (int i = 0; i < source.length(); ++i) { 44 if (source.regionMatches(false, i, toBeReplaced, 0, len)) { 45 results.append(replacement); 46 i += len - 1; // minus one, since we will increment 47 } else { 48 results.append(source.charAt(i)); 49 } 50 } 51 return results.toString(); 52 } 53 54 public static String replaceAll(String source, UnicodeSet set, String replacement) { 55 StringBuffer results = new StringBuffer(); 56 int cp; 57 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 58 cp = UTF16.charAt(source,i); 59 if (set.contains(cp)) { 60 results.append(replacement); 61 } else { 62 UTF16.append(results, cp); 63 } 64 } 65 return results.toString(); 66 } 67 68 // COMMENTED OUT ALL THE OLD SCRIPT STUFF 69 /* 70 public static byte getScript(char c) { 71 return getScript(getBlock(c)); 72 } 73 74 public static byte getScript(byte block) { 75 return blockToScript[block]; 76 } 77 78 public static byte getBlock(char c) { 79 int index = c >> 7; 80 byte block = charToBlock[index]; 81 while (block < 0) { // take care of exceptions, blocks split across 128 boundaries 82 int[] tuple = split[-block-1]; 83 if (c < tuple[0]) block = (byte)tuple[1]; 84 else block = (byte)tuple[2]; 85 } 86 return block; 87 } 88 89 // returns next letter of script, or 0xFFFF if done 90 91 public static char getNextLetter(char c, byte script) { 92 while (c < 0xFFFF) { 93 ++c; 94 if (getScript(c) == script && Character.isLetter(c)) { 95 return c; 96 } 97 } 98 return c; 99 } 100 101 // Supplements to Character methods; these methods go through 102 // UCharacter if possible. If not, they fall back to Character. 103 104 public static boolean isUnassigned(char c) { 105 try { 106 return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED; 107 } catch (NullPointerException e) { 108 System.out.print(""); 109 } 110 return Character.getType(c) == Character.UNASSIGNED; 111 } 112 113 public static boolean isLetter(char c) { 114 try { 115 return UCharacter.isLetter(c); 116 } catch (NullPointerException e) { 117 System.out.print(""); 118 } 119 return Character.isLetter(c); 120 } 121 122 public static void main(String[] args) { 123 System.out.println("Blocks: "); 124 byte lastblock = -128; 125 for (char cc = 0; cc < 0xFFFF; ++cc) { 126 byte block = TestUtility.getBlock(cc); 127 if (block != lastblock) { 128 System.out.println(TestUtility.hex(cc) + "\t" + block); 129 lastblock = block; 130 } 131 } 132 System.out.println(); 133 System.out.println("Scripts: "); 134 byte lastScript = -128; 135 for (char cc = 0; cc < 0xFFFF; ++cc) { 136 byte script = TestUtility.getScript(cc); 137 if (script != lastScript) { 138 System.out.println(TestUtility.hex(cc) + "\t" + script); 139 lastScript = script; 140 } 141 } 142 } 143 144 145 146 public static final byte // SCRIPT CODE 147 COMMON_SCRIPT = 0, 148 LATIN_SCRIPT = 1, 149 GREEK_SCRIPT = 2, 150 CYRILLIC_SCRIPT = 3, 151 ARMENIAN_SCRIPT = 4, 152 HEBREW_SCRIPT = 5, 153 ARABIC_SCRIPT = 6, 154 SYRIAC_SCRIPT = 7, 155 THAANA_SCRIPT = 8, 156 DEVANAGARI_SCRIPT = 9, 157 BENGALI_SCRIPT = 10, 158 GURMUKHI_SCRIPT = 11, 159 GUJARATI_SCRIPT = 12, 160 ORIYA_SCRIPT = 13, 161 TAMIL_SCRIPT = 14, 162 TELUGU_SCRIPT = 15, 163 KANNADA_SCRIPT = 16, 164 MALAYALAM_SCRIPT = 17, 165 SINHALA_SCRIPT = 18, 166 THAI_SCRIPT = 19, 167 LAO_SCRIPT = 20, 168 TIBETAN_SCRIPT = 21, 169 MYANMAR_SCRIPT = 22, 170 GEORGIAN_SCRIPT = 23, 171 JAMO_SCRIPT = 24, 172 HANGUL_SCRIPT = 25, 173 ETHIOPIC_SCRIPT = 26, 174 CHEROKEE_SCRIPT = 27, 175 ABORIGINAL_SCRIPT = 28, 176 OGHAM_SCRIPT = 29, 177 RUNIC_SCRIPT = 30, 178 KHMER_SCRIPT = 31, 179 MONGOLIAN_SCRIPT = 32, 180 HIRAGANA_SCRIPT = 33, 181 KATAKANA_SCRIPT = 34, 182 BOPOMOFO_SCRIPT = 35, 183 HAN_SCRIPT = 36, 184 YI_SCRIPT = 37; 185 186 public static final byte // block code 187 RESERVED_BLOCK = 0, 188 BASIC_LATIN = 1, 189 LATIN_1_SUPPLEMENT = 2, 190 LATIN_EXTENDED_A = 3, 191 LATIN_EXTENDED_B = 4, 192 IPA_EXTENSIONS = 5, 193 SPACING_MODIFIER_LETTERS = 6, 194 COMBINING_DIACRITICAL_MARKS = 7, 195 GREEK = 8, 196 CYRILLIC = 9, 197 ARMENIAN = 10, 198 HEBREW = 11, 199 ARABIC = 12, 200 SYRIAC = 13, 201 THAANA = 14, 202 DEVANAGARI = 15, 203 BENGALI = 16, 204 GURMUKHI = 17, 205 GUJARATI = 18, 206 ORIYA = 19, 207 TAMIL = 20, 208 TELUGU = 21, 209 KANNADA = 22, 210 MALAYALAM = 23, 211 SINHALA = 24, 212 THAI = 25, 213 LAO = 26, 214 TIBETAN = 27, 215 MYANMAR = 28, 216 GEORGIAN = 29, 217 HANGUL_JAMO = 30, 218 ETHIOPIC = 31, 219 CHEROKEE = 32, 220 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, 221 OGHAM = 34, 222 RUNIC = 35, 223 KHMER = 36, 224 MONGOLIAN = 37, 225 LATIN_EXTENDED_ADDITIONAL = 38, 226 GREEK_EXTENDED = 39, 227 GENERAL_PUNCTUATION = 40, 228 SUPERSCRIPTS_AND_SUBSCRIPTS = 41, 229 CURRENCY_SYMBOLS = 42, 230 COMBINING_MARKS_FOR_SYMBOLS = 43, 231 LETTERLIKE_SYMBOLS = 44, 232 NUMBER_FORMS = 45, 233 ARROWS = 46, 234 MATHEMATICAL_OPERATORS = 47, 235 MISCELLANEOUS_TECHNICAL = 48, 236 CONTROL_PICTURES = 49, 237 OPTICAL_CHARACTER_RECOGNITION = 50, 238 ENCLOSED_ALPHANUMERICS = 51, 239 BOX_DRAWING = 52, 240 BLOCK_ELEMENTS = 53, 241 GEOMETRIC_SHAPES = 54, 242 MISCELLANEOUS_SYMBOLS = 55, 243 DINGBATS = 56, 244 BRAILLE_PATTERNS = 57, 245 CJK_RADICALS_SUPPLEMENT = 58, 246 KANGXI_RADICALS = 59, 247 IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, 248 CJK_SYMBOLS_AND_PUNCTUATION = 61, 249 HIRAGANA = 62, 250 KATAKANA = 63, 251 BOPOMOFO = 64, 252 HANGUL_COMPATIBILITY_JAMO = 65, 253 KANBUN = 66, 254 BOPOMOFO_EXTENDED = 67, 255 ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, 256 CJK_COMPATIBILITY = 69, 257 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, 258 CJK_UNIFIED_IDEOGRAPHS = 71, 259 YI_SYLLABLES = 72, 260 YI_RADICALS = 73, 261 HANGUL_SYLLABLES = 74, 262 HIGH_SURROGATES = 75, 263 HIGH_PRIVATE_USE_SURROGATES = 76, 264 LOW_SURROGATES = 77, 265 PRIVATE_USE = 78, 266 CJK_COMPATIBILITY_IDEOGRAPHS = 79, 267 ALPHABETIC_PRESENTATION_FORMS = 80, 268 ARABIC_PRESENTATION_FORMS_A = 81, 269 COMBINING_HALF_MARKS = 82, 270 CJK_COMPATIBILITY_FORMS = 83, 271 SMALL_FORM_VARIANTS = 84, 272 ARABIC_PRESENTATION_FORMS_B = 85, 273 SPECIALS = 86, 274 HALFWIDTH_AND_FULLWIDTH_FORMS = 87; 275 276 static final byte[] blockToScript = { 277 COMMON_SCRIPT, // 0, <RESERVED_BLOCK> 278 LATIN_SCRIPT, // 1, BASIC_LATIN 279 LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT 280 LATIN_SCRIPT, // 3, LATIN_EXTENDED_A 281 LATIN_SCRIPT, // 4, LATIN_EXTENDED_B 282 LATIN_SCRIPT, // 5, IPA_EXTENSIONS 283 COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS 284 COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS 285 GREEK_SCRIPT, // 8, GREEK 286 CYRILLIC_SCRIPT, // 9, CYRILLIC 287 ARMENIAN_SCRIPT, // 10, ARMENIAN 288 HEBREW_SCRIPT, // 11, HEBREW 289 ARABIC_SCRIPT, // 12, ARABIC 290 SYRIAC_SCRIPT, // 13, SYRIAC 291 THAANA_SCRIPT, // 14, THAANA 292 DEVANAGARI_SCRIPT, // 15, DEVANAGARI 293 BENGALI_SCRIPT, // 16, BENGALI 294 GURMUKHI_SCRIPT, // 17, GURMUKHI 295 GUJARATI_SCRIPT, // 18, GUJARATI 296 ORIYA_SCRIPT, // 19, ORIYA 297 TAMIL_SCRIPT, // 20, TAMIL 298 TELUGU_SCRIPT, // 21, TELUGU 299 KANNADA_SCRIPT, // 22, KANNADA 300 MALAYALAM_SCRIPT, // 23, MALAYALAM 301 SINHALA_SCRIPT, // 24, SINHALA 302 THAI_SCRIPT, // 25, THAI 303 LAO_SCRIPT, // 26, LAO 304 TIBETAN_SCRIPT, // 27, TIBETAN 305 MYANMAR_SCRIPT, // 28, MYANMAR 306 GEORGIAN_SCRIPT, // 29, GEORGIAN 307 JAMO_SCRIPT, // 30, HANGUL_JAMO 308 ETHIOPIC_SCRIPT, // 31, ETHIOPIC 309 CHEROKEE_SCRIPT, // 32, CHEROKEE 310 ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 311 OGHAM_SCRIPT, // 34, OGHAM 312 RUNIC_SCRIPT, // 35, RUNIC 313 KHMER_SCRIPT, // 36, KHMER 314 MONGOLIAN_SCRIPT, // 37, MONGOLIAN 315 LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL 316 GREEK_SCRIPT, // 39, GREEK_EXTENDED 317 COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION 318 COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS 319 COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS 320 COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS 321 COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS 322 COMMON_SCRIPT, // 45, NUMBER_FORMS 323 COMMON_SCRIPT, // 46, ARROWS 324 COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS 325 COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL 326 COMMON_SCRIPT, // 49, CONTROL_PICTURES 327 COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION 328 COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS 329 COMMON_SCRIPT, // 52, BOX_DRAWING 330 COMMON_SCRIPT, // 53, BLOCK_ELEMENTS 331 COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES 332 COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS 333 COMMON_SCRIPT, // 56, DINGBATS 334 COMMON_SCRIPT, // 57, BRAILLE_PATTERNS 335 HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT 336 HAN_SCRIPT, // 59, KANGXI_RADICALS 337 HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS 338 COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION 339 HIRAGANA_SCRIPT, // 62, HIRAGANA 340 KATAKANA_SCRIPT, // 63, KATAKANA 341 BOPOMOFO_SCRIPT, // 64, BOPOMOFO 342 JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO 343 HAN_SCRIPT, // 66, KANBUN 344 BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED 345 COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS 346 COMMON_SCRIPT, // 69, CJK_COMPATIBILITY 347 HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 348 HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS 349 YI_SCRIPT, // 72, YI_SYLLABLES 350 YI_SCRIPT, // 73, YI_RADICALS 351 HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES 352 COMMON_SCRIPT, // 75, HIGH_SURROGATES 353 COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES 354 COMMON_SCRIPT, // 77, LOW_SURROGATES 355 COMMON_SCRIPT, // 78, PRIVATE_USE 356 HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS 357 COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS 358 ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A 359 COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS 360 COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS 361 COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS 362 ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B 363 COMMON_SCRIPT, // 86, SPECIALS 364 COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS 365 COMMON_SCRIPT, // 88, SPECIALS 366 }; 367 368 // could be further reduced to a byte array, but I didn't bother. 369 static final int[][] split = { 370 {0x0250, 4, 5}, // -1 371 {0x02B0, 5, 6}, // -2 372 {0x0370, 7, 8}, // -3 373 {0x0530, 0, 10}, // -4 374 {0x0590, 10, 11}, // -5 375 {0x0750, 13, 0}, // -6 376 {0x07C0, 14, 0}, // -7 377 {0x10A0, 28, 29}, // -8 378 {0x13A0, 0, 32}, // -9 379 {0x16A0, 34, 35}, // -10 380 {0x18B0, 37, 0}, // -11 381 {0x2070, 40, 41}, // -12 382 {0x20A0, 41, -31}, // -13 383 {0x2150, 44, 45}, // -14 384 {0x2190, 45, 46}, // -15 385 {0x2440, 49, -32}, // -16 386 {0x25A0, 53, 54}, // -17 387 {0x27C0, 56, 0}, // -18 388 {0x2FE0, 59, -33}, // -19 389 {0x3040, 61, 62}, // -20 390 {0x30A0, 62, 63}, // -21 391 {0x3130, 64, 65}, // -22 392 {0x3190, 65, -34}, // -23 393 {0x4DB6, 70, 0}, // -24 394 {0xA490, 72, -35}, // -25 395 {0xD7A4, 74, 0}, // -26 396 {0xFB50, 80, 81}, // -27 397 {0xFE20, 0, -36}, // -28 398 {0xFEFF, 85, 86}, // -29 399 {0xFFF0, 87, -37}, // -30 400 {0x20D0, 42, 43}, // -31 401 {0x2460, 50, 51}, // -32 402 {0x2FF0, 0, 60}, // -33 403 {0x31A0, 66, -38}, // -34 404 {0xA4D0, 73, 0}, //-35 405 {0xFE30, 82, -39}, //-36 406 {0xFFFE, 88, 0}, //-37 407 {0x31C0, 67, 0}, // -38 408 {0xFE50, 83, -40}, //-39 409 {0xFE70, 84, 85} // -40 410 }; 411 412 static final byte[] charToBlock = { 413 1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7, 414 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27, 415 28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36, 416 37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39, 417 -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18, 418 57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19, 419 -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 420 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 421 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 422 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71, 423 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 424 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 425 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 426 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 427 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 428 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 429 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 430 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 431 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 432 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 433 72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0, 434 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74, 435 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 436 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 437 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 438 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 439 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26, 440 75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77, 441 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 442 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 443 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 444 78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30 445 }; 446 */ 447} 448