1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11package android.icu.dev.test.normalizer; 12 13import java.text.StringCharacterIterator; 14import java.util.Random; 15 16import org.junit.Test; 17 18import android.icu.dev.test.TestFmwk; 19import android.icu.impl.Norm2AllModes; 20import android.icu.impl.Normalizer2Impl; 21import android.icu.impl.USerializedSet; 22import android.icu.impl.Utility; 23import android.icu.lang.UCharacter; 24import android.icu.lang.UCharacterCategory; 25import android.icu.lang.UProperty; 26import android.icu.text.FilteredNormalizer2; 27import android.icu.text.Normalizer; 28import android.icu.text.Normalizer2; 29import android.icu.text.UCharacterIterator; 30import android.icu.text.UTF16; 31import android.icu.text.UnicodeSet; 32import android.icu.text.UnicodeSetIterator; 33 34 35public class BasicTest extends TestFmwk { 36 String[][] canonTests = { 37 // Input Decomposed Composed 38 { "cat", "cat", "cat" }, 39 { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", }, 40 41 { "\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above 42 { "D\u0307", "D\u0307", "\u1e0a" }, // D dot_above 43 44 { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above 45 { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below 46 { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above 47 48 { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above 49 { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below 50 51 { "\u1E14", 
"E\u0304\u0300", "\u1E14" }, // E-macron-grave 52 { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave 53 { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron 54 55 { "\u212b", "A\u030a", "\u00c5" }, // angstrom_sign 56 { "\u00c5", "A\u030a", "\u00c5" }, // A-ring 57 58 { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" }, 59 { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" }, 60 61 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0 62 { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0 63 64 { "Henry IV", "Henry IV", "Henry IV" }, 65 { "Henry \u2163", "Henry \u2163", "Henry \u2163" }, 66 67 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) 68 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten 69 { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten 70 { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten 71 { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten 72 73 { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" }, 74 {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"}, 75 }; 76 77 String[][] compatTests = { 78 // Input Decomposed Composed 79 { "cat", "cat", "cat" }, 80 { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, // Alef-Lamed vs. 
Alef, Lamed 81 82 { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" }, 83 { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i 84 85 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0 86 { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i 87 88 { "Henry IV", "Henry IV", "Henry IV" }, 89 { "Henry \u2163", "Henry IV", "Henry IV" }, 90 91 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) 92 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten 93 94 { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten 95 96 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/ 97 { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // hw_ka + hw_ten 98 { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka + hw_ten 99 100 }; 101 102 // With Canonical decomposition, Hangul syllables should get decomposed 103 // into Jamo, but Jamo characters should not be decomposed into 104 // conjoining Jamo 105 String[][] hangulCanon = { 106 // Input Decomposed Composed 107 { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" }, 108 { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" }, 109 }; 110 111 // With compatibility decomposition turned on, 112 // it should go all the way down to conjoining Jamo characters. 113 // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE 114 String[][] hangulCompat = { 115 // Input Decomposed Composed 116 // { "\ud4db", "\u1111\u116e\u1175\u11af\u11c2", "\ud478\u1175\u11af\u11c2" }, 117 }; 118 119 @Test 120 public void TestHangulCompose() 121 throws Exception{ 122 // Make sure that the static composition methods work 123 logln("Canonical composition..."); 124 staticTest(Normalizer.NFC, hangulCanon, 2); 125 logln("Compatibility composition..."); 126 staticTest(Normalizer.NFKC, hangulCompat, 2); 127 // Now try iterative composition.... 
128 logln("Iterative composition..."); 129 Normalizer norm = new Normalizer("", Normalizer.NFC,0); 130 iterateTest(norm, hangulCanon, 2); 131 132 norm.setMode(Normalizer.NFKD); 133 iterateTest(norm, hangulCompat, 2); 134 135 // And finally, make sure you can do it in reverse too 136 logln("Reverse iteration..."); 137 norm.setMode(Normalizer.NFC); 138 backAndForth(norm, hangulCanon); 139 } 140 141 @Test 142 public void TestHangulDecomp() throws Exception{ 143 // Make sure that the static decomposition methods work 144 logln("Canonical decomposition..."); 145 staticTest(Normalizer.NFD, hangulCanon, 1); 146 logln("Compatibility decomposition..."); 147 staticTest(Normalizer.NFKD, hangulCompat, 1); 148 149 // Now the iterative decomposition methods... 150 logln("Iterative decomposition..."); 151 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 152 iterateTest(norm, hangulCanon, 1); 153 154 norm.setMode(Normalizer.NFKD); 155 iterateTest(norm, hangulCompat, 1); 156 157 // And finally, make sure you can do it in reverse too 158 logln("Reverse iteration..."); 159 norm.setMode(Normalizer.NFD); 160 backAndForth(norm, hangulCanon); 161 } 162 @Test 163 public void TestNone() throws Exception{ 164 Normalizer norm = new Normalizer("", Normalizer.NONE,0); 165 iterateTest(norm, canonTests, 0); 166 staticTest(Normalizer.NONE, canonTests, 0); 167 } 168 @Test 169 public void TestDecomp() throws Exception{ 170 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 171 iterateTest(norm, canonTests, 1); 172 staticTest(Normalizer.NFD, canonTests, 1); 173 decomposeTest(Normalizer.NFD, canonTests, 1); 174 } 175 176 @Test 177 public void TestCompatDecomp() throws Exception{ 178 Normalizer norm = new Normalizer("", Normalizer.NFKD,0); 179 iterateTest(norm, compatTests, 1); 180 staticTest(Normalizer.NFKD,compatTests, 1); 181 decomposeTest(Normalizer.NFKD,compatTests, 1); 182 } 183 184 @Test 185 public void TestCanonCompose() throws Exception{ 186 Normalizer norm = new Normalizer("", 
Normalizer.NFC,0); 187 iterateTest(norm, canonTests, 2); 188 staticTest(Normalizer.NFC, canonTests, 2); 189 composeTest(Normalizer.NFC, canonTests, 2); 190 } 191 192 @Test 193 public void TestCompatCompose() throws Exception{ 194 Normalizer norm = new Normalizer("", Normalizer.NFKC,0); 195 iterateTest(norm, compatTests, 2); 196 staticTest(Normalizer.NFKC,compatTests, 2); 197 composeTest(Normalizer.NFKC,compatTests, 2); 198 } 199 200 @Test 201 public void TestExplodingBase() throws Exception{ 202 // \u017f - Latin small letter long s 203 // \u0307 - combining dot above 204 // \u1e61 - Latin small letter s with dot above 205 // \u1e9b - Latin small letter long s with dot above 206 String[][] canon = { 207 // Input Decomposed Composed 208 { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" }, 209 { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" }, 210 }; 211 String[][] compat = { 212 // Input Decomposed Composed 213 { "\u017f", "s", "s" }, 214 { "\u1e9b", "s\u0307", "\u1e61" }, 215 }; 216 217 staticTest(Normalizer.NFD, canon, 1); 218 staticTest(Normalizer.NFC, canon, 2); 219 220 staticTest(Normalizer.NFKD, compat, 1); 221 staticTest(Normalizer.NFKC, compat, 2); 222 223 } 224 225 /** 226 * The Tibetan vowel sign AA, 0f71, was messed up prior to 227 * Unicode version 2.1.9. 228 * Once 2.1.9 or 3.0 is released, uncomment this test. 229 */ 230 @Test 231 public void TestTibetan() throws Exception{ 232 String[][] decomp = { 233 { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" } 234 }; 235 String[][] compose = { 236 { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" } 237 }; 238 239 staticTest(Normalizer.NFD, decomp, 1); 240 staticTest(Normalizer.NFKD,decomp, 2); 241 staticTest(Normalizer.NFC, compose, 1); 242 staticTest(Normalizer.NFKC,compose, 2); 243 } 244 245 /** 246 * Make sure characters in the CompositionExclusion.txt list do not get 247 * composed to. 
248 */ 249 @Test 250 public void TestCompositionExclusion() 251 throws Exception{ 252 // This list is generated from CompositionExclusion.txt. 253 // Update whenever the normalizer tables are updated. Note 254 // that we test all characters listed, even those that can be 255 // derived from the Unicode DB and are therefore commented 256 // out. 257 String EXCLUDED = 258 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" + 259 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" + 260 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" + 261 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" + 262 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" + 263 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" + 264 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" + 265 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" + 266 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" + 267 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" + 268 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" + 269 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" + 270 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" + 271 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E"; 272 for (int i=0; i<EXCLUDED.length(); ++i) { 273 String a = String.valueOf(EXCLUDED.charAt(i)); 274 String b = Normalizer.normalize(a, Normalizer.NFKD); 275 String c = Normalizer.normalize(b, Normalizer.NFC); 276 if (c.equals(a)) { 277 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 278 hex(b) + " x COMPOSE => " + 279 hex(c)); 280 } else if (isVerbose()) { 281 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 282 hex(b) + " x COMPOSE => " + 283 hex(c)); 284 } 285 } 286 // The following method works too, but it is somewhat 287 // incestuous. It uses UInfo, which is the same database that 288 // NormalizerBuilder uses, so if something is wrong with 289 // UInfo, the following test won't show it. All it will show 290 // is that NormalizerBuilder has been run with whatever the 291 // current UInfo is. 
292 // 293 // We comment this out in favor of the test above, which 294 // provides independent verification (but also requires 295 // independent updating). 296// logln("---"); 297// UInfo uinfo = new UInfo(); 298// for (int i=0; i<=0xFFFF; ++i) { 299// if (!uinfo.isExcludedComposition((char)i) || 300// (!uinfo.hasCanonicalDecomposition((char)i) && 301// !uinfo.hasCompatibilityDecomposition((char)i))) continue; 302// String a = String.valueOf((char)i); 303// String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0); 304// String c = Normalizer.normalize(b,Normalizer.COMPOSE,0); 305// if (c.equals(a)) { 306// errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 307// hex(b) + " x COMPOSE => " + 308// hex(c)); 309// } else if (isVerbose()) { 310// logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 311// hex(b) + " x COMPOSE => " + 312// hex(c)); 313// } 314// } 315 } 316 317 /** 318 * Test for a problem that showed up just before ICU 1.6 release 319 * having to do with combining characters with an index of zero. 320 * Such characters do not participate in any canonical 321 * decompositions. However, having an index of zero means that 322 * they all share one typeMask[] entry, that is, they all have to 323 * map to the same canonical class, which is not the case, in 324 * reality. 
325 */ 326 @Test 327 public void TestZeroIndex() 328 throws Exception{ 329 String[] DATA = { 330 // Expect col1 x COMPOSE_COMPAT => col2 331 // Expect col2 x DECOMP => col3 332 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300", 333 "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300", 334 "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300", 335 "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327", 336 "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321", 337 }; 338 339 for (int i=0; i<DATA.length; i+=3) { 340 String a = DATA[i]; 341 String b = Normalizer.normalize(a, Normalizer.NFKC); 342 String exp = DATA[i+1]; 343 if (b.equals(exp)) { 344 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b)); 345 } else { 346 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) + 347 ", expect " + hex(exp)); 348 } 349 a = Normalizer.normalize(b, Normalizer.NFD); 350 exp = DATA[i+2]; 351 if (a.equals(exp)) { 352 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a)); 353 } else { 354 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) + 355 ", expect " + hex(exp)); 356 } 357 } 358 } 359 360 /** 361 * Test for a problem found by Verisign. Problem is that 362 * characters at the start of a string are not put in canonical 363 * order correctly by compose() if there is no starter. 
364 */ 365 @Test 366 public void TestVerisign() 367 throws Exception{ 368 String[] inputs = { 369 "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f", 370 "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad" 371 }; 372 String[] outputs = { 373 "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f", 374 "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4" 375 }; 376 377 for (int i = 0; i < inputs.length; ++i) { 378 String input = inputs[i]; 379 String output = outputs[i]; 380 String result = Normalizer.decompose(input, false); 381 if (!result.equals(output)) { 382 errln("FAIL input: " + hex(input)); 383 errln(" decompose: " + hex(result)); 384 errln(" expected: " + hex(output)); 385 } 386 result = Normalizer.compose(input, false); 387 if (!result.equals(output)) { 388 errln("FAIL input: " + hex(input)); 389 errln(" compose: " + hex(result)); 390 errln(" expected: " + hex(output)); 391 } 392 } 393 394 } 395 @Test 396 public void TestQuickCheckResultNO() 397 throws Exception{ 398 final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C, 399 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E}; 400 final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB, 401 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E}; 402 final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE, 403 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 404 final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE, 405 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 406 407 408 final int SIZE = 10; 409 410 int count = 0; 411 for (; count < SIZE; count ++) 412 { 413 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 414 Normalizer.NFD,0) != Normalizer.NO) 415 { 416 errln("ERROR in NFD quick check at U+" + 417 Integer.toHexString(CPNFD[count])); 418 return; 419 } 420 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 421 Normalizer.NFC,0) !=Normalizer.NO) 422 { 423 errln("ERROR in NFC quick check at U+"+ 424 Integer.toHexString(CPNFC[count])); 425 return; 426 } 427 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 
428 Normalizer.NFKD,0) != Normalizer.NO) 429 { 430 errln("ERROR in NFKD quick check at U+"+ 431 Integer.toHexString(CPNFKD[count])); 432 return; 433 } 434 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 435 Normalizer.NFKC,0) !=Normalizer.NO) 436 { 437 errln("ERROR in NFKC quick check at U+"+ 438 Integer.toHexString(CPNFKC[count])); 439 return; 440 } 441 // for improving coverage 442 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 443 Normalizer.NFKC) !=Normalizer.NO) 444 { 445 errln("ERROR in NFKC quick check at U+"+ 446 Integer.toHexString(CPNFKC[count])); 447 return; 448 } 449 } 450 } 451 452 453 @Test 454 public void TestQuickCheckResultYES() 455 throws Exception{ 456 final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A, 457 0x2261, 0x3075, 0x4000, 0x5000, 0xF000}; 458 final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500, 459 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000}; 460 final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB, 461 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27}; 462 final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000, 463 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E}; 464 465 final int SIZE = 10; 466 int count = 0; 467 468 char cp = 0; 469 while (cp < 0xA0) 470 { 471 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0) 472 != Normalizer.YES) 473 { 474 errln("ERROR in NFD quick check at U+"+ 475 Integer.toHexString(cp)); 476 return; 477 } 478 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0) 479 != Normalizer.YES) 480 { 481 errln("ERROR in NFC quick check at U+"+ 482 Integer.toHexString(cp)); 483 return; 484 } 485 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0) 486 != Normalizer.YES) 487 { 488 errln("ERROR in NFKD quick check at U+" + 489 Integer.toHexString(cp)); 490 return; 491 } 492 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0) 493 != Normalizer.YES) 494 { 495 errln("ERROR in NFKC quick check at U+"+ 496 Integer.toHexString(cp)); 497 return; 498 } 
499 // improve the coverage 500 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC) 501 != Normalizer.YES) 502 { 503 errln("ERROR in NFKC quick check at U+"+ 504 Integer.toHexString(cp)); 505 return; 506 } 507 cp++; 508 } 509 510 for (; count < SIZE; count ++) 511 { 512 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 513 Normalizer.NFD,0)!=Normalizer.YES) 514 { 515 errln("ERROR in NFD quick check at U+"+ 516 Integer.toHexString(CPNFD[count])); 517 return; 518 } 519 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 520 Normalizer.NFC,0)!=Normalizer.YES) 521 { 522 errln("ERROR in NFC quick check at U+"+ 523 Integer.toHexString(CPNFC[count])); 524 return; 525 } 526 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 527 Normalizer.NFKD,0)!=Normalizer.YES) 528 { 529 errln("ERROR in NFKD quick check at U+"+ 530 Integer.toHexString(CPNFKD[count])); 531 return; 532 } 533 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 534 Normalizer.NFKC,0)!=Normalizer.YES) 535 { 536 errln("ERROR in NFKC quick check at U+"+ 537 Integer.toHexString(CPNFKC[count])); 538 return; 539 } 540 // improve the coverage 541 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 542 Normalizer.NFKC)!=Normalizer.YES) 543 { 544 errln("ERROR in NFKC quick check at U+"+ 545 Integer.toHexString(CPNFKC[count])); 546 return; 547 } 548 } 549 } 550 @Test 551 public void TestBengali() throws Exception{ 552 String input = "\u09bc\u09be\u09cd\u09be"; 553 String output=Normalizer.normalize(input,Normalizer.NFC); 554 if(!input.equals(output)){ 555 errln("ERROR in NFC of string"); 556 } 557 } 558 @Test 559 public void TestQuickCheckResultMAYBE() 560 throws Exception{ 561 562 final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161, 563 0x116A, 0x1173, 0x1175, 0x3099, 0x309A}; 564 final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E, 565 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099}; 566 567 568 final int SIZE = 10; 569 570 int count = 0; 571 572 /* NFD and NFKD does 
not have any MAYBE codepoints */ 573 for (; count < SIZE; count ++) 574 { 575 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 576 Normalizer.NFC,0)!=Normalizer.MAYBE) 577 { 578 errln("ERROR in NFC quick check at U+"+ 579 Integer.toHexString(CPNFC[count])); 580 return; 581 } 582 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 583 Normalizer.NFKC,0)!=Normalizer.MAYBE) 584 { 585 errln("ERROR in NFKC quick check at U+"+ 586 Integer.toHexString(CPNFKC[count])); 587 return; 588 } 589 if (Normalizer.quickCheck(new char[]{CPNFC[count]}, 590 Normalizer.NFC,0)!=Normalizer.MAYBE) 591 { 592 errln("ERROR in NFC quick check at U+"+ 593 Integer.toHexString(CPNFC[count])); 594 return; 595 } 596 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 597 Normalizer.NFKC,0)!=Normalizer.MAYBE) 598 { 599 errln("ERROR in NFKC quick check at U+"+ 600 Integer.toHexString(CPNFKC[count])); 601 return; 602 } 603 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 604 Normalizer.NONE,0)!=Normalizer.YES) 605 { 606 errln("ERROR in NONE quick check at U+"+ 607 Integer.toHexString(CPNFKC[count])); 608 return; 609 } 610 } 611 } 612 613 @Test 614 public void TestQuickCheckStringResult() 615 throws Exception{ 616 int count; 617 String d; 618 String c; 619 620 for (count = 0; count < canonTests.length; count ++) 621 { 622 d = canonTests[count][1]; 623 c = canonTests[count][2]; 624 if (Normalizer.quickCheck(d,Normalizer.NFD,0) 625 != Normalizer.YES) 626 { 627 errln("ERROR in NFD quick check for string at count " + count); 628 return; 629 } 630 631 if (Normalizer.quickCheck(c, Normalizer.NFC,0) 632 == Normalizer.NO) 633 { 634 errln("ERROR in NFC quick check for string at count " + count); 635 return; 636 } 637 } 638 639 for (count = 0; count < compatTests.length; count ++) 640 { 641 d = compatTests[count][1]; 642 c = compatTests[count][2]; 643 if (Normalizer.quickCheck(d, Normalizer.NFKD,0) 644 != Normalizer.YES) 645 { 646 errln("ERROR in NFKD quick check for string at count " + count); 
647 return; 648 } 649 650 if (Normalizer.quickCheck(c, Normalizer.NFKC,0) 651 != Normalizer.YES) 652 { 653 errln("ERROR in NFKC quick check for string at count " + count); 654 return; 655 } 656 } 657 } 658 659 static final int qcToInt(Normalizer.QuickCheckResult qc) { 660 if(qc==Normalizer.NO) { 661 return 0; 662 } else if(qc==Normalizer.YES) { 663 return 1; 664 } else /* Normalizer.MAYBE */ { 665 return 2; 666 } 667 } 668 669 @Test 670 public void TestQuickCheckPerCP() { 671 int c, lead, trail; 672 String s, nfd; 673 int lccc1, lccc2, tccc1, tccc2; 674 int qc1, qc2; 675 676 if( 677 UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES 678 UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 || 679 UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE 680 UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 || 681 UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) || 682 UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) 683 ) { 684 errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS"); 685 } 686 687 /* 688 * compare the quick check property values for some code points 689 * to the quick check results for checking same-code point strings 690 */ 691 c=0; 692 while(c<0x110000) { 693 s=UTF16.valueOf(c); 694 695 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK); 696 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC)); 697 if(qc1!=qc2) { 698 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c)); 699 } 700 701 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK); 702 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD)); 703 if(qc1!=qc2) { 704 
errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c)); 705 } 706 707 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK); 708 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC)); 709 if(qc1!=qc2) { 710 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c)); 711 } 712 713 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK); 714 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD)); 715 if(qc1!=qc2) { 716 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c)); 717 } 718 719 nfd=Normalizer.normalize(s, Normalizer.NFD); 720 lead=UTF16.charAt(nfd, 0); 721 trail=UTF16.charAt(nfd, nfd.length()-1); 722 723 lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS); 724 lccc2=UCharacter.getCombiningClass(lead); 725 tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 726 tccc2=UCharacter.getCombiningClass(trail); 727 728 if(lccc1!=lccc2) { 729 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c)); 730 } 731 if(tccc1!=tccc2) { 732 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c)); 733 } 734 735 /* skip some code points */ 736 c=(20*c)/19+1; 737 } 738 } 739 740 //------------------------------------------------------------------------ 741 // Internal utilities 742 // 743 //------------------------------------------------------------------------ 744 // Internal utilities 745 // 746 747/* private void backAndForth(Normalizer iter, String input) 748 { 749 iter.setText(input); 750 751 // Run through the iterator forwards and stick it into a StringBuffer 752 StringBuffer forward = new StringBuffer(); 753 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { 754 forward.append(ch); 755 } 756 757 // Now do it backwards 758 
StringBuffer reverse = new StringBuffer();
        for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
            reverse.insert(0, ch);
        }

        if (!forward.toString().equals(reverse.toString())) {
            errln("FAIL: Forward/reverse mismatch for input " + hex(input)
                  + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
        } else if (isVerbose()) {
            logln("Ok: Forward/reverse for input " + hex(input)
                  + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
        }
    }*/

    /**
     * For every row, feeds the input column to {@code iter}, collects the
     * normalized text once iterating forwards and once iterating backwards,
     * and reports an error if the two results differ.
     *
     * @param iter  normalizer whose mode has already been configured by the caller
     * @param tests table whose column 0 holds the input strings
     */
    private void backAndForth(Normalizer iter, String[][] tests)
    {
        for (int i = 0; i < tests.length; i++)
        {
            iter.setText(tests[i][0]);

            // Run through the iterator forwards and stick it into a
            // StringBuffer. The iterator hands out int values, so they must
            // be appended as code points: append(int) would store the
            // decimal digits of the number instead of the character.
            StringBuffer forward =  new StringBuffer();
            for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
                forward.appendCodePoint(ch);
            }

            // Now do it backwards, prepending each code point as UTF-16
            // (insert(int, int) has the same decimal-digits pitfall).
            StringBuffer reverse = new StringBuffer();
            for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
                reverse.insert(0, Character.toChars(ch));
            }

            if (!forward.toString().equals(reverse.toString())) {
                errln("FAIL: Forward/reverse mismatch for input "
                        + hex(tests[i][0]) + ", forward: " + hex(forward)
                        + ", backward: " + hex(reverse));
            } else if (isVerbose()) {
                logln("Ok: Forward/reverse for input " + hex(tests[i][0])
                        + ", forward: " + hex(forward) + ", backward: "
                        + hex(reverse));
            }
        }
    }

    /**
     * Runs the static {@code Normalizer.normalize} entry points over a table:
     * first the String overload, then the char[] overload with a destination
     * buffer that starts at length 1 and is re-allocated on demand.
     *
     * @param mode   normalization mode to apply
     * @param tests  table of rows; column 0 is the input
     * @param outCol index of the column holding the expected result
     */
    private void staticTest (Normalizer.Mode mode,
                             String[][] tests, int outCol) throws Exception{
        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            String output = Normalizer.normalize(input, mode);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + output + "' (" + hex(output) + ")" );
            }
        }
        // The buffer is shared across rows, so it may be longer than the
        // current result; only its first reqLength chars are meaningful.
        char[] output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect =Utility.unescape( tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                        hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.normalize(input,output, mode,0);
                    if(reqLength<=output.length    ){
                        break;
                    }
                }catch(IndexOutOfBoundsException e){
                    // The message is parsed as the capacity to allocate
                    // (presumably the required length reported by the API).
                    output= new char[Integer.parseInt(e.getMessage())];
                    continue;
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                // Report only the produced chars, not stale buffer contents.
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + actual
                    + "' ("  + hex(actual) + ")" );
            }
        }
    }

    /**
     * Runs the {@code Normalizer.decompose} overloads over a table: String,
     * char[] to char[], char[] ranges, and finally char[] ranges writing at a
     * non-zero destination offset.
     *
     * @param mode   NFKD selects compatibility decomposition; any other mode selects canonical
     * @param tests  table of rows; column 0 is the input
     * @param outCol index of the column holding the expected result
     */
    private void decomposeTest(Normalizer.Mode mode,
                             String[][] tests, int outCol) throws Exception{
        final boolean compat = mode==Normalizer.NFKD;

        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            String output = Normalizer.decompose(input, compat);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + output + "' (" + hex(output) + ")" );
            }
        }
        // Shared, growable destination buffer; only the first reqLength chars
        // belong to the current row.
        char[] output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                        hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.decompose(input,output, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                }catch(IndexOutOfBoundsException e){
                    // The message is parsed as the capacity to allocate.
                    output= new char[Integer.parseInt(e.getMessage())];
                    continue;
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + actual
                    + "' ("  + hex(actual) + ")" );
            }
        }
        output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                        hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                }catch(IndexOutOfBoundsException e){
                    // The message is parsed as the capacity to allocate.
                    output= new char[Integer.parseInt(e.getMessage())];
                    continue;
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + actual
                    + "' ("  + hex(actual) + ")" );
            }
            // Decompose the same input again, this time writing behind a copy
            // of the first result, and check that the reported length matches.
            // The flag must be the same compat setting as above: comparing the
            // mode against NFKC here would always yield false in this method.
            char[] output2 = new char[reqLength * 2];
            System.arraycopy(output, 0, output2, 0, reqLength);
            int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, compat,0);
            if(retLength != reqLength){
                // NOTE(review): a length mismatch is only logged, not counted as an error.
                logln("FAIL: Normalizer.decompose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
            }
        }
    }

    /**
     * Runs the {@code Normalizer.compose} overloads over a table: String,
     * char[] to char[], char[] ranges, and finally char[] ranges writing at a
     * non-zero destination offset.
     *
     * @param mode   NFKC selects compatibility composition; any other mode selects canonical
     * @param tests  table of rows; column 0 is the input
     * @param outCol index of the column holding the expected result
     */
    private void composeTest(Normalizer.Mode mode,
                             String[][] tests, int outCol) throws Exception{
        final boolean compat = mode==Normalizer.NFKC;

        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            String output = Normalizer.compose(input, compat);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + output + "' (" + hex(output) + ")" );
            }
        }
        // Shared, growable destination buffer; only the first reqLength chars
        // belong to the current row.
        char[] output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                        hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.compose(input,output, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                }catch(IndexOutOfBoundsException e){
                    // The message is parsed as the capacity to allocate.
                    output= new char[Integer.parseInt(e.getMessage())];
                    continue;
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + actual
                    + "' ("  + hex(actual) + ")" );
            }
        }
        output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                        hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                }catch(IndexOutOfBoundsException e){
                    // The message is parsed as the capacity to allocate.
                    output= new char[Integer.parseInt(e.getMessage())];
                    continue;
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + hex(expect) + ")"
                    + " but got '" + actual
                    + "' ("  + hex(actual) + ")" );
            }

            // Compose the same input again, this time writing behind a copy
            // of the first result, and check that the reported length matches.
            char[] output2 = new char[reqLength * 2];
            System.arraycopy(output, 0, output2, 0, reqLength);
            int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, compat,0);
            if(retLength != reqLength){
                // NOTE(review): a length mismatch is only logged, not counted as an error.
                logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
            }
        }
    }

    /**
     * Feeds each row's input to the iterator and compares what it yields
     * against the expected column via assertEqual.
     *
     * @param iter   normalizer whose mode has already been configured by the caller
     * @param tests  table of rows; column 0 is the input
     * @param outCol index of the column holding the expected result
     */
    private void iterateTest(Normalizer iter, String[][] tests, int outCol){
        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            iter.setText(input);
            assertEqual(expect, iter, "case " + i + " ");
        }
    }

    /**
     * Walks {@code iter} forwards and checks each value against the code
     * points of {@code expected}, then walks it backwards against
     * {@code expected} read in reverse.
     *
     * @param expected text the iterator should produce
     * @param iter     normalizer positioned at the start of its text
     * @param msg      prefix for failure messages
     */
    private void assertEqual(String expected, Normalizer iter, String msg)
    {
        int index = 0;
        int ch;
        UCharacterIterator cIter =  UCharacterIterator.getInstance(expected);

        while ((ch=iter.next())!= Normalizer.DONE){
            if (index >= expected.length()) {
                errln("FAIL: " + msg + "Unexpected character '" + (char)ch
                        + "' (" + hex(ch) + ")"
                        + " at index " + index);
                break;
            }
            int want = UTF16.charAt(expected,index);
            if (ch != want) {
                errln("FAIL: " + msg + "got '" + (char)ch
                        + "' (" + hex(ch) + ")"
                        + " but expected '" + want + "' (" + hex(want)+ ")"
                        + " at index " + index);
            }
            // Advance by one or two UTF-16 units depending on the code point.
            index+=  UTF16.getCharCount(ch);
        }
        if (index < expected.length()) {
            errln("FAIL: " + msg + "Only got " + index + " chars, expected "
            +
expected.length()); 1044 } 1045 1046 cIter.setToLimit(); 1047 while((ch=iter.previous())!=Normalizer.DONE){ 1048 int want = cIter.previousCodePoint(); 1049 if (ch != want ) { 1050 errln("FAIL: " + msg + "got '" + (char)ch 1051 + "' (" + hex(ch) + ")" 1052 + " but expected '" + want + "' (" + hex(want) + ")" 1053 + " at index " + index); 1054 } 1055 } 1056 } 1057 //-------------------------------------------------------------------------- 1058 1059 // NOTE: These tests are used for quick debugging so are not ported 1060 // to ICU4C tsnorm.cpp in intltest 1061 // 1062 1063 @Test 1064 public void TestDebugStatic(){ 1065 String in = Utility.unescape("\\U0001D157\\U0001D165"); 1066 if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){ 1067 errln("isNormalized failed"); 1068 } 1069 1070 String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1071 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1072 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1073 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1074 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1075 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1076 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1077 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1078 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1079 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1080 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1081 "d\u031B\u0307\u0323"; 1082 String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ 1083 "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+ 1084 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1085 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1086 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1087 
"\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1088 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1089 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1090 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1091 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1092 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1093 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1094 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1095 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1096 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1097 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1098 "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+ 1099 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1100 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1101 "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+ 1102 "cccccccccccccccccccccccccccccccccccccccccccccccc"+ 1103 "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1104 "dddddddddddddddddddddddd"+ 1105 "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ 1106 "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307"; 1107 String output = Normalizer.normalize(Utility.unescape(input), 1108 Normalizer.NFD); 1109 if(!expect.equals(output)){ 1110 errln("FAIL expected: "+hex(expect) + " got: "+hex(output)); 1111 } 1112 1113 1114 1115 } 1116 @Test 1117 public void TestDebugIter(){ 1118 String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1119 String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1120 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)), 1121 Normalizer.NONE,0); 1122 int index = 0; 1123 int ch; 1124 UCharacterIterator cIter = UCharacterIterator.getInstance(expected); 1125 1126 while ((ch=iter.next())!= Normalizer.DONE){ 1127 if (index >= expected.length()) { 1128 errln("FAIL: " + "Unexpected character '" + (char)ch 1129 + "' (" + hex(ch) + ")" 1130 + " at index " + index); 
1131 break; 1132 } 1133 int want = UTF16.charAt(expected,index); 1134 if (ch != want) { 1135 errln("FAIL: " + "got '" + (char)ch 1136 + "' (" + hex(ch) + ")" 1137 + " but expected '" + want + "' (" + hex(want)+ ")" 1138 + " at index " + index); 1139 } 1140 index+= UTF16.getCharCount(ch); 1141 } 1142 if (index < expected.length()) { 1143 errln("FAIL: " + "Only got " + index + " chars, expected " 1144 + expected.length()); 1145 } 1146 1147 cIter.setToLimit(); 1148 while((ch=iter.previous())!=Normalizer.DONE){ 1149 int want = cIter.previousCodePoint(); 1150 if (ch != want ) { 1151 errln("FAIL: " + "got '" + (char)ch 1152 + "' (" + hex(ch) + ")" 1153 + " but expected '" + want + "' (" + hex(want) + ")" 1154 + " at index " + index); 1155 } 1156 } 1157 } 1158 @Test 1159 public void TestDebugIterOld(){ 1160 String input = "\\U0001D15E"; 1161 String expected = "\uD834\uDD57\uD834\uDD65"; 1162 String expectedReverse = "\uD834\uDD65\uD834\uDD57"; 1163 int index = 0; 1164 int ch; 1165 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)), 1166 Normalizer.NFKC,0); 1167 StringBuffer got = new StringBuffer(); 1168 for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next()) 1169 { 1170 if (index >= expected.length()) { 1171 errln("FAIL: " + "Unexpected character '" + (char)ch + 1172 "' (" + hex(ch) + ")" + " at index " + index); 1173 break; 1174 } 1175 got.append(UCharacter.toString(ch)); 1176 index++; 1177 } 1178 if (!expected.equals(got.toString())) { 1179 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1180 + " but expected '" + expected + "' (" 1181 + hex(expected) + ")"); 1182 } 1183 if (got.length() < expected.length()) { 1184 errln("FAIL: " + "Only got " + index + " chars, expected " 1185 + expected.length()); 1186 } 1187 1188 logln("Reverse Iteration\n"); 1189 iter.setIndexOnly(iter.endIndex()); 1190 got.setLength(0); 1191 for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){ 1192 if (index >= expected.length()) { 1193 
errln("FAIL: " + "Unexpected character '" + (char)ch 1194 + "' (" + hex(ch) + ")" + " at index " + index); 1195 break; 1196 } 1197 got.append(UCharacter.toString(ch)); 1198 } 1199 if (!expectedReverse.equals(got.toString())) { 1200 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1201 + " but expected '" + expected 1202 + "' (" + hex(expected) + ")"); 1203 } 1204 if (got.length() < expected.length()) { 1205 errln("FAIL: " + "Only got " + index + " chars, expected " 1206 + expected.length()); 1207 } 1208 1209 } 1210 //-------------------------------------------------------------------------- 1211 // helper class for TestPreviousNext() 1212 // simple UTF-32 character iterator 1213 class UCharIterator { 1214 1215 public UCharIterator(int[] src, int len, int index){ 1216 1217 s=src; 1218 length=len; 1219 i=index; 1220 } 1221 1222 public int current() { 1223 if(i<length) { 1224 return s[i]; 1225 } else { 1226 return -1; 1227 } 1228 } 1229 1230 public int next() { 1231 if(i<length) { 1232 return s[i++]; 1233 } else { 1234 return -1; 1235 } 1236 } 1237 1238 public int previous() { 1239 if(i>0) { 1240 return s[--i]; 1241 } else { 1242 return -1; 1243 } 1244 } 1245 1246 public int getIndex() { 1247 return i; 1248 } 1249 1250 private int[] s; 1251 private int length, i; 1252 } 1253 @Test 1254 public void TestPreviousNext() { 1255 // src and expect strings 1256 char src[]={ 1257 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1258 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1259 0xc4, 1260 0x1ed0 1261 }; 1262 int expect[]={ 1263 0x831d, 1264 0x1d158, 0x1d165, 1265 0x41, 0x308, 1266 0x4f, 0x302, 0x301 1267 }; 1268 1269 // expected src indexes corresponding to expect indexes 1270 int expectIndex[]={ 1271 0, 1272 2, 2, 1273 4, 4, 1274 5, 5, 5, 1275 6 // behind last character 1276 }; 1277 1278 // initial indexes into the src and expect strings 1279 1280 final int SRC_MIDDLE=4; 1281 final int EXPECT_MIDDLE=3; 1282 1283 1284 // 
movement vector 1285 // - for previous(), 0 for current(), + for next() 1286 // not const so that we can terminate it below for the error message 1287 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1288 1289 // iterators 1290 Normalizer iter = new Normalizer(new String(src), 1291 Normalizer.NFD,0); 1292 UCharIterator iter32 = new UCharIterator(expect, expect.length, 1293 EXPECT_MIDDLE); 1294 1295 int c1, c2; 1296 char m; 1297 1298 // initially set the indexes into the middle of the strings 1299 iter.setIndexOnly(SRC_MIDDLE); 1300 1301 // move around and compare the iteration code points with 1302 // the expected ones 1303 int movesIndex =0; 1304 while(movesIndex<moves.length()) { 1305 m=moves.charAt(movesIndex++); 1306 if(m=='-') { 1307 c1=iter.previous(); 1308 c2=iter32.previous(); 1309 } else if(m=='0') { 1310 c1=iter.current(); 1311 c2=iter32.current(); 1312 } else /* m=='+' */ { 1313 c1=iter.next(); 1314 c2=iter32.next(); 1315 } 1316 1317 // compare results 1318 if(c1!=c2) { 1319 // copy the moves until the current (m) move, and terminate 1320 String history = moves.substring(0,movesIndex); 1321 errln("error: mismatch in Normalizer iteration at "+history+": " 1322 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1323 break; 1324 } 1325 1326 // compare indexes 1327 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1328 // copy the moves until the current (m) move, and terminate 1329 String history = moves.substring(0,movesIndex); 1330 errln("error: index mismatch in Normalizer iteration at " 1331 +history+ " : "+ "Normalizer index " +iter.getIndex() 1332 +" expected "+ expectIndex[iter32.getIndex()]); 1333 break; 1334 } 1335 } 1336 } 1337 // Only in ICU4j 1338 @Test 1339 public void TestPreviousNextJCI() { 1340 // src and expect strings 1341 char src[]={ 1342 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1343 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1344 0xc4, 1345 0x1ed0 1346 }; 1347 int expect[]={ 1348 
0x831d, 1349 0x1d158, 0x1d165, 1350 0x41, 0x308, 1351 0x4f, 0x302, 0x301 1352 }; 1353 1354 // expected src indexes corresponding to expect indexes 1355 int expectIndex[]={ 1356 0, 1357 2, 2, 1358 4, 4, 1359 5, 5, 5, 1360 6 // behind last character 1361 }; 1362 1363 // initial indexes into the src and expect strings 1364 1365 final int SRC_MIDDLE=4; 1366 final int EXPECT_MIDDLE=3; 1367 1368 1369 // movement vector 1370 // - for previous(), 0 for current(), + for next() 1371 // not const so that we can terminate it below for the error message 1372 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1373 1374 // iterators 1375 StringCharacterIterator text = new StringCharacterIterator(new String(src)); 1376 Normalizer iter = new Normalizer(text,Normalizer.NFD,0); 1377 UCharIterator iter32 = new UCharIterator(expect, expect.length, 1378 EXPECT_MIDDLE); 1379 1380 int c1, c2; 1381 char m; 1382 1383 // initially set the indexes into the middle of the strings 1384 iter.setIndexOnly(SRC_MIDDLE); 1385 1386 // move around and compare the iteration code points with 1387 // the expected ones 1388 int movesIndex =0; 1389 while(movesIndex<moves.length()) { 1390 m=moves.charAt(movesIndex++); 1391 if(m=='-') { 1392 c1=iter.previous(); 1393 c2=iter32.previous(); 1394 } else if(m=='0') { 1395 c1=iter.current(); 1396 c2=iter32.current(); 1397 } else /* m=='+' */ { 1398 c1=iter.next(); 1399 c2=iter32.next(); 1400 } 1401 1402 // compare results 1403 if(c1!=c2) { 1404 // copy the moves until the current (m) move, and terminate 1405 String history = moves.substring(0,movesIndex); 1406 errln("error: mismatch in Normalizer iteration at "+history+": " 1407 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1408 break; 1409 } 1410 1411 // compare indexes 1412 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1413 // copy the moves until the current (m) move, and terminate 1414 String history = moves.substring(0,movesIndex); 1415 errln("error: index mismatch in Normalizer iteration at " 
1416 +history+ " : "+ "Normalizer index " +iter.getIndex() 1417 +" expected "+ expectIndex[iter32.getIndex()]); 1418 break; 1419 } 1420 } 1421 } 1422 1423 // test APIs that are not otherwise used - improve test coverage 1424 @Test 1425 public void TestNormalizerAPI() throws Exception { 1426 try{ 1427 // instantiate a Normalizer from a CharacterIterator 1428 String s=Utility.unescape("a\u0308\uac00\\U0002f800"); 1429 // make s a bit longer and more interesting 1430 UCharacterIterator iter = UCharacterIterator.getInstance(s+s); 1431 Normalizer norm = new Normalizer(iter, Normalizer.NFC,0); 1432 if(norm.next()!=0xe4) { 1433 errln("error in Normalizer(CharacterIterator).next()"); 1434 } 1435 1436 // test clone(), ==, and hashCode() 1437 Normalizer clone=(Normalizer)norm.clone(); 1438 if(clone.equals(norm)) { 1439 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm"); 1440 } 1441 1442 if(clone.getLength()!= norm.getLength()){ 1443 errln("error in Normalizer.getBeginIndex()"); 1444 } 1445 // clone must have the same hashCode() 1446 //if(clone.hashCode()!=norm.hashCode()) { 1447 // errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()"); 1448 //} 1449 if(clone.next()!=0xac00) { 1450 errln("error in Normalizer(Normalizer(CharacterIterator)).next()"); 1451 } 1452 int ch = clone.next(); 1453 if(ch!=0x4e3d) { 1454 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()"); 1455 } 1456 // position changed, must change hashCode() 1457 if(clone.hashCode()==norm.hashCode()) { 1458 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()"); 1459 } 1460 1461 // test compose() and decompose() 1462 StringBuffer tel; 1463 String nfkc, nfkd; 1464 tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121"); 1465 tel.insert(1,(char)0x0301); 1466 1467 nfkc=Normalizer.compose(tel.toString(), true); 1468 nfkd=Normalizer.decompose(tel.toString(), 
true); 1469 if( 1470 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))|| 1471 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL")) 1472 ) { 1473 errln("error in Normalizer::(de)compose(): wrong result(s)"); 1474 } 1475 1476 // test setIndex() 1477 ch=norm.setIndex(3); 1478 if(ch!=0x4e3d) { 1479 errln("error in Normalizer(CharacterIterator).setIndex(3)"); 1480 } 1481 1482 // test setText(CharacterIterator) and getText() 1483 String out, out2; 1484 clone.setText(iter); 1485 1486 out = clone.getText(); 1487 out2 = iter.getText(); 1488 if( !out.equals(out2) || 1489 clone.startIndex()!=0|| 1490 clone.endIndex()!=iter.getLength() 1491 ) { 1492 errln("error in Normalizer::setText() or Normalizer::getText()"); 1493 } 1494 1495 char[] fillIn1 = new char[clone.getLength()]; 1496 char[] fillIn2 = new char[iter.getLength()]; 1497 int len = clone.getText(fillIn1); 1498 iter.getText(fillIn2,0); 1499 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1500 errln("error in Normalizer.getText(). 
Normalizer: "+ 1501 Utility.hex(new String(fillIn1))+ 1502 " Iter: " + Utility.hex(new String(fillIn2))); 1503 } 1504 1505 clone.setText(fillIn1); 1506 len = clone.getText(fillIn2); 1507 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1508 errln("error in Normalizer.setText() or Normalizer.getText()"+ 1509 Utility.hex(new String(fillIn1))+ 1510 " Iter: " + Utility.hex(new String(fillIn2))); 1511 } 1512 1513 // test setText(UChar *), getUMode() and setMode() 1514 clone.setText(s); 1515 clone.setIndexOnly(1); 1516 clone.setMode(Normalizer.NFD); 1517 if(clone.getMode()!=Normalizer.NFD) { 1518 errln("error in Normalizer::setMode() or Normalizer::getMode()"); 1519 } 1520 if(clone.next()!=0x308 || clone.next()!=0x1100) { 1521 errln("error in Normalizer::setText() or Normalizer::setMode()"); 1522 } 1523 1524 // test last()/previous() with an internal buffer overflow 1525 StringBuffer buf = new StringBuffer("aaaaaaaaaa"); 1526 buf.setCharAt(10-1,'\u0308'); 1527 clone.setText(buf); 1528 if(clone.last()!=0x308) { 1529 errln("error in Normalizer(10*U+0308).last()"); 1530 } 1531 1532 // test UNORM_NONE 1533 norm.setMode(Normalizer.NONE); 1534 if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) { 1535 errln("error in Normalizer(UNORM_NONE).first()/next()/last()"); 1536 } 1537 out=Normalizer.normalize(s, Normalizer.NONE); 1538 if(!out.equals(s)) { 1539 errln("error in Normalizer::normalize(UNORM_NONE)"); 1540 } 1541 ch = 0x1D15E; 1542 String exp = "\\U0001D157\\U0001D165"; 1543 String ns = Normalizer.normalize(ch,Normalizer.NFC); 1544 if(!ns.equals(Utility.unescape(exp))){ 1545 errln("error in Normalizer.normalize(int,Mode)"); 1546 } 1547 ns = Normalizer.normalize(ch,Normalizer.NFC,0); 1548 if(!ns.equals(Utility.unescape(exp))){ 1549 errln("error in Normalizer.normalize(int,Mode,int)"); 1550 } 1551 }catch(Exception e){ 1552 throw e; 1553 } 1554 } 1555 1556 @Test 1557 public void TestConcatenate() { 1558 1559 Object[][]cases=new Object[][]{ 1560 /* 
mode, left, right, result */ 1561 { 1562 Normalizer.NFC, 1563 "re", 1564 "\u0301sum\u00e9", 1565 "r\u00e9sum\u00e9" 1566 }, 1567 { 1568 Normalizer.NFC, 1569 "a\u1100", 1570 "\u1161bcdefghijk", 1571 "a\uac00bcdefghijk" 1572 }, 1573 /* ### TODO: add more interesting cases */ 1574 { 1575 Normalizer.NFD, 1576 "\u03B1\u0345", 1577 "\u0C4D\uD804\uDCBA\uD834\uDD69", // 0C4D 110BA 1D169 1578 "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345" // 03B1 1D169 110BA 0C4D 0345 1579 } 1580 }; 1581 1582 String left, right, expect, result; 1583 Normalizer.Mode mode; 1584 int i; 1585 1586 /* test concatenation */ 1587 for(i=0; i<cases.length; ++i) { 1588 mode = (Normalizer.Mode)cases[i][0]; 1589 1590 left=(String)cases[i][1]; 1591 right=(String)cases[i][2]; 1592 expect=(String)cases[i][3]; 1593 { 1594 result=Normalizer.concatenate(left, right, mode,0); 1595 if(!result.equals(expect)) { 1596 errln("error in Normalizer.concatenate(), cases[] failed" 1597 +", result==expect: expected: " 1598 + hex(expect)+" =========> got: " + hex(result)); 1599 } 1600 } 1601 { 1602 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0); 1603 if(!result.equals(expect)) { 1604 errln("error in Normalizer.concatenate(), cases[] failed" 1605 +", result==expect: expected: " 1606 + hex(expect)+" =========> got: " + hex(result)); 1607 } 1608 } 1609 } 1610 1611 mode= Normalizer.NFC; // (Normalizer.Mode)cases2[0][0]; 1612 char[] destination = "My resume is here".toCharArray(); 1613 left = "resume"; 1614 right = "re\u0301sum\u00e9 is HERE"; 1615 expect = "My r\u00e9sum\u00e9 is HERE"; 1616 1617 // Concatenates 're' with '\u0301sum\u00e9 is HERE' and places the result at 1618 // position 3 of string 'My resume is here'. 
1619 Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, 1620 destination, 3, 17, mode, 0); 1621 if(!String.valueOf(destination).equals(expect)) { 1622 errln("error in Normalizer.concatenate(), cases2[] failed" 1623 +", result==expect: expected: " 1624 + hex(expect) + " =========> got: " + hex(destination)); 1625 } 1626 1627 // Error case when result of concatenation won't fit into destination array. 1628 try { 1629 Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, 1630 destination, 3, 16, mode, 0); 1631 } catch (IndexOutOfBoundsException e) { 1632 assertTrue("Normalizer.concatenate() failed", e.getMessage().equals("14")); 1633 return; 1634 } 1635 fail("Normalizer.concatenate() tested for failure but passed"); 1636 } 1637 1638 private final int RAND_MAX = 0x7fff; 1639 1640 @Test 1641 public void TestCheckFCD() 1642 { 1643 char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 1644 0x0008, 0x0009, 0x000A}; 1645 1646 char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301, 1647 0x02B9, 0x0314, 0x0315, 0x0316}; 1648 1649 char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7, 1650 0x0050, 0x0730, 0x09EE, 0x1E10}; 1651 1652 char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0}, 1653 {0x0061, 0x030A, 0x00E2, 0x0323, 0}, 1654 {0x0061, 0x0323, 0x00E2, 0x0323, 0}, 1655 {0x0061, 0x0323, 0x1E05, 0x0302, 0} 1656 }; 1657 Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES}; 1658 1659 char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 1660 0x6a, 1661 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 1662 0xea, 1663 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 1664 0x0307, 0x0308, 0x0309, 0x030a, 1665 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 1666 0x0327, 0x0328, 0x0329, 0x032a, 1667 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06, 1668 0x1e07, 0x1e08, 0x1e09, 0x1e0a 1669 }; 1670 
1671 int count = 0; 1672 1673 if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES) 1674 errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n"); 1675 if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO) 1676 errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n"); 1677 if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES) 1678 errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n"); 1679 1680 1681 while (count < 4) 1682 { 1683 Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0); 1684 if (result[count] != fcdresult) { 1685 errln("Normalizer.quickCheck(FCD) failed: Data set "+ count 1686 + " expected value "+ result[count]); 1687 } 1688 count ++; 1689 } 1690 1691 /* random checks of long strings */ 1692 //srand((unsigned)time( NULL )); 1693 Random rand = createRandom(); // use test framework's random 1694 1695 for (count = 0; count < 50; count ++) 1696 { 1697 int size = 0; 1698 Normalizer.QuickCheckResult testresult = Normalizer.YES; 1699 char[] data= new char[20]; 1700 char[] norm= new char[100]; 1701 char[] nfd = new char[100]; 1702 int normStart = 0; 1703 int nfdsize = 0; 1704 while (size != 19) { 1705 data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX]; 1706 logln("0x"+data[size]); 1707 normStart += Normalizer.normalize(data,size,size+1, 1708 norm,normStart,100, 1709 Normalizer.NFD,0); 1710 size ++; 1711 } 1712 logln("\n"); 1713 1714 nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0); 1715 // nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL, 1716 // nfd, 100, &status); 1717 if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) { 1718 testresult = Normalizer.NO; 
1719 } 1720 if (testresult == Normalizer.YES) { 1721 logln("result Normalizer.YES\n"); 1722 } 1723 else { 1724 logln("result Normalizer.NO\n"); 1725 } 1726 1727 if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) { 1728 errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) ); 1729 } 1730 } 1731 } 1732 1733 1734 // reference implementation of Normalizer::compare 1735 private int ref_norm_compare(String s1, String s2, int options) { 1736 String t1, t2,r1,r2; 1737 1738 int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; 1739 1740 if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) { 1741 // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 1742 r1 = Normalizer.decompose(s1,false,normOptions); 1743 r2 = Normalizer.decompose(s2,false,normOptions); 1744 r1 = UCharacter.foldCase(r1,options); 1745 r2 = UCharacter.foldCase(r2,options); 1746 }else{ 1747 r1 = s1; 1748 r2 = s2; 1749 } 1750 1751 t1 = Normalizer.decompose(r1, false, normOptions); 1752 t2 = Normalizer.decompose(r2, false, normOptions); 1753 1754 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1755 UTF16.StringComparator comp 1756 = new UTF16.StringComparator(true, false, 1757 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1758 return comp.compare(t1,t2); 1759 } else { 1760 return t1.compareTo(t2); 1761 } 1762 1763 } 1764 1765 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately 1766 private int norm_compare(String s1, String s2, int options) { 1767 int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; 1768 1769 if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) && 1770 Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) { 1771 options|=Normalizer.INPUT_IS_FCD; 1772 } 1773 1774 int cmpStrings = Normalizer.compare(s1, s2, options); 1775 int cmpArrays = Normalizer.compare( 1776 s1.toCharArray(), 0, s1.length(), 1777 s2.toCharArray(), 0, s2.length(), 
options); 1778 assertEquals("compare strings == compare char arrays", cmpStrings, cmpArrays); 1779 return cmpStrings; 1780 } 1781 1782 // reference implementation of UnicodeString::caseCompare 1783 private int ref_case_compare(String s1, String s2, int options) { 1784 String t1, t2; 1785 1786 t1=s1; 1787 t2=s2; 1788 1789 t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1790 t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1791 1792 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1793 UTF16.StringComparator comp 1794 = new UTF16.StringComparator(true, false, 1795 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1796 return comp.compare(t1,t2); 1797 } else { 1798 return t1.compareTo(t2); 1799 } 1800 1801 } 1802 1803 // reduce an integer to -1/0/1 1804 private static int sign(int value) { 1805 if(value==0) { 1806 return 0; 1807 } else { 1808 return (value>>31)|1; 1809 } 1810 } 1811 private static String signString(int value) { 1812 if(value<0) { 1813 return "<0"; 1814 } else if(value==0) { 1815 return "=0"; 1816 } else /* value>0 */ { 1817 return ">0"; 1818 } 1819 } 1820 // test Normalizer::compare and unorm_compare (thinly wrapped by the former) 1821 // by comparing it with its semantic equivalent 1822 // since we trust the pieces, this is sufficient 1823 1824 // test each string with itself and each other 1825 // each time with all options 1826 private String strings[]=new String[]{ 1827 // some cases from NormalizationTest.txt 1828 // 0..3 1829 "D\u031B\u0307\u0323", 1830 "\u1E0C\u031B\u0307", 1831 "D\u031B\u0323\u0307", 1832 "d\u031B\u0323\u0307", 1833 1834 // 4..6 1835 "\u00E4", 1836 "a\u0308", 1837 "A\u0308", 1838 1839 // Angstrom sign = A ring 1840 // 7..10 1841 "\u212B", 1842 "\u00C5", 1843 "A\u030A", 1844 "a\u030A", 1845 1846 // 11.14 1847 "a\u059A\u0316\u302A\u032Fb", 1848 "a\u302A\u0316\u032F\u059Ab", 1849 "a\u302A\u0316\u032F\u059Ab", 1850 "A\u059A\u0316\u302A\u032Fb", 1851 1852 // from 
ICU case folding tests 1853 // 15..20 1854 "A\u00df\u00b5\ufb03\\U0001040c\u0131", 1855 "ass\u03bcffi\\U00010434i", 1856 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff", 1857 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff", 1858 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff", 1859 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd", 1860 1861 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold 1862 // vs. U+10000 at bottom - code point order 1863 // 21..22 1864 "\ud800\ud800\udc01", 1865 "\ud800\udc00", 1866 1867 // other code point order tests from ustrtest.cpp 1868 // 23..31 1869 "\u20ac\ud801", 1870 "\u20ac\ud800\udc00", 1871 "\ud800", 1872 "\ud800\uff61", 1873 "\udfff", 1874 "\uff61\udfff", 1875 "\uff61\ud800\udc02", 1876 "\ud800\udc02", 1877 "\ud84d\udc56", 1878 1879 // long strings, see cnormtst.c/TestNormCoverage() 1880 // equivalent if case-insensitive 1881 // 32..33 1882 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1883 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1884 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1885 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1886 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1887 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1888 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1889 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1890 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1891 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1892 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1893 "d\u031B\u0307\u0323", 1894 1895 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1896 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1897 
"\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1898 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1899 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1900 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1901 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1902 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1903 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1904 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1905 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1906 "\u1E0C\u031B\u0307", 1907 1908 // some strings that may make a difference whether the compare function 1909 // case-folds or decomposes first 1910 // 34..41 1911 "\u0360\u0345\u0334", 1912 "\u0360\u03b9\u0334", 1913 1914 "\u0360\u1f80\u0334", 1915 "\u0360\u03b1\u0313\u03b9\u0334", 1916 1917 "\u0360\u1ffc\u0334", 1918 "\u0360\u03c9\u03b9\u0334", 1919 1920 "a\u0360\u0345\u0360\u0345b", 1921 "a\u0345\u0360\u0345\u0360b", 1922 1923 // interesting cases for canonical caseless match with turkic i handling 1924 // 42..43 1925 "\u00cc", 1926 "\u0069\u0300", 1927 1928 // strings with post-Unicode 3.2 normalization or normalization corrections 1929 // 44..45 1930 "\u00e4\u193b\\U0002f868", 1931 "\u0061\u193b\u0308\u36fc", 1932 1933 1934 }; 1935 1936 // all combinations of options 1937 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions 1938 final class Temp { 1939 int options; 1940 String name; 1941 public Temp(int opt,String str){ 1942 options =opt; 1943 name = str; 1944 } 1945 1946 } 1947 // set UNORM_UNICODE_3_2 in one additional combination 1948 1949 private Temp[] opt = new Temp[]{ 1950 new Temp(0,"default"), 1951 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ), 1952 new 
Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ), 1953 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ), 1954 new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"), 1955 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"), 1956 new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2") 1957 }; 1958 1959 1960 @Test 1961 public void TestCompareDebug(){ 1962 1963 String[] s = new String[100]; // at least as many items as in strings[] ! 1964 1965 1966 int i, j, k, count=strings.length; 1967 int result, refResult; 1968 1969 // create the UnicodeStrings 1970 for(i=0; i<count; ++i) { 1971 s[i]=Utility.unescape(strings[i]); 1972 } 1973 UTF16.StringComparator comp = new UTF16.StringComparator(true, false, 1974 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1975 // test them each with each other 1976 1977 i = 42; 1978 j = 43; 1979 k = 2; 1980 // test Normalizer::compare 1981 result=norm_compare(s[i], s[j], opt[k].options); 1982 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 1983 if(sign(result)!=sign(refResult)) { 1984 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 1985 } 1986 1987 // test UnicodeString::caseCompare - same internal implementation function 1988 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 1989 // result=s[i]. 
(s[j], opt[k].options); 1990 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 1991 { 1992 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 1993 } 1994 else { 1995 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 1996 } 1997 1998 result=comp.compare(s[i],s[j]); 1999 refResult=ref_case_compare(s[i], s[j], opt[k].options); 2000 if(sign(result)!=sign(refResult)) { 2001 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2002 } 2003 } 2004 String value1 = "\u00dater\u00fd"; 2005 String value2 = "\u00fater\u00fd"; 2006 if(Normalizer.compare(value1,value2,0)!=0){ 2007 if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){ 2008 2009 } 2010 } 2011 } 2012 2013 @Test 2014 public void TestCompare() { 2015 2016 String[] s = new String[100]; // at least as many items as in strings[] ! 2017 2018 int i, j, k, count=strings.length; 2019 int result, refResult; 2020 2021 // create the UnicodeStrings 2022 for(i=0; i<count; ++i) { 2023 s[i]=Utility.unescape(strings[i]); 2024 } 2025 UTF16.StringComparator comp = new UTF16.StringComparator(); 2026 // test them each with each other 2027 for(i=0; i<count; ++i) { 2028 for(j=i; j<count; ++j) { 2029 for(k=0; k<opt.length; ++k) { 2030 // test Normalizer::compare 2031 result=norm_compare(s[i], s[j], opt[k].options); 2032 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 2033 if(sign(result)!=sign(refResult)) { 2034 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2035 } 2036 2037 // test UnicodeString::caseCompare - same internal implementation function 2038 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 2039 // result=s[i]. 
(s[j], opt[k].options); 2040 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 2041 { 2042 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 2043 } 2044 else { 2045 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2046 } 2047 2048 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 2049 // result=comp.caseCompare(s[i],s[j], opt[k].options); 2050 result=comp.compare(s[i],s[j]); 2051 refResult=ref_case_compare(s[i], s[j], opt[k].options); 2052 if(sign(result)!=sign(refResult)) { 2053 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2054 } 2055 } 2056 } 2057 } 2058 } 2059 2060 // test cases with i and I to make sure Turkic works 2061 char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 }; 2062 UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet(); 2063 Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl; 2064 nfcImpl.ensureCanonIterData(); 2065 2066 String s1, s2; 2067 2068 // collect all sets into one for contiguous output 2069 for(i=0; i<iI.length; ++i) { 2070 if(nfcImpl.getCanonStartSet(iI[i], iSet)) { 2071 set.addAll(iSet); 2072 } 2073 } 2074 2075 // test all of these precomposed characters 2076 Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance(); 2077 UnicodeSetIterator it = new UnicodeSetIterator(set); 2078 int c; 2079 while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) { 2080 s1 = UTF16.valueOf(c); 2081 s2 = nfcNorm2.getDecomposition(c); 2082 for(k=0; k<opt.length; ++k) { 2083 // test Normalizer::compare 2084 2085 result= norm_compare(s1, s2, opt[k].options); 2086 refResult=ref_norm_compare(s1, s2, opt[k].options); 2087 if(sign(result)!=sign(refResult)) { 2088 errln("Normalizer.compare(U+"+hex(c)+" with its NFD, "+opt[k].name+")" 2089 + signString(result)+" should be "+signString(refResult)); 2090 } 2091 2092 // test UnicodeString::caseCompare - same internal 
implementation function 2093 if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) { 2094 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 2095 { 2096 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 2097 } 2098 else { 2099 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2100 } 2101 2102 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 2103 2104 result=comp.compare(s1,s2); 2105 refResult=ref_case_compare(s1, s2, opt[k].options); 2106 if(sign(result)!=sign(refResult)) { 2107 errln("UTF16.compare(U+"+hex(c)+" with its NFD, " 2108 +opt[k].name+")"+signString(result) +" should be "+signString(refResult)); 2109 } 2110 } 2111 } 2112 } 2113 2114 // test getDecomposition() for some characters that do not decompose 2115 if( nfcNorm2.getDecomposition(0x20)!=null || 2116 nfcNorm2.getDecomposition(0x4e00)!=null || 2117 nfcNorm2.getDecomposition(0x20002)!=null 2118 ) { 2119 errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions"); 2120 } 2121 2122 // test getRawDecomposition() for some characters that do not decompose 2123 if( nfcNorm2.getRawDecomposition(0x20)!=null || 2124 nfcNorm2.getRawDecomposition(0x4e00)!=null || 2125 nfcNorm2.getRawDecomposition(0x20002)!=null 2126 ) { 2127 errln("getRawDecomposition() returns TRUE for characters which do not have decompositions"); 2128 } 2129 2130 // test composePair() for some pairs of characters that do not compose 2131 if( nfcNorm2.composePair(0x20, 0x301)>=0 || 2132 nfcNorm2.composePair(0x61, 0x305)>=0 || 2133 nfcNorm2.composePair(0x1100, 0x1160)>=0 || 2134 nfcNorm2.composePair(0xac00, 0x11a7)>=0 2135 ) { 2136 errln("NFC.composePair() incorrectly composes some pairs of characters"); 2137 } 2138 2139 // test FilteredNormalizer2.getDecomposition() 2140 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]"); 2141 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2142 
if(fn2.getDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getDecomposition(0x100))) { 2143 errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed"); 2144 } 2145 2146 // test FilteredNormalizer2.getRawDecomposition() 2147 if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) { 2148 errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed"); 2149 } 2150 2151 // test FilteredNormalizer2::composePair() 2152 if( 0x100!=fn2.composePair(0x41, 0x304) || 2153 fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08 2154 ) { 2155 errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed"); 2156 } 2157 } 2158 2159 // verify that case-folding does not un-FCD strings 2160 int countFoldFCDExceptions(int foldingOptions) { 2161 String s, d; 2162 int c; 2163 int count; 2164 int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC; 2165 Normalizer.QuickCheckResult qcResult; 2166 int category; 2167 boolean isNFD; 2168 2169 2170 logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions)); 2171 2172 count=0; 2173 for(c=0; c<=0x10ffff; ++c) { 2174 category=UCharacter.getType(c); 2175 if(category==UCharacterCategory.UNASSIGNED) { 2176 continue; // skip unassigned code points 2177 } 2178 if(c==0xac00) { 2179 c=0xd7a3; // skip Hangul - no case folding there 2180 continue; 2181 } 2182 // skip Han blocks - no case folding there either 2183 if(c==0x3400) { 2184 c=0x4db5; 2185 continue; 2186 } 2187 if(c==0x4e00) { 2188 c=0x9fa5; 2189 continue; 2190 } 2191 if(c==0x20000) { 2192 c=0x2a6d6; 2193 continue; 2194 } 2195 2196 s= UTF16.valueOf(c); 2197 2198 // get leading and trailing cc for c 2199 d= Normalizer.decompose(s,false); 2200 isNFD= s==d; 2201 cc=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2202 trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2203 2204 // get leading and trailing cc for the case-folding of c 2205 UCharacter.foldCase(s,(foldingOptions==0)); 2206 d = 
Normalizer.decompose(s, false); 2207 foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2208 foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2209 2210 qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0); 2211 2212 2213 // bad: 2214 // - character maps to empty string: adjacent characters may then need reordering 2215 // - folding has different leading/trailing cc's, and they don't become just 0 2216 // - folding itself is not FCD 2217 if( qcResult!=Normalizer.YES || 2218 s.length()==0 || 2219 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0) 2220 ) { 2221 ++count; 2222 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2223 //errln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult); 2224 continue; 2225 } 2226 2227 // also bad: 2228 // if a code point is in NFD but its case folding is not, then 2229 // unorm_compare will also fail 2230 if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) { 2231 ++count; 2232 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2233 } 2234 } 2235 2236 logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" ); 2237 return count; 2238 } 2239 2240 @Test 2241 public void TestFindFoldFCDExceptions() { 2242 int count; 2243 2244 count=countFoldFCDExceptions(0); 2245 count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I); 2246 if(count>0) { 2247 //* 2248 //* If case-folding un-FCDs any strings, then unorm_compare() must be 2249 //* re-implemented. 2250 //* It currently assumes that one can check for FCD then case-fold 2251 //* and then still have FCD strings for raw decomposition without reordering. 
2252 //* 2253 errln("error: There are "+count+" code points for which case-folding"+ 2254 " may un-FCD a string for all folding options.\n See comment"+ 2255 " in BasicNormalizerTest::FindFoldFCDExceptions()!"); 2256 } 2257 } 2258 2259 @Test 2260 public void TestCombiningMarks(){ 2261 String src = "\u0f71\u0f72\u0f73\u0f74\u0f75"; 2262 String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74"; 2263 String result = Normalizer.decompose(src,false); 2264 if(!expected.equals(result)){ 2265 errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result)); 2266 } 2267 } 2268 2269 /* 2270 * Re-enable this test when UTC fixes UAX 21 2271 @Test 2272 public void TestUAX21Failure(){ 2273 final String[][] cases = new String[][]{ 2274 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2275 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2276 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2277 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2278 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"}, 2279 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"}, 2280 }; 2281 for(int i = 0; i< cases.length; i++){ 2282 String s1 =cases[0][0]; 2283 String s2 = cases[0][1]; 2284 if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare 2285 && 2286 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){ 2287 errln("Normalizer.compare() failed for s1: " 2288 + Utility.hex(s1) +" s2: " + Utility.hex(s2)); 2289 } 2290 } 2291 } 2292 */ 2293 @Test 2294 public void TestFCNFKCClosure() { 2295 final class TestStruct{ 2296 int c; 2297 String s; 2298 TestStruct(int cp, String src){ 2299 c=cp; 2300 s=src; 2301 } 2302 } 2303 2304 TestStruct[] tests= new TestStruct[]{ 2305 new TestStruct( 0x00C4, "" ), 2306 new TestStruct( 0x00E4, "" ), 2307 new 
TestStruct( 0x037A, "\u0020\u03B9" ), 2308 new TestStruct( 0x03D2, "\u03C5" ), 2309 new TestStruct( 0x20A8, "\u0072\u0073" ) , 2310 new TestStruct( 0x210B, "\u0068" ), 2311 new TestStruct( 0x210C, "\u0068" ), 2312 new TestStruct( 0x2121, "\u0074\u0065\u006C" ), 2313 new TestStruct( 0x2122, "\u0074\u006D" ), 2314 new TestStruct( 0x2128, "\u007A" ), 2315 new TestStruct( 0x1D5DB,"\u0068" ), 2316 new TestStruct( 0x1D5ED,"\u007A" ), 2317 new TestStruct( 0x0061, "" ) 2318 }; 2319 2320 2321 for(int i = 0; i < tests.length; ++ i) { 2322 String result=Normalizer.getFC_NFKC_Closure(tests[i].c); 2323 if(!result.equals(new String(tests[i].s))) { 2324 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong"); 2325 } 2326 } 2327 2328 /* error handling */ 2329 2330 int length=Normalizer.getFC_NFKC_Closure(0x5c, null); 2331 if(length!=0){ 2332 errln("getFC_NFKC_Closure did not perform error handling correctly"); 2333 } 2334 } 2335 @Test 2336 public void TestBugJ2324(){ 2337 /* String[] input = new String[]{ 2338 //"\u30FD\u3099", 2339 "\u30FA\u309A", 2340 "\u30FB\u309A", 2341 "\u30FC\u309A", 2342 "\u30FE\u309A", 2343 "\u30FD\u309A", 2344 2345 };*/ 2346 String troublesome = "\u309A"; 2347 for(int i=0x3000; i<0x3100;i++){ 2348 String input = ((char)i)+troublesome; 2349 try{ 2350 /* String result =*/ Normalizer.compose(input,false); 2351 }catch(IndexOutOfBoundsException e){ 2352 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString()); 2353 } 2354 } 2355 2356 } 2357 2358 static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5; 2359 2360 private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) { 2361 skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false); 2362 skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2363 skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false); 2364 skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2365 2366 // Remove from the NFC 
and NFKC sets all those characters that change 2367 // when a back-combining character is added. 2368 // First, get all of the back-combining characters and their combining classes. 2369 UnicodeSet combineBack=new UnicodeSet("[:NFC_QC=Maybe:]"); 2370 int numCombineBack=combineBack.size(); 2371 int[] combineBackCharsAndCc=new int[numCombineBack*2]; 2372 UnicodeSetIterator iter=new UnicodeSetIterator(combineBack); 2373 for(int i=0; i<numCombineBack; ++i) { 2374 iter.next(); 2375 int c=iter.codepoint; 2376 combineBackCharsAndCc[2*i]=c; 2377 combineBackCharsAndCc[2*i+1]=UCharacter.getCombiningClass(c); 2378 } 2379 2380 // We need not look at control codes, Han characters nor Hangul LVT syllables because they 2381 // do not combine forward. LV syllables are already removed. 2382 UnicodeSet notInteresting=new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]"); 2383 UnicodeSet unsure=((UnicodeSet)(skipSets[C].clone())).removeAll(notInteresting); 2384 // System.out.format("unsure.size()=%d\n", unsure.size()); 2385 2386 // For each character about which we are unsure, see if it changes when we add 2387 // one of the back-combining characters. 2388 Normalizer2 norm2=Normalizer2.getNFCInstance(); 2389 StringBuilder s=new StringBuilder(); 2390 iter.reset(unsure); 2391 while(iter.next()) { 2392 int c=iter.codepoint; 2393 s.delete(0, 0x7fffffff).appendCodePoint(c); 2394 int cLength=s.length(); 2395 int tccc=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 2396 for(int i=0; i<numCombineBack; ++i) { 2397 // If c's decomposition ends with a character with non-zero combining class, then 2398 // c can only change if it combines with a character with a non-zero combining class. 
2399 int cc2=combineBackCharsAndCc[2*i+1]; 2400 if(tccc==0 || cc2!=0) { 2401 int c2=combineBackCharsAndCc[2*i]; 2402 s.appendCodePoint(c2); 2403 if(!norm2.isNormalized(s)) { 2404 // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2); 2405 skipSets[C].remove(c); 2406 skipSets[KC].remove(c); 2407 break; 2408 } 2409 s.delete(cLength, 0x7fffffff); 2410 } 2411 } 2412 } 2413 return skipSets; 2414 } 2415 2416 @Test 2417 public void TestSkippable() { 2418 UnicodeSet[] skipSets = new UnicodeSet[] { 2419 new UnicodeSet(), //NFD 2420 new UnicodeSet(), //NFC 2421 new UnicodeSet(), //NFKD 2422 new UnicodeSet() //NFKC 2423 }; 2424 UnicodeSet[] expectSets = new UnicodeSet[] { 2425 new UnicodeSet(), 2426 new UnicodeSet(), 2427 new UnicodeSet(), 2428 new UnicodeSet() 2429 }; 2430 StringBuilder s, pattern; 2431 2432 // build NF*Skippable sets from runtime data 2433 skipSets[D].applyPattern("[:NFD_Inert:]"); 2434 skipSets[C].applyPattern("[:NFC_Inert:]"); 2435 skipSets[KD].applyPattern("[:NFKD_Inert:]"); 2436 skipSets[KC].applyPattern("[:NFKC_Inert:]"); 2437 2438 expectSets = initSkippables(expectSets); 2439 if(expectSets[D].contains(0x0350)){ 2440 errln("expectSets[D] contains 0x0350"); 2441 } 2442 for(int i=0; i<expectSets.length; ++i) { 2443 if(!skipSets[i].equals(expectSets[i])) { 2444 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"); 2445 // Note: This used to depend on hardcoded UnicodeSet patterns generated by 2446 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by 2447 // running com.ibm.text.UCD.Main with the option NFSkippable. 2448 // Since ICU 4.6/Unicode 6, we are generating the 2449 // expectSets ourselves in initSkippables(). 
2450 2451 s=new StringBuilder(); 2452 2453 s.append("\n\nskip= "); 2454 s.append(skipSets[i].toPattern(true)); 2455 s.append("\n\n"); 2456 2457 s.append("skip-expect="); 2458 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true)); 2459 s.append(pattern); 2460 2461 pattern.delete(0,pattern.length()); 2462 s.append("\n\nexpect-skip="); 2463 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true)); 2464 s.append(pattern); 2465 s.append("\n\n"); 2466 2467 pattern.delete(0,pattern.length()); 2468 s.append("\n\nintersection(expect,skip)="); 2469 UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]); 2470 pattern = new StringBuilder(intersection.toPattern(true)); 2471 s.append(pattern); 2472 // Special: test coverage for append(char). 2473 s.append('\n'); 2474 s.append('\n'); 2475 2476 errln(s.toString()); 2477 } 2478 } 2479 } 2480 2481 @Test 2482 public void TestBugJ2068(){ 2483 String sample = "The quick brown fox jumped over the lazy dog"; 2484 UCharacterIterator text = UCharacterIterator.getInstance(sample); 2485 Normalizer norm = new Normalizer(text,Normalizer.NFC,0); 2486 text.setIndex(4); 2487 if(text.current() == norm.current()){ 2488 errln("Normalizer is not cloning the UCharacterIterator"); 2489 } 2490 } 2491 @Test 2492 public void TestGetCombiningClass(){ 2493 for(int i=0;i<0x10FFFF;i++){ 2494 int cc = UCharacter.getCombiningClass(i); 2495 if(0xD800<= i && i<=0xDFFF && cc >0 ){ 2496 cc = UCharacter.getCombiningClass(i); 2497 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8)); 2498 } 2499 } 2500 } 2501 2502 @Test 2503 public void TestSerializedSet(){ 2504 USerializedSet sset=new USerializedSet(); 2505 UnicodeSet set = new UnicodeSet(); 2506 int start, end; 2507 2508 char[] serialized = { 2509 0x8007, // length 2510 3, // bmpLength 2511 0xc0, 0xfe, 0xfffc, 2512 1, 9, 0x10, 0xfffc 2513 }; 2514 sset.getSet(serialized, 0); 2515 2516 
// collect all sets into one for contiguous output 2517 int[] startEnd = new int[2]; 2518 int count=sset.countRanges(); 2519 for(int j=0; j<count; ++j) { 2520 sset.getRange(j, startEnd); 2521 set.add(startEnd[0], startEnd[1]); 2522 } 2523 2524 // test all of these characters 2525 UnicodeSetIterator it = new UnicodeSetIterator(set); 2526 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) { 2527 start=it.codepoint; 2528 end=it.codepointEnd; 2529 while(start<=end) { 2530 if(!sset.contains(start)){ 2531 errln("USerializedSet.contains failed for "+Utility.hex(start,8)); 2532 } 2533 ++start; 2534 } 2535 } 2536 } 2537 2538 @Test 2539 public void TestReturnFailure(){ 2540 char[] term = {'r','\u00e9','s','u','m','\u00e9' }; 2541 char[] decomposed_term = new char[10 + term.length + 2]; 2542 int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0); 2543 int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0); 2544 if(rc!=rc1){ 2545 errln("Normalizer decompose did not return correct length"); 2546 } 2547 } 2548 2549 private final static class TestCompositionCase { 2550 public Normalizer.Mode mode; 2551 public int options; 2552 public String input, expect; 2553 TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) { 2554 this.mode=mode; 2555 this.options=options; 2556 this.input=input; 2557 this.expect=expect; 2558 } 2559 } 2560 2561 @Test 2562 public void TestComposition() { 2563 final TestCompositionCase cases[]=new TestCompositionCase[]{ 2564 /* 2565 * special cases for UAX #15 bug 2566 * see Unicode Corrigendum #5: Normalization Idempotency 2567 * at http://unicode.org/versions/corrigendum5.html 2568 * (was Public Review Issue #29) 2569 */ 2570 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"), 2571 new TestCompositionCase(Normalizer.NFC, 0, 
"\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"), 2572 new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"), 2573 new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"), 2574 2575 /* TODO: add test cases for UNORM_FCC here (j2151) */ 2576 }; 2577 2578 String output; 2579 int i; 2580 2581 for(i=0; i<cases.length; ++i) { 2582 output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options); 2583 if(!output.equals(cases[i].expect)) { 2584 errln("unexpected result for case "+i); 2585 } 2586 } 2587 } 2588 2589 @Test 2590 public void TestGetDecomposition() { 2591 Normalizer2 n2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2592 String decomp=n2.getDecomposition(0x20); 2593 assertEquals("fcc.getDecomposition(space) failed", null, decomp); 2594 decomp=n2.getDecomposition(0xe4); 2595 assertEquals("fcc.getDecomposition(a-umlaut) failed", "a\u0308", decomp); 2596 decomp=n2.getDecomposition(0xac01); 2597 assertEquals("fcc.getDecomposition(Hangul syllable U+AC01) failed", "\u1100\u1161\u11a8", decomp); 2598 } 2599 2600 @Test 2601 public void TestGetRawDecomposition() { 2602 Normalizer2 n2=Normalizer2.getNFKCInstance(); 2603 /* 2604 * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values, 2605 * without recursive decomposition. 
2606 */ 2607 2608 String decomp=n2.getRawDecomposition(0x20); 2609 assertEquals("nfkc.getRawDecomposition(space) failed", null, decomp); 2610 decomp=n2.getRawDecomposition(0xe4); 2611 assertEquals("nfkc.getRawDecomposition(a-umlaut) failed", "a\u0308", decomp); 2612 /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */ 2613 decomp=n2.getRawDecomposition(0x1e08); 2614 assertEquals("nfkc.getRawDecomposition(c-cedilla-acute) failed", "\u00c7\u0301", decomp); 2615 /* U+212B ANGSTROM SIGN */ 2616 decomp=n2.getRawDecomposition(0x212b); 2617 assertEquals("nfkc.getRawDecomposition(angstrom sign) failed", "\u00c5", decomp); 2618 decomp=n2.getRawDecomposition(0xac00); 2619 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC00) failed", "\u1100\u1161", decomp); 2620 /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */ 2621 decomp=n2.getRawDecomposition(0xac01); 2622 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC01) failed", "\uac00\u11a8", decomp); 2623 } 2624 2625 @Test 2626 public void TestCustomComp() { 2627 String [][] pairs={ 2628 { "\\uD801\\uE000\\uDFFE", "" }, 2629 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2630 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2631 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" }, 2632 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2633 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2634 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2635 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2636 }; 2637 Normalizer2 customNorm2; 2638 customNorm2= 2639 Normalizer2.getInstance( 2640 BasicTest.class.getResourceAsStream("/android/icu/dev/data/testdata/testnorm.nrm"), 2641 "testnorm", 2642 Normalizer2.Mode.COMPOSE); 2643 for(int i=0; i<pairs.length; ++i) { 2644 String[] pair=pairs[i]; 2645 String input=Utility.unescape(pair[0]); 2646 String 
expected=Utility.unescape(pair[1]); 2647 String result=customNorm2.normalize(input); 2648 if(!result.equals(expected)) { 2649 errln("custom compose Normalizer2 did not normalize input "+i+" as expected"); 2650 } 2651 } 2652 } 2653 2654 @Test 2655 public void TestCustomFCC() { 2656 String[][] pairs={ 2657 { "\\uD801\\uE000\\uDFFE", "" }, 2658 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2659 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2660 // The following expected result is different from CustomComp 2661 // because of only-contiguous composition. 2662 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" }, 2663 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2664 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2665 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2666 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2667 }; 2668 Normalizer2 customNorm2; 2669 customNorm2= 2670 Normalizer2.getInstance( 2671 BasicTest.class.getResourceAsStream("/android/icu/dev/data/testdata/testnorm.nrm"), 2672 "testnorm", 2673 Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2674 for(int i=0; i<pairs.length; ++i) { 2675 String[] pair=pairs[i]; 2676 String input=Utility.unescape(pair[0]); 2677 String expected=Utility.unescape(pair[1]); 2678 String result=customNorm2.normalize(input); 2679 if(!result.equals(expected)) { 2680 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected"); 2681 } 2682 } 2683 } 2684 2685 @Test 2686 public void TestCanonIterData() { 2687 // For now, just a regression test. 2688 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData(); 2689 // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character 2690 // in some decomposition mappings where there is a composition exclusion. 
2691 // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0) 2692 // but it is not a segment starter because it occurs in a decomposition mapping. 2693 if(impl.isCanonSegmentStarter(0xfb5)) { 2694 errln("isCanonSegmentStarter(U+0fb5)=true is wrong"); 2695 } 2696 // For [:Segment_Starter:] to work right, not just the property function has to work right, 2697 // UnicodeSet also needs a correct range starts set. 2698 UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze(); 2699 if(segStarters.contains(0xfb5)) { 2700 errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong"); 2701 } 2702 // Try characters up to Kana and miscellaneous CJK but below Han (for expediency). 2703 for(int c=0; c<=0x33ff; ++c) { 2704 boolean isStarter=impl.isCanonSegmentStarter(c); 2705 boolean isContained=segStarters.contains(c); 2706 if(isStarter!=isContained) { 2707 errln(String.format( 2708 "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " + 2709 "[:Segment_Starter:].contains(same)", 2710 c, isStarter)); 2711 } 2712 } 2713 } 2714 2715 @Test 2716 public void TestFilteredNormalizer2() { 2717 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2718 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2719 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2720 int c; 2721 for(c=0; c<=0x3ff; ++c) { 2722 int expectedCC= filter.contains(c) ? nfcNorm2.getCombiningClass(c) : 0; 2723 int cc=fn2.getCombiningClass(c); 2724 assertEquals( 2725 "FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+"+hex(c)+ 2726 ")==filtered NFC.getCC()", 2727 expectedCC, cc); 2728 } 2729 2730 // More coverage. 
2731 StringBuilder sb=new StringBuilder(); 2732 assertEquals("filtered normalize()", "ää\u0304", 2733 fn2.normalize("a\u0308ä\u0304", (Appendable)sb).toString()); 2734 assertTrue("filtered hasBoundaryAfter()", fn2.hasBoundaryAfter('ä')); 2735 assertTrue("filtered isInert()", fn2.isInert(0x0313)); 2736 } 2737 2738 @Test 2739 public void TestFilteredAppend() { 2740 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2741 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2742 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2743 2744 // Append two strings that each contain a character outside the filter set. 2745 StringBuilder sb = new StringBuilder("a\u0313a"); 2746 String second = "\u0301\u0313"; 2747 assertEquals("append()", "a\u0313á\u0313", fn2.append(sb, second).toString()); 2748 2749 // Same, and also normalize the second string. 2750 sb.replace(0, 0x7fffffff, "a\u0313a"); 2751 assertEquals( 2752 "normalizeSecondAndAppend()", 2753 "a\u0313á\u0313", fn2.normalizeSecondAndAppend(sb, second).toString()); 2754 2755 // Normalizer2.normalize(String) uses spanQuickCheckYes() and normalizeSecondAndAppend(). 
2756 assertEquals("normalize()", "a\u0313á\u0313", fn2.normalize("a\u0313a\u0301\u0313")); 2757 } 2758 2759 @Test 2760 public void TestGetEasyToUseInstance() { 2761 // Test input string: 2762 // U+00A0 -> <noBreak> 0020 2763 // U+00C7 0301 = 1E08 = 0043 0327 0301 2764 String in="\u00A0\u00C7\u0301"; 2765 Normalizer2 n2=Normalizer2.getNFCInstance(); 2766 String out=n2.normalize(in); 2767 assertEquals( 2768 "getNFCInstance() did not return an NFC instance " + 2769 "(normalizes to " + prettify(out) + ')', 2770 "\u00A0\u1E08", out); 2771 2772 n2=Normalizer2.getNFDInstance(); 2773 out=n2.normalize(in); 2774 assertEquals( 2775 "getNFDInstance() did not return an NFD instance " + 2776 "(normalizes to " + prettify(out) + ')', 2777 "\u00A0C\u0327\u0301", out); 2778 2779 n2=Normalizer2.getNFKCInstance(); 2780 out=n2.normalize(in); 2781 assertEquals( 2782 "getNFKCInstance() did not return an NFKC instance " + 2783 "(normalizes to " + prettify(out) + ')', 2784 " \u1E08", out); 2785 2786 n2=Normalizer2.getNFKDInstance(); 2787 out=n2.normalize(in); 2788 assertEquals( 2789 "getNFKDInstance() did not return an NFKD instance " + 2790 "(normalizes to " + prettify(out) + ')', 2791 " C\u0327\u0301", out); 2792 2793 n2=Normalizer2.getNFKCCasefoldInstance(); 2794 out=n2.normalize(in); 2795 assertEquals( 2796 "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance " + 2797 "(normalizes to " + prettify(out) + ')', 2798 " \u1E09", out); 2799 } 2800 2801 @Test 2802 public void TestNFC() { 2803 // Coverage tests. 2804 Normalizer2 nfc = Normalizer2.getNFCInstance(); 2805 assertTrue("nfc.hasBoundaryAfter(space)", nfc.hasBoundaryAfter(' ')); 2806 assertFalse("nfc.hasBoundaryAfter(ä)", nfc.hasBoundaryAfter('ä')); 2807 } 2808 2809 @Test 2810 public void TestNFD() { 2811 // Coverage tests. 
2812 Normalizer2 nfd = Normalizer2.getNFDInstance(); 2813 assertTrue("nfd.hasBoundaryAfter(space)", nfd.hasBoundaryAfter(' ')); 2814 assertFalse("nfd.hasBoundaryAfter(ä)", nfd.hasBoundaryAfter('ä')); 2815 } 2816 2817 @Test 2818 public void TestFCD() { 2819 // Coverage tests. 2820 Normalizer2 fcd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.FCD); 2821 assertTrue("fcd.hasBoundaryAfter(space)", fcd.hasBoundaryAfter(' ')); 2822 assertFalse("fcd.hasBoundaryAfter(ä)", fcd.hasBoundaryAfter('ä')); 2823 assertTrue("fcd.isInert(space)", fcd.isInert(' ')); 2824 assertFalse("fcd.isInert(ä)", fcd.isInert('ä')); 2825 2826 // This implementation method is unreachable via public API. 2827 Norm2AllModes.FCDNormalizer2 impl = (Norm2AllModes.FCDNormalizer2)fcd; 2828 assertEquals("fcd impl.getQuickCheck(space)", 1, impl.getQuickCheck(' ')); 2829 assertEquals("fcd impl.getQuickCheck(ä)", 0, impl.getQuickCheck('ä')); 2830 } 2831 2832 @Test 2833 public void TestNoneNormalizer() { 2834 // Use the deprecated Mode Normalizer.NONE for coverage of the internal NoopNormalizer2 2835 // as far as its methods are reachable that way. 2836 assertEquals("NONE.concatenate()", "ä\u0327", 2837 Normalizer.concatenate("ä", "\u0327", Normalizer.NONE, 0)); 2838 assertTrue("NONE.isNormalized()", Normalizer.isNormalized("ä\u0327", Normalizer.NONE, 0)); 2839 } 2840 2841 @Test 2842 public void TestNoopNormalizer2() { 2843 // Use the internal class directly for coverage of methods that are not publicly reachable. 
2844 Normalizer2 noop = Norm2AllModes.NOOP_NORMALIZER2; 2845 assertEquals("noop.normalizeSecondAndAppend()", "ä\u0327", 2846 noop.normalizeSecondAndAppend(new StringBuilder("ä"), "\u0327").toString()); 2847 assertEquals("noop.getDecomposition()", null, noop.getDecomposition('ä')); 2848 assertTrue("noop.hasBoundaryAfter()", noop.hasBoundaryAfter(0x0308)); 2849 assertTrue("noop.isInert()", noop.isInert(0x0308)); 2850 } 2851 2852 /* 2853 * This unit test covers two 'get' methods in class Normalizer2Impl. It only tests that 2854 * an object is returned. 2855 */ 2856 @Test 2857 public void TestGetsFromImpl() { 2858 Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl; 2859 assertNotEquals("getNormTrie() returns null", null, nfcImpl.getNormTrie()); 2860 assertNotEquals("getFCD16FromBelow180() returns null", null, 2861 nfcImpl.getFCD16FromBelow180(0)); 2862 } 2863 2864 /* 2865 * Abstract class Normalizer2 has non-abstract methods which are overwritten by 2866 * its derived classes. To test these methods a derived class is defined here. 
2867 */ 2868 public class TestNormalizer2 extends Normalizer2 { 2869 2870 public TestNormalizer2() {} 2871 @Override 2872 public StringBuilder normalize(CharSequence src, StringBuilder dest) { return null; } 2873 @Override 2874 public Appendable normalize(CharSequence src, Appendable dest) { return null; } 2875 @Override 2876 public StringBuilder normalizeSecondAndAppend( 2877 StringBuilder first, CharSequence second) { return null; } 2878 @Override 2879 public StringBuilder append(StringBuilder first, CharSequence second) { return null; } 2880 @Override 2881 public String getDecomposition(int c) { return null; } 2882 @Override 2883 public boolean isNormalized(CharSequence s) { return false; } 2884 @Override 2885 public Normalizer.QuickCheckResult quickCheck(CharSequence s) { return null; } 2886 @Override 2887 public int spanQuickCheckYes(CharSequence s) { return 0; } 2888 @Override 2889 public boolean hasBoundaryBefore(int c) { return false; } 2890 @Override 2891 public boolean hasBoundaryAfter(int c) { return false; } 2892 @Override 2893 public boolean isInert(int c) { return false; } 2894 } 2895 2896 final TestNormalizer2 tnorm2 = new TestNormalizer2(); 2897 @Test 2898 public void TestGetRawDecompositionBase() { 2899 int c = 'à'; 2900 assertEquals("Unexpected value returned from Normalizer2.getRawDecomposition()", 2901 null, tnorm2.getRawDecomposition(c)); 2902 } 2903 2904 @Test 2905 public void TestComposePairBase() { 2906 int a = 'a'; 2907 int b = '\u0300'; 2908 assertEquals("Unexpected value returned from Normalizer2.composePair()", 2909 -1, tnorm2.composePair(a, b)); 2910 } 2911 2912 @Test 2913 public void TestGetCombiningClassBase() { 2914 int c = '\u00e0'; 2915 assertEquals("Unexpected value returned from Normalizer2.getCombiningClass()", 2916 0, tnorm2.getCombiningClass(c)); 2917 } 2918} 2919