1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10package com.ibm.icu.dev.test.normalizer; 11 12import java.text.StringCharacterIterator; 13import java.util.Random; 14 15import org.junit.Test; 16 17import com.ibm.icu.dev.test.TestFmwk; 18import com.ibm.icu.impl.Norm2AllModes; 19import com.ibm.icu.impl.Normalizer2Impl; 20import com.ibm.icu.impl.USerializedSet; 21import com.ibm.icu.impl.Utility; 22import com.ibm.icu.lang.UCharacter; 23import com.ibm.icu.lang.UCharacterCategory; 24import com.ibm.icu.lang.UProperty; 25import com.ibm.icu.text.FilteredNormalizer2; 26import com.ibm.icu.text.Normalizer; 27import com.ibm.icu.text.Normalizer2; 28import com.ibm.icu.text.UCharacterIterator; 29import com.ibm.icu.text.UTF16; 30import com.ibm.icu.text.UnicodeSet; 31import com.ibm.icu.text.UnicodeSetIterator; 32 33 34public class BasicTest extends TestFmwk { 35 String[][] canonTests = { 36 // Input Decomposed Composed 37 { "cat", "cat", "cat" }, 38 { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", }, 39 40 { "\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above 41 { "D\u0307", "D\u0307", "\u1e0a" }, // D dot_above 42 43 { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above 44 { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below 45 { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above 46 47 { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above 48 { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below 49 50 { "\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave 
51 { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave 52 { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron 53 54 { "\u212b", "A\u030a", "\u00c5" }, // angstrom_sign 55 { "\u00c5", "A\u030a", "\u00c5" }, // A-ring 56 57 { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" }, 58 { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" }, 59 60 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0 61 { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0 62 63 { "Henry IV", "Henry IV", "Henry IV" }, 64 { "Henry \u2163", "Henry \u2163", "Henry \u2163" }, 65 66 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) 67 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten 68 { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten 69 { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten 70 { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten 71 72 { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" }, 73 {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"}, 74 }; 75 76 String[][] compatTests = { 77 // Input Decomposed Composed 78 { "cat", "cat", "cat" }, 79 { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, // Alef-Lamed vs. 
Alef, Lamed 80 81 { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" }, 82 { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i 83 84 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0 85 { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i 86 87 { "Henry IV", "Henry IV", "Henry IV" }, 88 { "Henry \u2163", "Henry IV", "Henry IV" }, 89 90 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) 91 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten 92 93 { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten 94 95 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/ 96 { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // hw_ka + hw_ten 97 { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka + hw_ten 98 99 }; 100 101 // With Canonical decomposition, Hangul syllables should get decomposed 102 // into Jamo, but Jamo characters should not be decomposed into 103 // conjoining Jamo 104 String[][] hangulCanon = { 105 // Input Decomposed Composed 106 { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" }, 107 { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" }, 108 }; 109 110 // With compatibility decomposition turned on, 111 // it should go all the way down to conjoining Jamo characters. 112 // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE 113 String[][] hangulCompat = { 114 // Input Decomposed Composed 115 // { "\ud4db", "\u1111\u116e\u1175\u11af\u11c2", "\ud478\u1175\u11af\u11c2" }, 116 }; 117 118 @Test 119 public void TestHangulCompose() 120 throws Exception{ 121 // Make sure that the static composition methods work 122 logln("Canonical composition..."); 123 staticTest(Normalizer.NFC, hangulCanon, 2); 124 logln("Compatibility composition..."); 125 staticTest(Normalizer.NFKC, hangulCompat, 2); 126 // Now try iterative composition.... 
127 logln("Iterative composition..."); 128 Normalizer norm = new Normalizer("", Normalizer.NFC,0); 129 iterateTest(norm, hangulCanon, 2); 130 131 norm.setMode(Normalizer.NFKD); 132 iterateTest(norm, hangulCompat, 2); 133 134 // And finally, make sure you can do it in reverse too 135 logln("Reverse iteration..."); 136 norm.setMode(Normalizer.NFC); 137 backAndForth(norm, hangulCanon); 138 } 139 140 @Test 141 public void TestHangulDecomp() throws Exception{ 142 // Make sure that the static decomposition methods work 143 logln("Canonical decomposition..."); 144 staticTest(Normalizer.NFD, hangulCanon, 1); 145 logln("Compatibility decomposition..."); 146 staticTest(Normalizer.NFKD, hangulCompat, 1); 147 148 // Now the iterative decomposition methods... 149 logln("Iterative decomposition..."); 150 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 151 iterateTest(norm, hangulCanon, 1); 152 153 norm.setMode(Normalizer.NFKD); 154 iterateTest(norm, hangulCompat, 1); 155 156 // And finally, make sure you can do it in reverse too 157 logln("Reverse iteration..."); 158 norm.setMode(Normalizer.NFD); 159 backAndForth(norm, hangulCanon); 160 } 161 @Test 162 public void TestNone() throws Exception{ 163 Normalizer norm = new Normalizer("", Normalizer.NONE,0); 164 iterateTest(norm, canonTests, 0); 165 staticTest(Normalizer.NONE, canonTests, 0); 166 } 167 @Test 168 public void TestDecomp() throws Exception{ 169 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 170 iterateTest(norm, canonTests, 1); 171 staticTest(Normalizer.NFD, canonTests, 1); 172 decomposeTest(Normalizer.NFD, canonTests, 1); 173 } 174 175 @Test 176 public void TestCompatDecomp() throws Exception{ 177 Normalizer norm = new Normalizer("", Normalizer.NFKD,0); 178 iterateTest(norm, compatTests, 1); 179 staticTest(Normalizer.NFKD,compatTests, 1); 180 decomposeTest(Normalizer.NFKD,compatTests, 1); 181 } 182 183 @Test 184 public void TestCanonCompose() throws Exception{ 185 Normalizer norm = new Normalizer("", 
Normalizer.NFC,0); 186 iterateTest(norm, canonTests, 2); 187 staticTest(Normalizer.NFC, canonTests, 2); 188 composeTest(Normalizer.NFC, canonTests, 2); 189 } 190 191 @Test 192 public void TestCompatCompose() throws Exception{ 193 Normalizer norm = new Normalizer("", Normalizer.NFKC,0); 194 iterateTest(norm, compatTests, 2); 195 staticTest(Normalizer.NFKC,compatTests, 2); 196 composeTest(Normalizer.NFKC,compatTests, 2); 197 } 198 199 @Test 200 public void TestExplodingBase() throws Exception{ 201 // \u017f - Latin small letter long s 202 // \u0307 - combining dot above 203 // \u1e61 - Latin small letter s with dot above 204 // \u1e9b - Latin small letter long s with dot above 205 String[][] canon = { 206 // Input Decomposed Composed 207 { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" }, 208 { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" }, 209 }; 210 String[][] compat = { 211 // Input Decomposed Composed 212 { "\u017f", "s", "s" }, 213 { "\u1e9b", "s\u0307", "\u1e61" }, 214 }; 215 216 staticTest(Normalizer.NFD, canon, 1); 217 staticTest(Normalizer.NFC, canon, 2); 218 219 staticTest(Normalizer.NFKD, compat, 1); 220 staticTest(Normalizer.NFKC, compat, 2); 221 222 } 223 224 /** 225 * The Tibetan vowel sign AA, 0f71, was messed up prior to 226 * Unicode version 2.1.9. 227 * Once 2.1.9 or 3.0 is released, uncomment this test. 228 */ 229 @Test 230 public void TestTibetan() throws Exception{ 231 String[][] decomp = { 232 { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" } 233 }; 234 String[][] compose = { 235 { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" } 236 }; 237 238 staticTest(Normalizer.NFD, decomp, 1); 239 staticTest(Normalizer.NFKD,decomp, 2); 240 staticTest(Normalizer.NFC, compose, 1); 241 staticTest(Normalizer.NFKC,compose, 2); 242 } 243 244 /** 245 * Make sure characters in the CompositionExclusion.txt list do not get 246 * composed to. 
247 */ 248 @Test 249 public void TestCompositionExclusion() 250 throws Exception{ 251 // This list is generated from CompositionExclusion.txt. 252 // Update whenever the normalizer tables are updated. Note 253 // that we test all characters listed, even those that can be 254 // derived from the Unicode DB and are therefore commented 255 // out. 256 String EXCLUDED = 257 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" + 258 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" + 259 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" + 260 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" + 261 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" + 262 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" + 263 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" + 264 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" + 265 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" + 266 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" + 267 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" + 268 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" + 269 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" + 270 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E"; 271 for (int i=0; i<EXCLUDED.length(); ++i) { 272 String a = String.valueOf(EXCLUDED.charAt(i)); 273 String b = Normalizer.normalize(a, Normalizer.NFKD); 274 String c = Normalizer.normalize(b, Normalizer.NFC); 275 if (c.equals(a)) { 276 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 277 hex(b) + " x COMPOSE => " + 278 hex(c)); 279 } else if (isVerbose()) { 280 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 281 hex(b) + " x COMPOSE => " + 282 hex(c)); 283 } 284 } 285 // The following method works too, but it is somewhat 286 // incestuous. It uses UInfo, which is the same database that 287 // NormalizerBuilder uses, so if something is wrong with 288 // UInfo, the following test won't show it. All it will show 289 // is that NormalizerBuilder has been run with whatever the 290 // current UInfo is. 
291 // 292 // We comment this out in favor of the test above, which 293 // provides independent verification (but also requires 294 // independent updating). 295// logln("---"); 296// UInfo uinfo = new UInfo(); 297// for (int i=0; i<=0xFFFF; ++i) { 298// if (!uinfo.isExcludedComposition((char)i) || 299// (!uinfo.hasCanonicalDecomposition((char)i) && 300// !uinfo.hasCompatibilityDecomposition((char)i))) continue; 301// String a = String.valueOf((char)i); 302// String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0); 303// String c = Normalizer.normalize(b,Normalizer.COMPOSE,0); 304// if (c.equals(a)) { 305// errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 306// hex(b) + " x COMPOSE => " + 307// hex(c)); 308// } else if (isVerbose()) { 309// logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 310// hex(b) + " x COMPOSE => " + 311// hex(c)); 312// } 313// } 314 } 315 316 /** 317 * Test for a problem that showed up just before ICU 1.6 release 318 * having to do with combining characters with an index of zero. 319 * Such characters do not participate in any canonical 320 * decompositions. However, having an index of zero means that 321 * they all share one typeMask[] entry, that is, they all have to 322 * map to the same canonical class, which is not the case, in 323 * reality. 
324 */ 325 @Test 326 public void TestZeroIndex() 327 throws Exception{ 328 String[] DATA = { 329 // Expect col1 x COMPOSE_COMPAT => col2 330 // Expect col2 x DECOMP => col3 331 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300", 332 "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300", 333 "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300", 334 "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327", 335 "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321", 336 }; 337 338 for (int i=0; i<DATA.length; i+=3) { 339 String a = DATA[i]; 340 String b = Normalizer.normalize(a, Normalizer.NFKC); 341 String exp = DATA[i+1]; 342 if (b.equals(exp)) { 343 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b)); 344 } else { 345 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) + 346 ", expect " + hex(exp)); 347 } 348 a = Normalizer.normalize(b, Normalizer.NFD); 349 exp = DATA[i+2]; 350 if (a.equals(exp)) { 351 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a)); 352 } else { 353 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) + 354 ", expect " + hex(exp)); 355 } 356 } 357 } 358 359 /** 360 * Test for a problem found by Verisign. Problem is that 361 * characters at the start of a string are not put in canonical 362 * order correctly by compose() if there is no starter. 
363 */ 364 @Test 365 public void TestVerisign() 366 throws Exception{ 367 String[] inputs = { 368 "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f", 369 "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad" 370 }; 371 String[] outputs = { 372 "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f", 373 "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4" 374 }; 375 376 for (int i = 0; i < inputs.length; ++i) { 377 String input = inputs[i]; 378 String output = outputs[i]; 379 String result = Normalizer.decompose(input, false); 380 if (!result.equals(output)) { 381 errln("FAIL input: " + hex(input)); 382 errln(" decompose: " + hex(result)); 383 errln(" expected: " + hex(output)); 384 } 385 result = Normalizer.compose(input, false); 386 if (!result.equals(output)) { 387 errln("FAIL input: " + hex(input)); 388 errln(" compose: " + hex(result)); 389 errln(" expected: " + hex(output)); 390 } 391 } 392 393 } 394 @Test 395 public void TestQuickCheckResultNO() 396 throws Exception{ 397 final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C, 398 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E}; 399 final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB, 400 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E}; 401 final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE, 402 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 403 final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE, 404 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 405 406 407 final int SIZE = 10; 408 409 int count = 0; 410 for (; count < SIZE; count ++) 411 { 412 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 413 Normalizer.NFD,0) != Normalizer.NO) 414 { 415 errln("ERROR in NFD quick check at U+" + 416 Integer.toHexString(CPNFD[count])); 417 return; 418 } 419 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 420 Normalizer.NFC,0) !=Normalizer.NO) 421 { 422 errln("ERROR in NFC quick check at U+"+ 423 Integer.toHexString(CPNFC[count])); 424 return; 425 } 426 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 
427 Normalizer.NFKD,0) != Normalizer.NO) 428 { 429 errln("ERROR in NFKD quick check at U+"+ 430 Integer.toHexString(CPNFKD[count])); 431 return; 432 } 433 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 434 Normalizer.NFKC,0) !=Normalizer.NO) 435 { 436 errln("ERROR in NFKC quick check at U+"+ 437 Integer.toHexString(CPNFKC[count])); 438 return; 439 } 440 // for improving coverage 441 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 442 Normalizer.NFKC) !=Normalizer.NO) 443 { 444 errln("ERROR in NFKC quick check at U+"+ 445 Integer.toHexString(CPNFKC[count])); 446 return; 447 } 448 } 449 } 450 451 452 @Test 453 public void TestQuickCheckResultYES() 454 throws Exception{ 455 final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A, 456 0x2261, 0x3075, 0x4000, 0x5000, 0xF000}; 457 final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500, 458 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000}; 459 final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB, 460 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27}; 461 final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000, 462 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E}; 463 464 final int SIZE = 10; 465 int count = 0; 466 467 char cp = 0; 468 while (cp < 0xA0) 469 { 470 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0) 471 != Normalizer.YES) 472 { 473 errln("ERROR in NFD quick check at U+"+ 474 Integer.toHexString(cp)); 475 return; 476 } 477 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0) 478 != Normalizer.YES) 479 { 480 errln("ERROR in NFC quick check at U+"+ 481 Integer.toHexString(cp)); 482 return; 483 } 484 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0) 485 != Normalizer.YES) 486 { 487 errln("ERROR in NFKD quick check at U+" + 488 Integer.toHexString(cp)); 489 return; 490 } 491 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0) 492 != Normalizer.YES) 493 { 494 errln("ERROR in NFKC quick check at U+"+ 495 Integer.toHexString(cp)); 496 return; 497 } 
498 // improve the coverage 499 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC) 500 != Normalizer.YES) 501 { 502 errln("ERROR in NFKC quick check at U+"+ 503 Integer.toHexString(cp)); 504 return; 505 } 506 cp++; 507 } 508 509 for (; count < SIZE; count ++) 510 { 511 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 512 Normalizer.NFD,0)!=Normalizer.YES) 513 { 514 errln("ERROR in NFD quick check at U+"+ 515 Integer.toHexString(CPNFD[count])); 516 return; 517 } 518 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 519 Normalizer.NFC,0)!=Normalizer.YES) 520 { 521 errln("ERROR in NFC quick check at U+"+ 522 Integer.toHexString(CPNFC[count])); 523 return; 524 } 525 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 526 Normalizer.NFKD,0)!=Normalizer.YES) 527 { 528 errln("ERROR in NFKD quick check at U+"+ 529 Integer.toHexString(CPNFKD[count])); 530 return; 531 } 532 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 533 Normalizer.NFKC,0)!=Normalizer.YES) 534 { 535 errln("ERROR in NFKC quick check at U+"+ 536 Integer.toHexString(CPNFKC[count])); 537 return; 538 } 539 // improve the coverage 540 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 541 Normalizer.NFKC)!=Normalizer.YES) 542 { 543 errln("ERROR in NFKC quick check at U+"+ 544 Integer.toHexString(CPNFKC[count])); 545 return; 546 } 547 } 548 } 549 @Test 550 public void TestBengali() throws Exception{ 551 String input = "\u09bc\u09be\u09cd\u09be"; 552 String output=Normalizer.normalize(input,Normalizer.NFC); 553 if(!input.equals(output)){ 554 errln("ERROR in NFC of string"); 555 } 556 } 557 @Test 558 public void TestQuickCheckResultMAYBE() 559 throws Exception{ 560 561 final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161, 562 0x116A, 0x1173, 0x1175, 0x3099, 0x309A}; 563 final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E, 564 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099}; 565 566 567 final int SIZE = 10; 568 569 int count = 0; 570 571 /* NFD and NFKD does 
not have any MAYBE codepoints */ 572 for (; count < SIZE; count ++) 573 { 574 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 575 Normalizer.NFC,0)!=Normalizer.MAYBE) 576 { 577 errln("ERROR in NFC quick check at U+"+ 578 Integer.toHexString(CPNFC[count])); 579 return; 580 } 581 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 582 Normalizer.NFKC,0)!=Normalizer.MAYBE) 583 { 584 errln("ERROR in NFKC quick check at U+"+ 585 Integer.toHexString(CPNFKC[count])); 586 return; 587 } 588 if (Normalizer.quickCheck(new char[]{CPNFC[count]}, 589 Normalizer.NFC,0)!=Normalizer.MAYBE) 590 { 591 errln("ERROR in NFC quick check at U+"+ 592 Integer.toHexString(CPNFC[count])); 593 return; 594 } 595 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 596 Normalizer.NFKC,0)!=Normalizer.MAYBE) 597 { 598 errln("ERROR in NFKC quick check at U+"+ 599 Integer.toHexString(CPNFKC[count])); 600 return; 601 } 602 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 603 Normalizer.NONE,0)!=Normalizer.YES) 604 { 605 errln("ERROR in NONE quick check at U+"+ 606 Integer.toHexString(CPNFKC[count])); 607 return; 608 } 609 } 610 } 611 612 @Test 613 public void TestQuickCheckStringResult() 614 throws Exception{ 615 int count; 616 String d; 617 String c; 618 619 for (count = 0; count < canonTests.length; count ++) 620 { 621 d = canonTests[count][1]; 622 c = canonTests[count][2]; 623 if (Normalizer.quickCheck(d,Normalizer.NFD,0) 624 != Normalizer.YES) 625 { 626 errln("ERROR in NFD quick check for string at count " + count); 627 return; 628 } 629 630 if (Normalizer.quickCheck(c, Normalizer.NFC,0) 631 == Normalizer.NO) 632 { 633 errln("ERROR in NFC quick check for string at count " + count); 634 return; 635 } 636 } 637 638 for (count = 0; count < compatTests.length; count ++) 639 { 640 d = compatTests[count][1]; 641 c = compatTests[count][2]; 642 if (Normalizer.quickCheck(d, Normalizer.NFKD,0) 643 != Normalizer.YES) 644 { 645 errln("ERROR in NFKD quick check for string at count " + count); 
646 return; 647 } 648 649 if (Normalizer.quickCheck(c, Normalizer.NFKC,0) 650 != Normalizer.YES) 651 { 652 errln("ERROR in NFKC quick check for string at count " + count); 653 return; 654 } 655 } 656 } 657 658 static final int qcToInt(Normalizer.QuickCheckResult qc) { 659 if(qc==Normalizer.NO) { 660 return 0; 661 } else if(qc==Normalizer.YES) { 662 return 1; 663 } else /* Normalizer.MAYBE */ { 664 return 2; 665 } 666 } 667 668 @Test 669 public void TestQuickCheckPerCP() { 670 int c, lead, trail; 671 String s, nfd; 672 int lccc1, lccc2, tccc1, tccc2; 673 int qc1, qc2; 674 675 if( 676 UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES 677 UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 || 678 UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE 679 UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 || 680 UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) || 681 UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) 682 ) { 683 errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS"); 684 } 685 686 /* 687 * compare the quick check property values for some code points 688 * to the quick check results for checking same-code point strings 689 */ 690 c=0; 691 while(c<0x110000) { 692 s=UTF16.valueOf(c); 693 694 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK); 695 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC)); 696 if(qc1!=qc2) { 697 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c)); 698 } 699 700 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK); 701 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD)); 702 if(qc1!=qc2) { 703 
errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c)); 704 } 705 706 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK); 707 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC)); 708 if(qc1!=qc2) { 709 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c)); 710 } 711 712 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK); 713 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD)); 714 if(qc1!=qc2) { 715 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c)); 716 } 717 718 nfd=Normalizer.normalize(s, Normalizer.NFD); 719 lead=UTF16.charAt(nfd, 0); 720 trail=UTF16.charAt(nfd, nfd.length()-1); 721 722 lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS); 723 lccc2=UCharacter.getCombiningClass(lead); 724 tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 725 tccc2=UCharacter.getCombiningClass(trail); 726 727 if(lccc1!=lccc2) { 728 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c)); 729 } 730 if(tccc1!=tccc2) { 731 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c)); 732 } 733 734 /* skip some code points */ 735 c=(20*c)/19+1; 736 } 737 } 738 739 //------------------------------------------------------------------------ 740 // Internal utilities 741 // 742 //------------------------------------------------------------------------ 743 // Internal utilities 744 // 745 746/* private void backAndForth(Normalizer iter, String input) 747 { 748 iter.setText(input); 749 750 // Run through the iterator forwards and stick it into a StringBuffer 751 StringBuffer forward = new StringBuffer(); 752 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { 753 forward.append(ch); 754 } 755 756 // Now do it backwards 757 
StringBuffer reverse = new StringBuffer();
        for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
            reverse.insert(0, ch);
        }

        if (!forward.toString().equals(reverse.toString())) {
            errln("FAIL: Forward/reverse mismatch for input " + hex(input)
                  + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
        } else if (isVerbose()) {
            logln("Ok: Forward/reverse for input " + hex(input)
                  + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
        }
    }*/

    /**
     * Iterates over each test input (column 0) forwards and then backwards
     * and checks that both passes yield the same text.
     *
     * @param iter  normalizing iterator; its text is replaced for every row
     * @param tests table whose column 0 holds the inputs
     */
    private void backAndForth(Normalizer iter, String[][] tests)
    {
        for (int i = 0; i < tests.length; i++)
        {
            iter.setText(tests[i][0]);

            // Run through the iterator forwards and stick it into a
            // StringBuffer. The iterator yields code points as ints, so they
            // must be appended as code points: append(int) would add the
            // decimal digits of the value instead of the character.
            StringBuffer forward = new StringBuffer();
            for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
                forward.appendCodePoint(ch);
            }

            // Now do it backwards, prepending each code point.
            StringBuffer reverse = new StringBuffer();
            for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
                reverse.insert(0, Character.toChars(ch));
            }

            if (!forward.toString().equals(reverse.toString())) {
                errln("FAIL: Forward/reverse mismatch for input "
                      + hex(tests[i][0]) + ", forward: " + hex(forward)
                      + ", backward: " + hex(reverse));
            } else if (isVerbose()) {
                logln("Ok: Forward/reverse for input " + hex(tests[i][0])
                      + ", forward: " + hex(forward) + ", backward: "
                      + hex(reverse));
            }
        }
    }

    /**
     * Runs every row through {@code Normalizer.normalize}, first with the
     * String overload and then with the char[] overload.
     *
     * <p>The char[] pass starts with a one-element buffer. When the call
     * throws IndexOutOfBoundsException the buffer is re-allocated with the
     * size parsed from the exception message and the call is retried.
     *
     * @param mode   normalization mode under test
     * @param tests  table of escaped strings (expanded with Utility.unescape)
     * @param outCol column holding the expected result
     */
    private void staticTest (Normalizer.Mode mode,
                             String[][] tests, int outCol) throws Exception{
        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            String output = Normalizer.normalize(input, mode);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + output + "' (" + hex(output) + ")" );
            }
        }
        char[] output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                  hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.normalize(input,output, mode,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                    // Larger size reported without an exception: grow
                    // instead of retrying with the same buffer forever.
                    output = new char[reqLength];
                }catch(IndexOutOfBoundsException e){
                    output= new char[Integer.parseInt(e.getMessage())];
                }
            }
            // Only the first reqLength chars are valid; the buffer is reused
            // across rows and may hold stale data beyond that.
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + actual
                      + "' (" + hex(actual) + ")" );
            }
        }
    }

    /**
     * Runs every row through the three {@code Normalizer.decompose}
     * overloads: String, char[] and char[] with explicit offsets. The last
     * pass also decomposes into the second half of a double-sized buffer and
     * checks the returned length.
     *
     * @param mode   NFKD selects compatibility decomposition; any other mode
     *               selects canonical decomposition
     * @param tests  table of escaped strings (expanded with Utility.unescape)
     * @param outCol column holding the expected result
     */
    private void decomposeTest(Normalizer.Mode mode,
                             String[][] tests, int outCol) throws Exception{
        final boolean compat = (mode==Normalizer.NFKD);

        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            String output = Normalizer.decompose(input, compat);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + output + "' (" + hex(output) + ")" );
            }
        }
        char[] output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                  hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.decompose(input,output, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                    output = new char[reqLength];
                }catch(IndexOutOfBoundsException e){
                    output= new char[Integer.parseInt(e.getMessage())];
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + actual
                      + "' (" + hex(actual) + ")" );
            }
        }
        output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                  hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                    output = new char[reqLength];
                }catch(IndexOutOfBoundsException e){
                    output= new char[Integer.parseInt(e.getMessage())];
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + actual
                      + "' (" + hex(actual) + ")" );
            }

            // Decompose again into the upper half of a double-sized buffer.
            // The same compat flag must be used (this used to test
            // mode==NFKC, which is never true for this helper's callers),
            // and a wrong length is a real failure, not a log line.
            char[] output2 = new char[reqLength * 2];
            System.arraycopy(output, 0, output2, 0, reqLength);
            int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, compat,0);
            if(retLength != reqLength){
                errln("FAIL: Normalizer.decompose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
            }
        }
    }

    /**
     * Runs every row through the three {@code Normalizer.compose}
     * overloads: String, char[] and char[] with explicit offsets. The last
     * pass also composes into the second half of a double-sized buffer and
     * checks the returned length.
     *
     * @param mode   NFKC selects compatibility composition; any other mode
     *               selects canonical composition
     * @param tests  table of escaped strings (expanded with Utility.unescape)
     * @param outCol column holding the expected result
     */
    private void composeTest(Normalizer.Mode mode,
                             String[][] tests, int outCol) throws Exception{
        final boolean compat = (mode==Normalizer.NFKC);

        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            String output = Normalizer.compose(input, compat);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + output + "' (" + hex(output) + ")" );
            }
        }
        char[] output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                  hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.compose(input,output, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                    output = new char[reqLength];
                }catch(IndexOutOfBoundsException e){
                    output= new char[Integer.parseInt(e.getMessage())];
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + actual
                      + "' (" + hex(actual) + ")" );
            }
        }
        output = new char[1];
        for (int i = 0; i < tests.length; i++)
        {
            char[] input = Utility.unescape(tests[i][0]).toCharArray();
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + new String(input) + "' (" +
                  hex(new String(input)) + ")" );
            int reqLength=0;
            while(true){
                try{
                    reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, compat,0);
                    if(reqLength<=output.length ){
                        break;
                    }
                    output = new char[reqLength];
                }catch(IndexOutOfBoundsException e){
                    output= new char[Integer.parseInt(e.getMessage())];
                }
            }
            String actual = new String(output,0,reqLength);
            if (!expect.equals(actual)) {
                errln("FAIL: case " + i
                      + " expected '" + expect + "' (" + hex(expect) + ")"
                      + " but got '" + actual
                      + "' (" + hex(actual) + ")" );
            }

            // Compose again into the upper half of a double-sized buffer; a
            // wrong length is reported as a failure rather than only logged.
            char[] output2 = new char[reqLength * 2];
            System.arraycopy(output, 0, output2, 0, reqLength);
            int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, compat,0);
            if(retLength != reqLength){
                errln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
            }
        }
    }

    /**
     * Feeds every input (column 0) to the iterator and compares what it
     * yields with the expected column.
     *
     * @param iter   normalizing iterator, already set to the mode under test
     * @param tests  table of escaped strings (expanded with Utility.unescape)
     * @param outCol column holding the expected result
     */
    private void iterateTest(Normalizer iter, String[][] tests, int outCol){
        for (int i = 0; i < tests.length; i++)
        {
            String input = Utility.unescape(tests[i][0]);
            String expect = Utility.unescape(tests[i][outCol]);

            logln("Normalizing '" + input + "' (" + hex(input) + ")" );

            iter.setText(input);
            assertEqual(expect, iter, "case " + i + " ");
        }
    }

    /**
     * Renders a code point for a diagnostic message. Returns an empty string
     * for values outside U+0000..U+10FFFF, such as an iterator's DONE marker.
     */
    private static String displayCodePoint(int cp) {
        return (cp >= 0 && cp <= 0x10FFFF) ? UTF16.valueOf(cp) : "";
    }

    /**
     * Walks the iterator forwards with next() and then backwards with
     * previous(), comparing each code point with {@code expected}.
     *
     * @param expected text the iterator should produce
     * @param iter     iterator under test; the caller has already set its text
     * @param msg      prefix for failure messages
     */
    private void assertEqual(String expected, Normalizer iter, String msg)
    {
        int index = 0;
        int ch;
        UCharacterIterator cIter = UCharacterIterator.getInstance(expected);

        while ((ch=iter.next())!= Normalizer.DONE){
            if (index >= expected.length()) {
                errln("FAIL: " + msg + "Unexpected character '" + displayCodePoint(ch)
                      + "' (" + hex(ch) + ")"
                      + " at index " + index);
                break;
            }
            int want = UTF16.charAt(expected,index);
            if (ch != want) {
                // Both sides are printed as characters; the expected value
                // used to be printed as a decimal number.
                errln("FAIL: " + msg + "got '" + displayCodePoint(ch)
                      + "' (" + hex(ch) + ")"
                      + " but expected '" + displayCodePoint(want) + "' (" + hex(want)+ ")"
                      + " at index " + index);
            }
            index+= UTF16.getCharCount(ch);
        }
        if (index < expected.length()) {
            errln("FAIL: " + msg + "Only got " + index + " chars, expected "
                  + expected.length());
        }

        cIter.setToLimit();
        while((ch=iter.previous())!=Normalizer.DONE){
            int want = cIter.previousCodePoint();
            if (ch != want ) {
                // Report the position reached in the expected text, not the
                // index left over from the forward pass.
                errln("FAIL: " + msg + "got '" + displayCodePoint(ch)
                      + "' (" + hex(ch) + ")"
                      + " but expected '" + displayCodePoint(want) + "' (" + hex(want) + ")"
                      + " at index " + cIter.getIndex());
            }
        }
    }
    //--------------------------------------------------------------------------

    // NOTE: These tests are used for quick debugging so are not ported
    // to ICU4C tsnorm.cpp in intltest
    //

    @Test
    public void TestDebugStatic(){
        String in = Utility.unescape("\\U0001D157\\U0001D165");
        if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){
            errln("isNormalized failed");
        }

        String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+
            "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
            "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
            "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
            "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
            "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
            "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
            "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
            "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
            "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
            "\uAD8B\uAD8B\uAD8B\uAD8B"+
            "d\u031B\u0307\u0323";
        String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
                        "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+
                        "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
                        "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
                        "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
"\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1087 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1088 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1089 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1090 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1091 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1092 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1093 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1094 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1095 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1096 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1097 "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+ 1098 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1099 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1100 "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+ 1101 "cccccccccccccccccccccccccccccccccccccccccccccccc"+ 1102 "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1103 "dddddddddddddddddddddddd"+ 1104 "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ 1105 "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307"; 1106 String output = Normalizer.normalize(Utility.unescape(input), 1107 Normalizer.NFD); 1108 if(!expect.equals(output)){ 1109 errln("FAIL expected: "+hex(expect) + " got: "+hex(output)); 1110 } 1111 1112 1113 1114 } 1115 @Test 1116 public void TestDebugIter(){ 1117 String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1118 String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1119 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)), 1120 Normalizer.NONE,0); 1121 int index = 0; 1122 int ch; 1123 UCharacterIterator cIter = UCharacterIterator.getInstance(expected); 1124 1125 while ((ch=iter.next())!= Normalizer.DONE){ 1126 if (index >= expected.length()) { 1127 errln("FAIL: " + "Unexpected character '" + (char)ch 1128 + "' (" + hex(ch) + ")" 1129 + " at index " + index); 
1130 break; 1131 } 1132 int want = UTF16.charAt(expected,index); 1133 if (ch != want) { 1134 errln("FAIL: " + "got '" + (char)ch 1135 + "' (" + hex(ch) + ")" 1136 + " but expected '" + want + "' (" + hex(want)+ ")" 1137 + " at index " + index); 1138 } 1139 index+= UTF16.getCharCount(ch); 1140 } 1141 if (index < expected.length()) { 1142 errln("FAIL: " + "Only got " + index + " chars, expected " 1143 + expected.length()); 1144 } 1145 1146 cIter.setToLimit(); 1147 while((ch=iter.previous())!=Normalizer.DONE){ 1148 int want = cIter.previousCodePoint(); 1149 if (ch != want ) { 1150 errln("FAIL: " + "got '" + (char)ch 1151 + "' (" + hex(ch) + ")" 1152 + " but expected '" + want + "' (" + hex(want) + ")" 1153 + " at index " + index); 1154 } 1155 } 1156 } 1157 @Test 1158 public void TestDebugIterOld(){ 1159 String input = "\\U0001D15E"; 1160 String expected = "\uD834\uDD57\uD834\uDD65"; 1161 String expectedReverse = "\uD834\uDD65\uD834\uDD57"; 1162 int index = 0; 1163 int ch; 1164 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)), 1165 Normalizer.NFKC,0); 1166 StringBuffer got = new StringBuffer(); 1167 for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next()) 1168 { 1169 if (index >= expected.length()) { 1170 errln("FAIL: " + "Unexpected character '" + (char)ch + 1171 "' (" + hex(ch) + ")" + " at index " + index); 1172 break; 1173 } 1174 got.append(UCharacter.toString(ch)); 1175 index++; 1176 } 1177 if (!expected.equals(got.toString())) { 1178 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1179 + " but expected '" + expected + "' (" 1180 + hex(expected) + ")"); 1181 } 1182 if (got.length() < expected.length()) { 1183 errln("FAIL: " + "Only got " + index + " chars, expected " 1184 + expected.length()); 1185 } 1186 1187 logln("Reverse Iteration\n"); 1188 iter.setIndexOnly(iter.endIndex()); 1189 got.setLength(0); 1190 for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){ 1191 if (index >= expected.length()) { 1192 
errln("FAIL: " + "Unexpected character '" + (char)ch 1193 + "' (" + hex(ch) + ")" + " at index " + index); 1194 break; 1195 } 1196 got.append(UCharacter.toString(ch)); 1197 } 1198 if (!expectedReverse.equals(got.toString())) { 1199 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1200 + " but expected '" + expected 1201 + "' (" + hex(expected) + ")"); 1202 } 1203 if (got.length() < expected.length()) { 1204 errln("FAIL: " + "Only got " + index + " chars, expected " 1205 + expected.length()); 1206 } 1207 1208 } 1209 //-------------------------------------------------------------------------- 1210 // helper class for TestPreviousNext() 1211 // simple UTF-32 character iterator 1212 class UCharIterator { 1213 1214 public UCharIterator(int[] src, int len, int index){ 1215 1216 s=src; 1217 length=len; 1218 i=index; 1219 } 1220 1221 public int current() { 1222 if(i<length) { 1223 return s[i]; 1224 } else { 1225 return -1; 1226 } 1227 } 1228 1229 public int next() { 1230 if(i<length) { 1231 return s[i++]; 1232 } else { 1233 return -1; 1234 } 1235 } 1236 1237 public int previous() { 1238 if(i>0) { 1239 return s[--i]; 1240 } else { 1241 return -1; 1242 } 1243 } 1244 1245 public int getIndex() { 1246 return i; 1247 } 1248 1249 private int[] s; 1250 private int length, i; 1251 } 1252 @Test 1253 public void TestPreviousNext() { 1254 // src and expect strings 1255 char src[]={ 1256 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1257 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1258 0xc4, 1259 0x1ed0 1260 }; 1261 int expect[]={ 1262 0x831d, 1263 0x1d158, 0x1d165, 1264 0x41, 0x308, 1265 0x4f, 0x302, 0x301 1266 }; 1267 1268 // expected src indexes corresponding to expect indexes 1269 int expectIndex[]={ 1270 0, 1271 2, 2, 1272 4, 4, 1273 5, 5, 5, 1274 6 // behind last character 1275 }; 1276 1277 // initial indexes into the src and expect strings 1278 1279 final int SRC_MIDDLE=4; 1280 final int EXPECT_MIDDLE=3; 1281 1282 1283 // 
movement vector 1284 // - for previous(), 0 for current(), + for next() 1285 // not const so that we can terminate it below for the error message 1286 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1287 1288 // iterators 1289 Normalizer iter = new Normalizer(new String(src), 1290 Normalizer.NFD,0); 1291 UCharIterator iter32 = new UCharIterator(expect, expect.length, 1292 EXPECT_MIDDLE); 1293 1294 int c1, c2; 1295 char m; 1296 1297 // initially set the indexes into the middle of the strings 1298 iter.setIndexOnly(SRC_MIDDLE); 1299 1300 // move around and compare the iteration code points with 1301 // the expected ones 1302 int movesIndex =0; 1303 while(movesIndex<moves.length()) { 1304 m=moves.charAt(movesIndex++); 1305 if(m=='-') { 1306 c1=iter.previous(); 1307 c2=iter32.previous(); 1308 } else if(m=='0') { 1309 c1=iter.current(); 1310 c2=iter32.current(); 1311 } else /* m=='+' */ { 1312 c1=iter.next(); 1313 c2=iter32.next(); 1314 } 1315 1316 // compare results 1317 if(c1!=c2) { 1318 // copy the moves until the current (m) move, and terminate 1319 String history = moves.substring(0,movesIndex); 1320 errln("error: mismatch in Normalizer iteration at "+history+": " 1321 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1322 break; 1323 } 1324 1325 // compare indexes 1326 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1327 // copy the moves until the current (m) move, and terminate 1328 String history = moves.substring(0,movesIndex); 1329 errln("error: index mismatch in Normalizer iteration at " 1330 +history+ " : "+ "Normalizer index " +iter.getIndex() 1331 +" expected "+ expectIndex[iter32.getIndex()]); 1332 break; 1333 } 1334 } 1335 } 1336 // Only in ICU4j 1337 @Test 1338 public void TestPreviousNextJCI() { 1339 // src and expect strings 1340 char src[]={ 1341 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1342 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1343 0xc4, 1344 0x1ed0 1345 }; 1346 int expect[]={ 1347 
0x831d, 1348 0x1d158, 0x1d165, 1349 0x41, 0x308, 1350 0x4f, 0x302, 0x301 1351 }; 1352 1353 // expected src indexes corresponding to expect indexes 1354 int expectIndex[]={ 1355 0, 1356 2, 2, 1357 4, 4, 1358 5, 5, 5, 1359 6 // behind last character 1360 }; 1361 1362 // initial indexes into the src and expect strings 1363 1364 final int SRC_MIDDLE=4; 1365 final int EXPECT_MIDDLE=3; 1366 1367 1368 // movement vector 1369 // - for previous(), 0 for current(), + for next() 1370 // not const so that we can terminate it below for the error message 1371 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1372 1373 // iterators 1374 StringCharacterIterator text = new StringCharacterIterator(new String(src)); 1375 Normalizer iter = new Normalizer(text,Normalizer.NFD,0); 1376 UCharIterator iter32 = new UCharIterator(expect, expect.length, 1377 EXPECT_MIDDLE); 1378 1379 int c1, c2; 1380 char m; 1381 1382 // initially set the indexes into the middle of the strings 1383 iter.setIndexOnly(SRC_MIDDLE); 1384 1385 // move around and compare the iteration code points with 1386 // the expected ones 1387 int movesIndex =0; 1388 while(movesIndex<moves.length()) { 1389 m=moves.charAt(movesIndex++); 1390 if(m=='-') { 1391 c1=iter.previous(); 1392 c2=iter32.previous(); 1393 } else if(m=='0') { 1394 c1=iter.current(); 1395 c2=iter32.current(); 1396 } else /* m=='+' */ { 1397 c1=iter.next(); 1398 c2=iter32.next(); 1399 } 1400 1401 // compare results 1402 if(c1!=c2) { 1403 // copy the moves until the current (m) move, and terminate 1404 String history = moves.substring(0,movesIndex); 1405 errln("error: mismatch in Normalizer iteration at "+history+": " 1406 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1407 break; 1408 } 1409 1410 // compare indexes 1411 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1412 // copy the moves until the current (m) move, and terminate 1413 String history = moves.substring(0,movesIndex); 1414 errln("error: index mismatch in Normalizer iteration at " 
1415 +history+ " : "+ "Normalizer index " +iter.getIndex() 1416 +" expected "+ expectIndex[iter32.getIndex()]); 1417 break; 1418 } 1419 } 1420 } 1421 1422 // test APIs that are not otherwise used - improve test coverage 1423 @Test 1424 public void TestNormalizerAPI() throws Exception { 1425 try{ 1426 // instantiate a Normalizer from a CharacterIterator 1427 String s=Utility.unescape("a\u0308\uac00\\U0002f800"); 1428 // make s a bit longer and more interesting 1429 UCharacterIterator iter = UCharacterIterator.getInstance(s+s); 1430 Normalizer norm = new Normalizer(iter, Normalizer.NFC,0); 1431 if(norm.next()!=0xe4) { 1432 errln("error in Normalizer(CharacterIterator).next()"); 1433 } 1434 1435 // test clone(), ==, and hashCode() 1436 Normalizer clone=(Normalizer)norm.clone(); 1437 if(clone.equals(norm)) { 1438 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm"); 1439 } 1440 1441 if(clone.getLength()!= norm.getLength()){ 1442 errln("error in Normalizer.getBeginIndex()"); 1443 } 1444 // clone must have the same hashCode() 1445 //if(clone.hashCode()!=norm.hashCode()) { 1446 // errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()"); 1447 //} 1448 if(clone.next()!=0xac00) { 1449 errln("error in Normalizer(Normalizer(CharacterIterator)).next()"); 1450 } 1451 int ch = clone.next(); 1452 if(ch!=0x4e3d) { 1453 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()"); 1454 } 1455 // position changed, must change hashCode() 1456 if(clone.hashCode()==norm.hashCode()) { 1457 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()"); 1458 } 1459 1460 // test compose() and decompose() 1461 StringBuffer tel; 1462 String nfkc, nfkd; 1463 tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121"); 1464 tel.insert(1,(char)0x0301); 1465 1466 nfkc=Normalizer.compose(tel.toString(), true); 1467 nfkd=Normalizer.decompose(tel.toString(), 
true); 1468 if( 1469 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))|| 1470 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL")) 1471 ) { 1472 errln("error in Normalizer::(de)compose(): wrong result(s)"); 1473 } 1474 1475 // test setIndex() 1476 ch=norm.setIndex(3); 1477 if(ch!=0x4e3d) { 1478 errln("error in Normalizer(CharacterIterator).setIndex(3)"); 1479 } 1480 1481 // test setText(CharacterIterator) and getText() 1482 String out, out2; 1483 clone.setText(iter); 1484 1485 out = clone.getText(); 1486 out2 = iter.getText(); 1487 if( !out.equals(out2) || 1488 clone.startIndex()!=0|| 1489 clone.endIndex()!=iter.getLength() 1490 ) { 1491 errln("error in Normalizer::setText() or Normalizer::getText()"); 1492 } 1493 1494 char[] fillIn1 = new char[clone.getLength()]; 1495 char[] fillIn2 = new char[iter.getLength()]; 1496 int len = clone.getText(fillIn1); 1497 iter.getText(fillIn2,0); 1498 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1499 errln("error in Normalizer.getText(). 
Normalizer: "+ 1500 Utility.hex(new String(fillIn1))+ 1501 " Iter: " + Utility.hex(new String(fillIn2))); 1502 } 1503 1504 clone.setText(fillIn1); 1505 len = clone.getText(fillIn2); 1506 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1507 errln("error in Normalizer.setText() or Normalizer.getText()"+ 1508 Utility.hex(new String(fillIn1))+ 1509 " Iter: " + Utility.hex(new String(fillIn2))); 1510 } 1511 1512 // test setText(UChar *), getUMode() and setMode() 1513 clone.setText(s); 1514 clone.setIndexOnly(1); 1515 clone.setMode(Normalizer.NFD); 1516 if(clone.getMode()!=Normalizer.NFD) { 1517 errln("error in Normalizer::setMode() or Normalizer::getMode()"); 1518 } 1519 if(clone.next()!=0x308 || clone.next()!=0x1100) { 1520 errln("error in Normalizer::setText() or Normalizer::setMode()"); 1521 } 1522 1523 // test last()/previous() with an internal buffer overflow 1524 StringBuffer buf = new StringBuffer("aaaaaaaaaa"); 1525 buf.setCharAt(10-1,'\u0308'); 1526 clone.setText(buf); 1527 if(clone.last()!=0x308) { 1528 errln("error in Normalizer(10*U+0308).last()"); 1529 } 1530 1531 // test UNORM_NONE 1532 norm.setMode(Normalizer.NONE); 1533 if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) { 1534 errln("error in Normalizer(UNORM_NONE).first()/next()/last()"); 1535 } 1536 out=Normalizer.normalize(s, Normalizer.NONE); 1537 if(!out.equals(s)) { 1538 errln("error in Normalizer::normalize(UNORM_NONE)"); 1539 } 1540 ch = 0x1D15E; 1541 String exp = "\\U0001D157\\U0001D165"; 1542 String ns = Normalizer.normalize(ch,Normalizer.NFC); 1543 if(!ns.equals(Utility.unescape(exp))){ 1544 errln("error in Normalizer.normalize(int,Mode)"); 1545 } 1546 ns = Normalizer.normalize(ch,Normalizer.NFC,0); 1547 if(!ns.equals(Utility.unescape(exp))){ 1548 errln("error in Normalizer.normalize(int,Mode,int)"); 1549 } 1550 }catch(Exception e){ 1551 throw e; 1552 } 1553 } 1554 1555 @Test 1556 public void TestConcatenate() { 1557 1558 Object[][]cases=new Object[][]{ 1559 /* 
mode, left, right, result */ 1560 { 1561 Normalizer.NFC, 1562 "re", 1563 "\u0301sum\u00e9", 1564 "r\u00e9sum\u00e9" 1565 }, 1566 { 1567 Normalizer.NFC, 1568 "a\u1100", 1569 "\u1161bcdefghijk", 1570 "a\uac00bcdefghijk" 1571 }, 1572 /* ### TODO: add more interesting cases */ 1573 { 1574 Normalizer.NFD, 1575 "\u03B1\u0345", 1576 "\u0C4D\uD804\uDCBA\uD834\uDD69", // 0C4D 110BA 1D169 1577 "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345" // 03B1 1D169 110BA 0C4D 0345 1578 } 1579 }; 1580 1581 String left, right, expect, result; 1582 Normalizer.Mode mode; 1583 int i; 1584 1585 /* test concatenation */ 1586 for(i=0; i<cases.length; ++i) { 1587 mode = (Normalizer.Mode)cases[i][0]; 1588 1589 left=(String)cases[i][1]; 1590 right=(String)cases[i][2]; 1591 expect=(String)cases[i][3]; 1592 { 1593 result=Normalizer.concatenate(left, right, mode,0); 1594 if(!result.equals(expect)) { 1595 errln("error in Normalizer.concatenate(), cases[] failed" 1596 +", result==expect: expected: " 1597 + hex(expect)+" =========> got: " + hex(result)); 1598 } 1599 } 1600 { 1601 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0); 1602 if(!result.equals(expect)) { 1603 errln("error in Normalizer.concatenate(), cases[] failed" 1604 +", result==expect: expected: " 1605 + hex(expect)+" =========> got: " + hex(result)); 1606 } 1607 } 1608 } 1609 1610 mode= Normalizer.NFC; // (Normalizer.Mode)cases2[0][0]; 1611 char[] destination = "My resume is here".toCharArray(); 1612 left = "resume"; 1613 right = "re\u0301sum\u00e9 is HERE"; 1614 expect = "My r\u00e9sum\u00e9 is HERE"; 1615 1616 // Concatenates 're' with '\u0301sum\u00e9 is HERE' and places the result at 1617 // position 3 of string 'My resume is here'. 
1618 Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, 1619 destination, 3, 17, mode, 0); 1620 if(!String.valueOf(destination).equals(expect)) { 1621 errln("error in Normalizer.concatenate(), cases2[] failed" 1622 +", result==expect: expected: " 1623 + hex(expect) + " =========> got: " + hex(destination)); 1624 } 1625 1626 // Error case when result of concatenation won't fit into destination array. 1627 try { 1628 Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, 1629 destination, 3, 16, mode, 0); 1630 } catch (IndexOutOfBoundsException e) { 1631 assertTrue("Normalizer.concatenate() failed", e.getMessage().equals("14")); 1632 return; 1633 } 1634 fail("Normalizer.concatenate() tested for failure but passed"); 1635 } 1636 1637 private final int RAND_MAX = 0x7fff; 1638 1639 @Test 1640 public void TestCheckFCD() 1641 { 1642 char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 1643 0x0008, 0x0009, 0x000A}; 1644 1645 char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301, 1646 0x02B9, 0x0314, 0x0315, 0x0316}; 1647 1648 char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7, 1649 0x0050, 0x0730, 0x09EE, 0x1E10}; 1650 1651 char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0}, 1652 {0x0061, 0x030A, 0x00E2, 0x0323, 0}, 1653 {0x0061, 0x0323, 0x00E2, 0x0323, 0}, 1654 {0x0061, 0x0323, 0x1E05, 0x0302, 0} 1655 }; 1656 Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES}; 1657 1658 char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 1659 0x6a, 1660 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 1661 0xea, 1662 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 1663 0x0307, 0x0308, 0x0309, 0x030a, 1664 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 1665 0x0327, 0x0328, 0x0329, 0x032a, 1666 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06, 1667 0x1e07, 0x1e08, 0x1e09, 0x1e0a 1668 }; 1669 
1670 int count = 0; 1671 1672 if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES) 1673 errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n"); 1674 if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO) 1675 errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n"); 1676 if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES) 1677 errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n"); 1678 1679 1680 while (count < 4) 1681 { 1682 Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0); 1683 if (result[count] != fcdresult) { 1684 errln("Normalizer.quickCheck(FCD) failed: Data set "+ count 1685 + " expected value "+ result[count]); 1686 } 1687 count ++; 1688 } 1689 1690 /* random checks of long strings */ 1691 //srand((unsigned)time( NULL )); 1692 Random rand = createRandom(); // use test framework's random 1693 1694 for (count = 0; count < 50; count ++) 1695 { 1696 int size = 0; 1697 Normalizer.QuickCheckResult testresult = Normalizer.YES; 1698 char[] data= new char[20]; 1699 char[] norm= new char[100]; 1700 char[] nfd = new char[100]; 1701 int normStart = 0; 1702 int nfdsize = 0; 1703 while (size != 19) { 1704 data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX]; 1705 logln("0x"+data[size]); 1706 normStart += Normalizer.normalize(data,size,size+1, 1707 norm,normStart,100, 1708 Normalizer.NFD,0); 1709 size ++; 1710 } 1711 logln("\n"); 1712 1713 nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0); 1714 // nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL, 1715 // nfd, 100, &status); 1716 if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) { 1717 testresult = Normalizer.NO; 
1718 } 1719 if (testresult == Normalizer.YES) { 1720 logln("result Normalizer.YES\n"); 1721 } 1722 else { 1723 logln("result Normalizer.NO\n"); 1724 } 1725 1726 if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) { 1727 errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) ); 1728 } 1729 } 1730 } 1731 1732 1733 // reference implementation of Normalizer::compare 1734 private int ref_norm_compare(String s1, String s2, int options) { 1735 String t1, t2,r1,r2; 1736 1737 int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; 1738 1739 if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) { 1740 // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 1741 r1 = Normalizer.decompose(s1,false,normOptions); 1742 r2 = Normalizer.decompose(s2,false,normOptions); 1743 r1 = UCharacter.foldCase(r1,options); 1744 r2 = UCharacter.foldCase(r2,options); 1745 }else{ 1746 r1 = s1; 1747 r2 = s2; 1748 } 1749 1750 t1 = Normalizer.decompose(r1, false, normOptions); 1751 t2 = Normalizer.decompose(r2, false, normOptions); 1752 1753 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1754 UTF16.StringComparator comp 1755 = new UTF16.StringComparator(true, false, 1756 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1757 return comp.compare(t1,t2); 1758 } else { 1759 return t1.compareTo(t2); 1760 } 1761 1762 } 1763 1764 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately 1765 private int norm_compare(String s1, String s2, int options) { 1766 int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; 1767 1768 if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) && 1769 Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) { 1770 options|=Normalizer.INPUT_IS_FCD; 1771 } 1772 1773 int cmpStrings = Normalizer.compare(s1, s2, options); 1774 int cmpArrays = Normalizer.compare( 1775 s1.toCharArray(), 0, s1.length(), 1776 s2.toCharArray(), 0, s2.length(), 
options); 1777 assertEquals("compare strings == compare char arrays", cmpStrings, cmpArrays); 1778 return cmpStrings; 1779 } 1780 1781 // reference implementation of UnicodeString::caseCompare 1782 private int ref_case_compare(String s1, String s2, int options) { 1783 String t1, t2; 1784 1785 t1=s1; 1786 t2=s2; 1787 1788 t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1789 t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1790 1791 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1792 UTF16.StringComparator comp 1793 = new UTF16.StringComparator(true, false, 1794 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1795 return comp.compare(t1,t2); 1796 } else { 1797 return t1.compareTo(t2); 1798 } 1799 1800 } 1801 1802 // reduce an integer to -1/0/1 1803 private static int sign(int value) { 1804 if(value==0) { 1805 return 0; 1806 } else { 1807 return (value>>31)|1; 1808 } 1809 } 1810 private static String signString(int value) { 1811 if(value<0) { 1812 return "<0"; 1813 } else if(value==0) { 1814 return "=0"; 1815 } else /* value>0 */ { 1816 return ">0"; 1817 } 1818 } 1819 // test Normalizer::compare and unorm_compare (thinly wrapped by the former) 1820 // by comparing it with its semantic equivalent 1821 // since we trust the pieces, this is sufficient 1822 1823 // test each string with itself and each other 1824 // each time with all options 1825 private String strings[]=new String[]{ 1826 // some cases from NormalizationTest.txt 1827 // 0..3 1828 "D\u031B\u0307\u0323", 1829 "\u1E0C\u031B\u0307", 1830 "D\u031B\u0323\u0307", 1831 "d\u031B\u0323\u0307", 1832 1833 // 4..6 1834 "\u00E4", 1835 "a\u0308", 1836 "A\u0308", 1837 1838 // Angstrom sign = A ring 1839 // 7..10 1840 "\u212B", 1841 "\u00C5", 1842 "A\u030A", 1843 "a\u030A", 1844 1845 // 11.14 1846 "a\u059A\u0316\u302A\u032Fb", 1847 "a\u302A\u0316\u032F\u059Ab", 1848 "a\u302A\u0316\u032F\u059Ab", 1849 "A\u059A\u0316\u302A\u032Fb", 1850 1851 // from 
ICU case folding tests 1852 // 15..20 1853 "A\u00df\u00b5\ufb03\\U0001040c\u0131", 1854 "ass\u03bcffi\\U00010434i", 1855 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff", 1856 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff", 1857 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff", 1858 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd", 1859 1860 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold 1861 // vs. U+10000 at bottom - code point order 1862 // 21..22 1863 "\ud800\ud800\udc01", 1864 "\ud800\udc00", 1865 1866 // other code point order tests from ustrtest.cpp 1867 // 23..31 1868 "\u20ac\ud801", 1869 "\u20ac\ud800\udc00", 1870 "\ud800", 1871 "\ud800\uff61", 1872 "\udfff", 1873 "\uff61\udfff", 1874 "\uff61\ud800\udc02", 1875 "\ud800\udc02", 1876 "\ud84d\udc56", 1877 1878 // long strings, see cnormtst.c/TestNormCoverage() 1879 // equivalent if case-insensitive 1880 // 32..33 1881 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1882 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1883 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1884 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1885 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1886 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1887 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1888 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1889 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1890 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1891 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1892 "d\u031B\u0307\u0323", 1893 1894 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1895 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1896 
"\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1897 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1898 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1899 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1900 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1901 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1902 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1903 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1904 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1905 "\u1E0C\u031B\u0307", 1906 1907 // some strings that may make a difference whether the compare function 1908 // case-folds or decomposes first 1909 // 34..41 1910 "\u0360\u0345\u0334", 1911 "\u0360\u03b9\u0334", 1912 1913 "\u0360\u1f80\u0334", 1914 "\u0360\u03b1\u0313\u03b9\u0334", 1915 1916 "\u0360\u1ffc\u0334", 1917 "\u0360\u03c9\u03b9\u0334", 1918 1919 "a\u0360\u0345\u0360\u0345b", 1920 "a\u0345\u0360\u0345\u0360b", 1921 1922 // interesting cases for canonical caseless match with turkic i handling 1923 // 42..43 1924 "\u00cc", 1925 "\u0069\u0300", 1926 1927 // strings with post-Unicode 3.2 normalization or normalization corrections 1928 // 44..45 1929 "\u00e4\u193b\\U0002f868", 1930 "\u0061\u193b\u0308\u36fc", 1931 1932 1933 }; 1934 1935 // all combinations of options 1936 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions 1937 final class Temp { 1938 int options; 1939 String name; 1940 public Temp(int opt,String str){ 1941 options =opt; 1942 name = str; 1943 } 1944 1945 } 1946 // set UNORM_UNICODE_3_2 in one additional combination 1947 1948 private Temp[] opt = new Temp[]{ 1949 new Temp(0,"default"), 1950 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ), 1951 new 
Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ), 1952 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ), 1953 new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"), 1954 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"), 1955 new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2") 1956 }; 1957 1958 1959 @Test 1960 public void TestCompareDebug(){ 1961 1962 String[] s = new String[100]; // at least as many items as in strings[] ! 1963 1964 1965 int i, j, k, count=strings.length; 1966 int result, refResult; 1967 1968 // create the UnicodeStrings 1969 for(i=0; i<count; ++i) { 1970 s[i]=Utility.unescape(strings[i]); 1971 } 1972 UTF16.StringComparator comp = new UTF16.StringComparator(true, false, 1973 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1974 // test them each with each other 1975 1976 i = 42; 1977 j = 43; 1978 k = 2; 1979 // test Normalizer::compare 1980 result=norm_compare(s[i], s[j], opt[k].options); 1981 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 1982 if(sign(result)!=sign(refResult)) { 1983 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 1984 } 1985 1986 // test UnicodeString::caseCompare - same internal implementation function 1987 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 1988 // result=s[i]. 
(s[j], opt[k].options); 1989 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 1990 { 1991 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 1992 } 1993 else { 1994 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 1995 } 1996 1997 result=comp.compare(s[i],s[j]); 1998 refResult=ref_case_compare(s[i], s[j], opt[k].options); 1999 if(sign(result)!=sign(refResult)) { 2000 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2001 } 2002 } 2003 String value1 = "\u00dater\u00fd"; 2004 String value2 = "\u00fater\u00fd"; 2005 if(Normalizer.compare(value1,value2,0)!=0){ 2006 if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){ 2007 2008 } 2009 } 2010 } 2011 2012 @Test 2013 public void TestCompare() { 2014 2015 String[] s = new String[100]; // at least as many items as in strings[] ! 2016 2017 int i, j, k, count=strings.length; 2018 int result, refResult; 2019 2020 // create the UnicodeStrings 2021 for(i=0; i<count; ++i) { 2022 s[i]=Utility.unescape(strings[i]); 2023 } 2024 UTF16.StringComparator comp = new UTF16.StringComparator(); 2025 // test them each with each other 2026 for(i=0; i<count; ++i) { 2027 for(j=i; j<count; ++j) { 2028 for(k=0; k<opt.length; ++k) { 2029 // test Normalizer::compare 2030 result=norm_compare(s[i], s[j], opt[k].options); 2031 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 2032 if(sign(result)!=sign(refResult)) { 2033 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2034 } 2035 2036 // test UnicodeString::caseCompare - same internal implementation function 2037 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 2038 // result=s[i]. 
(s[j], opt[k].options); 2039 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 2040 { 2041 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 2042 } 2043 else { 2044 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2045 } 2046 2047 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 2048 // result=comp.caseCompare(s[i],s[j], opt[k].options); 2049 result=comp.compare(s[i],s[j]); 2050 refResult=ref_case_compare(s[i], s[j], opt[k].options); 2051 if(sign(result)!=sign(refResult)) { 2052 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 2053 } 2054 } 2055 } 2056 } 2057 } 2058 2059 // test cases with i and I to make sure Turkic works 2060 char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 }; 2061 UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet(); 2062 Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl; 2063 nfcImpl.ensureCanonIterData(); 2064 2065 String s1, s2; 2066 2067 // collect all sets into one for contiguous output 2068 for(i=0; i<iI.length; ++i) { 2069 if(nfcImpl.getCanonStartSet(iI[i], iSet)) { 2070 set.addAll(iSet); 2071 } 2072 } 2073 2074 // test all of these precomposed characters 2075 Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance(); 2076 UnicodeSetIterator it = new UnicodeSetIterator(set); 2077 int c; 2078 while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) { 2079 s1 = UTF16.valueOf(c); 2080 s2 = nfcNorm2.getDecomposition(c); 2081 for(k=0; k<opt.length; ++k) { 2082 // test Normalizer::compare 2083 2084 result= norm_compare(s1, s2, opt[k].options); 2085 refResult=ref_norm_compare(s1, s2, opt[k].options); 2086 if(sign(result)!=sign(refResult)) { 2087 errln("Normalizer.compare(U+"+hex(c)+" with its NFD, "+opt[k].name+")" 2088 + signString(result)+" should be "+signString(refResult)); 2089 } 2090 2091 // test UnicodeString::caseCompare - same internal 
implementation function 2092 if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) { 2093 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 2094 { 2095 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 2096 } 2097 else { 2098 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2099 } 2100 2101 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 2102 2103 result=comp.compare(s1,s2); 2104 refResult=ref_case_compare(s1, s2, opt[k].options); 2105 if(sign(result)!=sign(refResult)) { 2106 errln("UTF16.compare(U+"+hex(c)+" with its NFD, " 2107 +opt[k].name+")"+signString(result) +" should be "+signString(refResult)); 2108 } 2109 } 2110 } 2111 } 2112 2113 // test getDecomposition() for some characters that do not decompose 2114 if( nfcNorm2.getDecomposition(0x20)!=null || 2115 nfcNorm2.getDecomposition(0x4e00)!=null || 2116 nfcNorm2.getDecomposition(0x20002)!=null 2117 ) { 2118 errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions"); 2119 } 2120 2121 // test getRawDecomposition() for some characters that do not decompose 2122 if( nfcNorm2.getRawDecomposition(0x20)!=null || 2123 nfcNorm2.getRawDecomposition(0x4e00)!=null || 2124 nfcNorm2.getRawDecomposition(0x20002)!=null 2125 ) { 2126 errln("getRawDecomposition() returns TRUE for characters which do not have decompositions"); 2127 } 2128 2129 // test composePair() for some pairs of characters that do not compose 2130 if( nfcNorm2.composePair(0x20, 0x301)>=0 || 2131 nfcNorm2.composePair(0x61, 0x305)>=0 || 2132 nfcNorm2.composePair(0x1100, 0x1160)>=0 || 2133 nfcNorm2.composePair(0xac00, 0x11a7)>=0 2134 ) { 2135 errln("NFC.composePair() incorrectly composes some pairs of characters"); 2136 } 2137 2138 // test FilteredNormalizer2.getDecomposition() 2139 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]"); 2140 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2141 
if(fn2.getDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getDecomposition(0x100))) { 2142 errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed"); 2143 } 2144 2145 // test FilteredNormalizer2.getRawDecomposition() 2146 if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) { 2147 errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed"); 2148 } 2149 2150 // test FilteredNormalizer2::composePair() 2151 if( 0x100!=fn2.composePair(0x41, 0x304) || 2152 fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08 2153 ) { 2154 errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed"); 2155 } 2156 } 2157 2158 // verify that case-folding does not un-FCD strings 2159 int countFoldFCDExceptions(int foldingOptions) { 2160 String s, d; 2161 int c; 2162 int count; 2163 int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC; 2164 Normalizer.QuickCheckResult qcResult; 2165 int category; 2166 boolean isNFD; 2167 2168 2169 logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions)); 2170 2171 count=0; 2172 for(c=0; c<=0x10ffff; ++c) { 2173 category=UCharacter.getType(c); 2174 if(category==UCharacterCategory.UNASSIGNED) { 2175 continue; // skip unassigned code points 2176 } 2177 if(c==0xac00) { 2178 c=0xd7a3; // skip Hangul - no case folding there 2179 continue; 2180 } 2181 // skip Han blocks - no case folding there either 2182 if(c==0x3400) { 2183 c=0x4db5; 2184 continue; 2185 } 2186 if(c==0x4e00) { 2187 c=0x9fa5; 2188 continue; 2189 } 2190 if(c==0x20000) { 2191 c=0x2a6d6; 2192 continue; 2193 } 2194 2195 s= UTF16.valueOf(c); 2196 2197 // get leading and trailing cc for c 2198 d= Normalizer.decompose(s,false); 2199 isNFD= s==d; 2200 cc=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2201 trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2202 2203 // get leading and trailing cc for the case-folding of c 2204 UCharacter.foldCase(s,(foldingOptions==0)); 2205 d = 
Normalizer.decompose(s, false); 2206 foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2207 foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2208 2209 qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0); 2210 2211 2212 // bad: 2213 // - character maps to empty string: adjacent characters may then need reordering 2214 // - folding has different leading/trailing cc's, and they don't become just 0 2215 // - folding itself is not FCD 2216 if( qcResult!=Normalizer.YES || 2217 s.length()==0 || 2218 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0) 2219 ) { 2220 ++count; 2221 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2222 //errln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult); 2223 continue; 2224 } 2225 2226 // also bad: 2227 // if a code point is in NFD but its case folding is not, then 2228 // unorm_compare will also fail 2229 if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) { 2230 ++count; 2231 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2232 } 2233 } 2234 2235 logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" ); 2236 return count; 2237 } 2238 2239 @Test 2240 public void TestFindFoldFCDExceptions() { 2241 int count; 2242 2243 count=countFoldFCDExceptions(0); 2244 count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I); 2245 if(count>0) { 2246 //* 2247 //* If case-folding un-FCDs any strings, then unorm_compare() must be 2248 //* re-implemented. 2249 //* It currently assumes that one can check for FCD then case-fold 2250 //* and then still have FCD strings for raw decomposition without reordering. 
2251 //* 2252 errln("error: There are "+count+" code points for which case-folding"+ 2253 " may un-FCD a string for all folding options.\n See comment"+ 2254 " in BasicNormalizerTest::FindFoldFCDExceptions()!"); 2255 } 2256 } 2257 2258 @Test 2259 public void TestCombiningMarks(){ 2260 String src = "\u0f71\u0f72\u0f73\u0f74\u0f75"; 2261 String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74"; 2262 String result = Normalizer.decompose(src,false); 2263 if(!expected.equals(result)){ 2264 errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result)); 2265 } 2266 } 2267 2268 /* 2269 * Re-enable this test when UTC fixes UAX 21 2270 @Test 2271 public void TestUAX21Failure(){ 2272 final String[][] cases = new String[][]{ 2273 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2274 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2275 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2276 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2277 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"}, 2278 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"}, 2279 }; 2280 for(int i = 0; i< cases.length; i++){ 2281 String s1 =cases[0][0]; 2282 String s2 = cases[0][1]; 2283 if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare 2284 && 2285 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){ 2286 errln("Normalizer.compare() failed for s1: " 2287 + Utility.hex(s1) +" s2: " + Utility.hex(s2)); 2288 } 2289 } 2290 } 2291 */ 2292 @Test 2293 public void TestFCNFKCClosure() { 2294 final class TestStruct{ 2295 int c; 2296 String s; 2297 TestStruct(int cp, String src){ 2298 c=cp; 2299 s=src; 2300 } 2301 } 2302 2303 TestStruct[] tests= new TestStruct[]{ 2304 new TestStruct( 0x00C4, "" ), 2305 new TestStruct( 0x00E4, "" ), 2306 new 
TestStruct( 0x037A, "\u0020\u03B9" ), 2307 new TestStruct( 0x03D2, "\u03C5" ), 2308 new TestStruct( 0x20A8, "\u0072\u0073" ) , 2309 new TestStruct( 0x210B, "\u0068" ), 2310 new TestStruct( 0x210C, "\u0068" ), 2311 new TestStruct( 0x2121, "\u0074\u0065\u006C" ), 2312 new TestStruct( 0x2122, "\u0074\u006D" ), 2313 new TestStruct( 0x2128, "\u007A" ), 2314 new TestStruct( 0x1D5DB,"\u0068" ), 2315 new TestStruct( 0x1D5ED,"\u007A" ), 2316 new TestStruct( 0x0061, "" ) 2317 }; 2318 2319 2320 for(int i = 0; i < tests.length; ++ i) { 2321 String result=Normalizer.getFC_NFKC_Closure(tests[i].c); 2322 if(!result.equals(new String(tests[i].s))) { 2323 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong"); 2324 } 2325 } 2326 2327 /* error handling */ 2328 2329 int length=Normalizer.getFC_NFKC_Closure(0x5c, null); 2330 if(length!=0){ 2331 errln("getFC_NFKC_Closure did not perform error handling correctly"); 2332 } 2333 } 2334 @Test 2335 public void TestBugJ2324(){ 2336 /* String[] input = new String[]{ 2337 //"\u30FD\u3099", 2338 "\u30FA\u309A", 2339 "\u30FB\u309A", 2340 "\u30FC\u309A", 2341 "\u30FE\u309A", 2342 "\u30FD\u309A", 2343 2344 };*/ 2345 String troublesome = "\u309A"; 2346 for(int i=0x3000; i<0x3100;i++){ 2347 String input = ((char)i)+troublesome; 2348 try{ 2349 /* String result =*/ Normalizer.compose(input,false); 2350 }catch(IndexOutOfBoundsException e){ 2351 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString()); 2352 } 2353 } 2354 2355 } 2356 2357 static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5; 2358 2359 private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) { 2360 skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false); 2361 skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2362 skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false); 2363 skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2364 2365 // Remove from the NFC 
and NFKC sets all those characters that change 2366 // when a back-combining character is added. 2367 // First, get all of the back-combining characters and their combining classes. 2368 UnicodeSet combineBack=new UnicodeSet("[:NFC_QC=Maybe:]"); 2369 int numCombineBack=combineBack.size(); 2370 int[] combineBackCharsAndCc=new int[numCombineBack*2]; 2371 UnicodeSetIterator iter=new UnicodeSetIterator(combineBack); 2372 for(int i=0; i<numCombineBack; ++i) { 2373 iter.next(); 2374 int c=iter.codepoint; 2375 combineBackCharsAndCc[2*i]=c; 2376 combineBackCharsAndCc[2*i+1]=UCharacter.getCombiningClass(c); 2377 } 2378 2379 // We need not look at control codes, Han characters nor Hangul LVT syllables because they 2380 // do not combine forward. LV syllables are already removed. 2381 UnicodeSet notInteresting=new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]"); 2382 UnicodeSet unsure=((UnicodeSet)(skipSets[C].clone())).removeAll(notInteresting); 2383 // System.out.format("unsure.size()=%d\n", unsure.size()); 2384 2385 // For each character about which we are unsure, see if it changes when we add 2386 // one of the back-combining characters. 2387 Normalizer2 norm2=Normalizer2.getNFCInstance(); 2388 StringBuilder s=new StringBuilder(); 2389 iter.reset(unsure); 2390 while(iter.next()) { 2391 int c=iter.codepoint; 2392 s.delete(0, 0x7fffffff).appendCodePoint(c); 2393 int cLength=s.length(); 2394 int tccc=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 2395 for(int i=0; i<numCombineBack; ++i) { 2396 // If c's decomposition ends with a character with non-zero combining class, then 2397 // c can only change if it combines with a character with a non-zero combining class. 
2398 int cc2=combineBackCharsAndCc[2*i+1]; 2399 if(tccc==0 || cc2!=0) { 2400 int c2=combineBackCharsAndCc[2*i]; 2401 s.appendCodePoint(c2); 2402 if(!norm2.isNormalized(s)) { 2403 // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2); 2404 skipSets[C].remove(c); 2405 skipSets[KC].remove(c); 2406 break; 2407 } 2408 s.delete(cLength, 0x7fffffff); 2409 } 2410 } 2411 } 2412 return skipSets; 2413 } 2414 2415 @Test 2416 public void TestSkippable() { 2417 UnicodeSet[] skipSets = new UnicodeSet[] { 2418 new UnicodeSet(), //NFD 2419 new UnicodeSet(), //NFC 2420 new UnicodeSet(), //NFKD 2421 new UnicodeSet() //NFKC 2422 }; 2423 UnicodeSet[] expectSets = new UnicodeSet[] { 2424 new UnicodeSet(), 2425 new UnicodeSet(), 2426 new UnicodeSet(), 2427 new UnicodeSet() 2428 }; 2429 StringBuilder s, pattern; 2430 2431 // build NF*Skippable sets from runtime data 2432 skipSets[D].applyPattern("[:NFD_Inert:]"); 2433 skipSets[C].applyPattern("[:NFC_Inert:]"); 2434 skipSets[KD].applyPattern("[:NFKD_Inert:]"); 2435 skipSets[KC].applyPattern("[:NFKC_Inert:]"); 2436 2437 expectSets = initSkippables(expectSets); 2438 if(expectSets[D].contains(0x0350)){ 2439 errln("expectSets[D] contains 0x0350"); 2440 } 2441 for(int i=0; i<expectSets.length; ++i) { 2442 if(!skipSets[i].equals(expectSets[i])) { 2443 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"); 2444 // Note: This used to depend on hardcoded UnicodeSet patterns generated by 2445 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by 2446 // running com.ibm.text.UCD.Main with the option NFSkippable. 2447 // Since ICU 4.6/Unicode 6, we are generating the 2448 // expectSets ourselves in initSkippables(). 
2449 2450 s=new StringBuilder(); 2451 2452 s.append("\n\nskip= "); 2453 s.append(skipSets[i].toPattern(true)); 2454 s.append("\n\n"); 2455 2456 s.append("skip-expect="); 2457 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true)); 2458 s.append(pattern); 2459 2460 pattern.delete(0,pattern.length()); 2461 s.append("\n\nexpect-skip="); 2462 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true)); 2463 s.append(pattern); 2464 s.append("\n\n"); 2465 2466 pattern.delete(0,pattern.length()); 2467 s.append("\n\nintersection(expect,skip)="); 2468 UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]); 2469 pattern = new StringBuilder(intersection.toPattern(true)); 2470 s.append(pattern); 2471 // Special: test coverage for append(char). 2472 s.append('\n'); 2473 s.append('\n'); 2474 2475 errln(s.toString()); 2476 } 2477 } 2478 } 2479 2480 @Test 2481 public void TestBugJ2068(){ 2482 String sample = "The quick brown fox jumped over the lazy dog"; 2483 UCharacterIterator text = UCharacterIterator.getInstance(sample); 2484 Normalizer norm = new Normalizer(text,Normalizer.NFC,0); 2485 text.setIndex(4); 2486 if(text.current() == norm.current()){ 2487 errln("Normalizer is not cloning the UCharacterIterator"); 2488 } 2489 } 2490 @Test 2491 public void TestGetCombiningClass(){ 2492 for(int i=0;i<0x10FFFF;i++){ 2493 int cc = UCharacter.getCombiningClass(i); 2494 if(0xD800<= i && i<=0xDFFF && cc >0 ){ 2495 cc = UCharacter.getCombiningClass(i); 2496 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8)); 2497 } 2498 } 2499 } 2500 2501 @Test 2502 public void TestSerializedSet(){ 2503 USerializedSet sset=new USerializedSet(); 2504 UnicodeSet set = new UnicodeSet(); 2505 int start, end; 2506 2507 char[] serialized = { 2508 0x8007, // length 2509 3, // bmpLength 2510 0xc0, 0xfe, 0xfffc, 2511 1, 9, 0x10, 0xfffc 2512 }; 2513 sset.getSet(serialized, 0); 2514 2515 
// collect all sets into one for contiguous output 2516 int[] startEnd = new int[2]; 2517 int count=sset.countRanges(); 2518 for(int j=0; j<count; ++j) { 2519 sset.getRange(j, startEnd); 2520 set.add(startEnd[0], startEnd[1]); 2521 } 2522 2523 // test all of these characters 2524 UnicodeSetIterator it = new UnicodeSetIterator(set); 2525 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) { 2526 start=it.codepoint; 2527 end=it.codepointEnd; 2528 while(start<=end) { 2529 if(!sset.contains(start)){ 2530 errln("USerializedSet.contains failed for "+Utility.hex(start,8)); 2531 } 2532 ++start; 2533 } 2534 } 2535 } 2536 2537 @Test 2538 public void TestReturnFailure(){ 2539 char[] term = {'r','\u00e9','s','u','m','\u00e9' }; 2540 char[] decomposed_term = new char[10 + term.length + 2]; 2541 int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0); 2542 int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0); 2543 if(rc!=rc1){ 2544 errln("Normalizer decompose did not return correct length"); 2545 } 2546 } 2547 2548 private final static class TestCompositionCase { 2549 public Normalizer.Mode mode; 2550 public int options; 2551 public String input, expect; 2552 TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) { 2553 this.mode=mode; 2554 this.options=options; 2555 this.input=input; 2556 this.expect=expect; 2557 } 2558 } 2559 2560 @Test 2561 public void TestComposition() { 2562 final TestCompositionCase cases[]=new TestCompositionCase[]{ 2563 /* 2564 * special cases for UAX #15 bug 2565 * see Unicode Corrigendum #5: Normalization Idempotency 2566 * at http://unicode.org/versions/corrigendum5.html 2567 * (was Public Review Issue #29) 2568 */ 2569 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"), 2570 new TestCompositionCase(Normalizer.NFC, 0, 
"\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"), 2571 new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"), 2572 new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"), 2573 2574 /* TODO: add test cases for UNORM_FCC here (j2151) */ 2575 }; 2576 2577 String output; 2578 int i; 2579 2580 for(i=0; i<cases.length; ++i) { 2581 output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options); 2582 if(!output.equals(cases[i].expect)) { 2583 errln("unexpected result for case "+i); 2584 } 2585 } 2586 } 2587 2588 @Test 2589 public void TestGetDecomposition() { 2590 Normalizer2 n2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2591 String decomp=n2.getDecomposition(0x20); 2592 assertEquals("fcc.getDecomposition(space) failed", null, decomp); 2593 decomp=n2.getDecomposition(0xe4); 2594 assertEquals("fcc.getDecomposition(a-umlaut) failed", "a\u0308", decomp); 2595 decomp=n2.getDecomposition(0xac01); 2596 assertEquals("fcc.getDecomposition(Hangul syllable U+AC01) failed", "\u1100\u1161\u11a8", decomp); 2597 } 2598 2599 @Test 2600 public void TestGetRawDecomposition() { 2601 Normalizer2 n2=Normalizer2.getNFKCInstance(); 2602 /* 2603 * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values, 2604 * without recursive decomposition. 
2605 */ 2606 2607 String decomp=n2.getRawDecomposition(0x20); 2608 assertEquals("nfkc.getRawDecomposition(space) failed", null, decomp); 2609 decomp=n2.getRawDecomposition(0xe4); 2610 assertEquals("nfkc.getRawDecomposition(a-umlaut) failed", "a\u0308", decomp); 2611 /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */ 2612 decomp=n2.getRawDecomposition(0x1e08); 2613 assertEquals("nfkc.getRawDecomposition(c-cedilla-acute) failed", "\u00c7\u0301", decomp); 2614 /* U+212B ANGSTROM SIGN */ 2615 decomp=n2.getRawDecomposition(0x212b); 2616 assertEquals("nfkc.getRawDecomposition(angstrom sign) failed", "\u00c5", decomp); 2617 decomp=n2.getRawDecomposition(0xac00); 2618 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC00) failed", "\u1100\u1161", decomp); 2619 /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */ 2620 decomp=n2.getRawDecomposition(0xac01); 2621 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC01) failed", "\uac00\u11a8", decomp); 2622 } 2623 2624 @Test 2625 public void TestCustomComp() { 2626 String [][] pairs={ 2627 { "\\uD801\\uE000\\uDFFE", "" }, 2628 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2629 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2630 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" }, 2631 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2632 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2633 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2634 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2635 }; 2636 Normalizer2 customNorm2; 2637 customNorm2= 2638 Normalizer2.getInstance( 2639 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"), 2640 "testnorm", 2641 Normalizer2.Mode.COMPOSE); 2642 for(int i=0; i<pairs.length; ++i) { 2643 String[] pair=pairs[i]; 2644 String input=Utility.unescape(pair[0]); 2645 String 
expected=Utility.unescape(pair[1]); 2646 String result=customNorm2.normalize(input); 2647 if(!result.equals(expected)) { 2648 errln("custom compose Normalizer2 did not normalize input "+i+" as expected"); 2649 } 2650 } 2651 } 2652 2653 @Test 2654 public void TestCustomFCC() { 2655 String[][] pairs={ 2656 { "\\uD801\\uE000\\uDFFE", "" }, 2657 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2658 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2659 // The following expected result is different from CustomComp 2660 // because of only-contiguous composition. 2661 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" }, 2662 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2663 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2664 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2665 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2666 }; 2667 Normalizer2 customNorm2; 2668 customNorm2= 2669 Normalizer2.getInstance( 2670 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"), 2671 "testnorm", 2672 Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2673 for(int i=0; i<pairs.length; ++i) { 2674 String[] pair=pairs[i]; 2675 String input=Utility.unescape(pair[0]); 2676 String expected=Utility.unescape(pair[1]); 2677 String result=customNorm2.normalize(input); 2678 if(!result.equals(expected)) { 2679 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected"); 2680 } 2681 } 2682 } 2683 2684 @Test 2685 public void TestCanonIterData() { 2686 // For now, just a regression test. 2687 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData(); 2688 // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character 2689 // in some decomposition mappings where there is a composition exclusion. 
2690 // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0) 2691 // but it is not a segment starter because it occurs in a decomposition mapping. 2692 if(impl.isCanonSegmentStarter(0xfb5)) { 2693 errln("isCanonSegmentStarter(U+0fb5)=true is wrong"); 2694 } 2695 // For [:Segment_Starter:] to work right, not just the property function has to work right, 2696 // UnicodeSet also needs a correct range starts set. 2697 UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze(); 2698 if(segStarters.contains(0xfb5)) { 2699 errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong"); 2700 } 2701 // Try characters up to Kana and miscellaneous CJK but below Han (for expediency). 2702 for(int c=0; c<=0x33ff; ++c) { 2703 boolean isStarter=impl.isCanonSegmentStarter(c); 2704 boolean isContained=segStarters.contains(c); 2705 if(isStarter!=isContained) { 2706 errln(String.format( 2707 "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " + 2708 "[:Segment_Starter:].contains(same)", 2709 c, isStarter)); 2710 } 2711 } 2712 } 2713 2714 @Test 2715 public void TestFilteredNormalizer2() { 2716 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2717 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2718 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2719 int c; 2720 for(c=0; c<=0x3ff; ++c) { 2721 int expectedCC= filter.contains(c) ? nfcNorm2.getCombiningClass(c) : 0; 2722 int cc=fn2.getCombiningClass(c); 2723 assertEquals( 2724 "FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+"+hex(c)+ 2725 ")==filtered NFC.getCC()", 2726 expectedCC, cc); 2727 } 2728 2729 // More coverage. 
2730 StringBuilder sb=new StringBuilder(); 2731 assertEquals("filtered normalize()", "ää\u0304", 2732 fn2.normalize("a\u0308ä\u0304", (Appendable)sb).toString()); 2733 assertTrue("filtered hasBoundaryAfter()", fn2.hasBoundaryAfter('ä')); 2734 assertTrue("filtered isInert()", fn2.isInert(0x0313)); 2735 } 2736 2737 @Test 2738 public void TestFilteredAppend() { 2739 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2740 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2741 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2742 2743 // Append two strings that each contain a character outside the filter set. 2744 StringBuilder sb = new StringBuilder("a\u0313a"); 2745 String second = "\u0301\u0313"; 2746 assertEquals("append()", "a\u0313á\u0313", fn2.append(sb, second).toString()); 2747 2748 // Same, and also normalize the second string. 2749 sb.replace(0, 0x7fffffff, "a\u0313a"); 2750 assertEquals( 2751 "normalizeSecondAndAppend()", 2752 "a\u0313á\u0313", fn2.normalizeSecondAndAppend(sb, second).toString()); 2753 2754 // Normalizer2.normalize(String) uses spanQuickCheckYes() and normalizeSecondAndAppend(). 
2755 assertEquals("normalize()", "a\u0313á\u0313", fn2.normalize("a\u0313a\u0301\u0313")); 2756 } 2757 2758 @Test 2759 public void TestGetEasyToUseInstance() { 2760 // Test input string: 2761 // U+00A0 -> <noBreak> 0020 2762 // U+00C7 0301 = 1E08 = 0043 0327 0301 2763 String in="\u00A0\u00C7\u0301"; 2764 Normalizer2 n2=Normalizer2.getNFCInstance(); 2765 String out=n2.normalize(in); 2766 assertEquals( 2767 "getNFCInstance() did not return an NFC instance " + 2768 "(normalizes to " + prettify(out) + ')', 2769 "\u00A0\u1E08", out); 2770 2771 n2=Normalizer2.getNFDInstance(); 2772 out=n2.normalize(in); 2773 assertEquals( 2774 "getNFDInstance() did not return an NFD instance " + 2775 "(normalizes to " + prettify(out) + ')', 2776 "\u00A0C\u0327\u0301", out); 2777 2778 n2=Normalizer2.getNFKCInstance(); 2779 out=n2.normalize(in); 2780 assertEquals( 2781 "getNFKCInstance() did not return an NFKC instance " + 2782 "(normalizes to " + prettify(out) + ')', 2783 " \u1E08", out); 2784 2785 n2=Normalizer2.getNFKDInstance(); 2786 out=n2.normalize(in); 2787 assertEquals( 2788 "getNFKDInstance() did not return an NFKD instance " + 2789 "(normalizes to " + prettify(out) + ')', 2790 " C\u0327\u0301", out); 2791 2792 n2=Normalizer2.getNFKCCasefoldInstance(); 2793 out=n2.normalize(in); 2794 assertEquals( 2795 "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance " + 2796 "(normalizes to " + prettify(out) + ')', 2797 " \u1E09", out); 2798 } 2799 2800 @Test 2801 public void TestNFC() { 2802 // Coverage tests. 2803 Normalizer2 nfc = Normalizer2.getNFCInstance(); 2804 assertTrue("nfc.hasBoundaryAfter(space)", nfc.hasBoundaryAfter(' ')); 2805 assertFalse("nfc.hasBoundaryAfter(ä)", nfc.hasBoundaryAfter('ä')); 2806 } 2807 2808 @Test 2809 public void TestNFD() { 2810 // Coverage tests. 
2811 Normalizer2 nfd = Normalizer2.getNFDInstance(); 2812 assertTrue("nfd.hasBoundaryAfter(space)", nfd.hasBoundaryAfter(' ')); 2813 assertFalse("nfd.hasBoundaryAfter(ä)", nfd.hasBoundaryAfter('ä')); 2814 } 2815 2816 @Test 2817 public void TestFCD() { 2818 // Coverage tests. 2819 Normalizer2 fcd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.FCD); 2820 assertTrue("fcd.hasBoundaryAfter(space)", fcd.hasBoundaryAfter(' ')); 2821 assertFalse("fcd.hasBoundaryAfter(ä)", fcd.hasBoundaryAfter('ä')); 2822 assertTrue("fcd.isInert(space)", fcd.isInert(' ')); 2823 assertFalse("fcd.isInert(ä)", fcd.isInert('ä')); 2824 2825 // This implementation method is unreachable via public API. 2826 Norm2AllModes.FCDNormalizer2 impl = (Norm2AllModes.FCDNormalizer2)fcd; 2827 assertEquals("fcd impl.getQuickCheck(space)", 1, impl.getQuickCheck(' ')); 2828 assertEquals("fcd impl.getQuickCheck(ä)", 0, impl.getQuickCheck('ä')); 2829 } 2830 2831 @Test 2832 public void TestNoneNormalizer() { 2833 // Use the deprecated Mode Normalizer.NONE for coverage of the internal NoopNormalizer2 2834 // as far as its methods are reachable that way. 2835 assertEquals("NONE.concatenate()", "ä\u0327", 2836 Normalizer.concatenate("ä", "\u0327", Normalizer.NONE, 0)); 2837 assertTrue("NONE.isNormalized()", Normalizer.isNormalized("ä\u0327", Normalizer.NONE, 0)); 2838 } 2839 2840 @Test 2841 public void TestNoopNormalizer2() { 2842 // Use the internal class directly for coverage of methods that are not publicly reachable. 
2843 Normalizer2 noop = Norm2AllModes.NOOP_NORMALIZER2; 2844 assertEquals("noop.normalizeSecondAndAppend()", "ä\u0327", 2845 noop.normalizeSecondAndAppend(new StringBuilder("ä"), "\u0327").toString()); 2846 assertEquals("noop.getDecomposition()", null, noop.getDecomposition('ä')); 2847 assertTrue("noop.hasBoundaryAfter()", noop.hasBoundaryAfter(0x0308)); 2848 assertTrue("noop.isInert()", noop.isInert(0x0308)); 2849 } 2850 2851 /* 2852 * This unit test covers two 'get' methods in class Normalizer2Impl. It only tests that 2853 * an object is returned. 2854 */ 2855 @Test 2856 public void TestGetsFromImpl() { 2857 Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl; 2858 assertNotEquals("getNormTrie() returns null", null, nfcImpl.getNormTrie()); 2859 assertNotEquals("getFCD16FromBelow180() returns null", null, 2860 nfcImpl.getFCD16FromBelow180(0)); 2861 } 2862 2863 /* 2864 * Abstract class Normalizer2 has non-abstract methods which are overwritten by 2865 * its derived classes. To test these methods a derived class is defined here. 
2866 */ 2867 public class TestNormalizer2 extends Normalizer2 { 2868 2869 public TestNormalizer2() {} 2870 @Override 2871 public StringBuilder normalize(CharSequence src, StringBuilder dest) { return null; } 2872 @Override 2873 public Appendable normalize(CharSequence src, Appendable dest) { return null; } 2874 @Override 2875 public StringBuilder normalizeSecondAndAppend( 2876 StringBuilder first, CharSequence second) { return null; } 2877 @Override 2878 public StringBuilder append(StringBuilder first, CharSequence second) { return null; } 2879 @Override 2880 public String getDecomposition(int c) { return null; } 2881 @Override 2882 public boolean isNormalized(CharSequence s) { return false; } 2883 @Override 2884 public Normalizer.QuickCheckResult quickCheck(CharSequence s) { return null; } 2885 @Override 2886 public int spanQuickCheckYes(CharSequence s) { return 0; } 2887 @Override 2888 public boolean hasBoundaryBefore(int c) { return false; } 2889 @Override 2890 public boolean hasBoundaryAfter(int c) { return false; } 2891 @Override 2892 public boolean isInert(int c) { return false; } 2893 } 2894 2895 final TestNormalizer2 tnorm2 = new TestNormalizer2(); 2896 @Test 2897 public void TestGetRawDecompositionBase() { 2898 int c = 'à'; 2899 assertEquals("Unexpected value returned from Normalizer2.getRawDecomposition()", 2900 null, tnorm2.getRawDecomposition(c)); 2901 } 2902 2903 @Test 2904 public void TestComposePairBase() { 2905 int a = 'a'; 2906 int b = '\u0300'; 2907 assertEquals("Unexpected value returned from Normalizer2.composePair()", 2908 -1, tnorm2.composePair(a, b)); 2909 } 2910 2911 @Test 2912 public void TestGetCombiningClassBase() { 2913 int c = '\u00e0'; 2914 assertEquals("Unexpected value returned from Normalizer2.getCombiningClass()", 2915 0, tnorm2.getCombiningClass(c)); 2916 } 2917} 2918