// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.dev.test.normalizer;

import java.io.BufferedReader;
import java.io.IOException;
import java.text.StringCharacterIterator;

import org.junit.Ignore;
import org.junit.Test;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * Runs the Unicode normalization conformance data (five-column
 * NormalizationTest.txt format) through the one-shot and the iterative
 * {@link Normalizer} APIs and checks the documented invariants.
 */
public class ConformanceTest extends TestFmwk {

    /** Iterative normalizer reused by every check; text, mode and options are reset per use. */
    Normalizer normalizer;

    public ConformanceTest() {
        // Doesn't matter what the string and mode are; we'll change
        // them later as needed.
        normalizer = new Normalizer("", Normalizer.NFC, 0);
    }

    // more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
    static String[] moreCases = {
        // Markus 2001aug30
        "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",

        // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
        "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
    };

    /**
     * Test the conformance of Normalizer to
     * http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
     * The data is read through {@code TestUtil.getDataReader} under the name
     * "unicode/NormalizationTest.txt".
     */
    @Test
    public void TestConformance() throws Exception {
        runConformance("unicode/NormalizationTest.txt", 0);
    }

    /**
     * Same as {@link #TestConformance()} but against the Unicode 3.2 data file,
     * with the {@code Normalizer.UNICODE_3_2} option set.
     */
    @Test
    public void TestConformance_3_2() throws Exception {
        runConformance("unicode/NormalizationTest-3.2.0.txt", Normalizer.UNICODE_3_2);
    }

    /**
     * Reads every data line of the given test file, followed by every entry of
     * {@link #moreCases}, and checks each one with
     * {@code checkConformance}. Empty lines and lines starting with '#' or '@'
     * are skipped, as are single-code-point lines in the range U+AC20..U+D73F.
     *
     * @param fileName data file name handed to {@code TestUtil.getDataReader}
     * @param options  Normalizer option bits applied to the checks
     * @throws IllegalArgumentException if the file cannot be read (the
     *         IOException is attached as the cause)
     */
    public void runConformance(String fileName, int options) throws Exception {
        String line = null;
        String[] fields = new String[5];
        StringBuffer buf = new StringBuffer();
        int passCount = 0;
        int failCount = 0;
        // NOTE(review): this set is maintained below but never read after the loop.
        UnicodeSet other = new UnicodeSet(0, 0x10ffff);
        // Index of the next extra case. Kept separate from the loop counter so that
        // every entry of moreCases is visited exactly once, whatever the file length.
        int extraIndex = 0;
        BufferedReader input = null;
        try {
            input = TestUtil.getDataReader(fileName);
            for (int count = 0;; ++count) {
                line = input.readLine();
                if (line == null) {
                    // File exhausted: read the extra test cases
                    if (extraIndex == moreCases.length) {
                        // all done
                        break;
                    }
                    line = moreCases[extraIndex++];
                }
                if (line.length() == 0) {
                    continue;
                }

                // Expect 5 columns of this format:
                // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>

                // Skip comments
                char first = line.charAt(0);
                if (first == '#' || first == '@') {
                    continue;
                }

                // Parse out the fields
                hexsplit(line, ';', fields, buf);

                // Remove a single code point from the "other" UnicodeSet
                if (fields[0].length() == UTF16.moveCodePointOffset(fields[0], 0, 1)) {
                    int c = UTF16.charAt(fields[0], 0);
                    if (0xac20 <= c && c <= 0xd73f) {
                        // not an exhaustive test run: skip most Hangul syllables
                        if (c == 0xac20) {
                            other.remove(0xac20, 0xd73f);
                        }
                        continue;
                    }
                    other.remove(c);
                }
                if (checkConformance(fields, line, options)) {
                    ++passCount;
                } else {
                    ++failCount;
                }
                if ((count % 1000) == 999) {
                    logln("Line " + (count + 1));
                }
            }
        } catch (IOException ex) {
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new IllegalArgumentException("Couldn't read file "
                    + ex.getClass().getName() + " " + ex.getMessage()
                    + " line = " + line,
                    ex);
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
        }

        if (failCount != 0) {
            errln("Total: " + failCount + " lines failed, " +
                  passCount + " lines passed");
        } else {
            logln("Total: " + passCount + " lines passed");
        }
    }

    /**
     * Verify the conformance of the given line of the Unicode
     * normalization (UTR 15) test suite file.  For each line,
     * there are five columns, corresponding to field[0]..field[4].
     *
     * The following invariants must be true for all conformant implementations
     *  c2 == NFC(c1) == NFC(c2) == NFC(c3)
     *  c3 == NFD(c1) == NFD(c2) == NFD(c3)
     *  c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
     *  c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
     *
     * @param field the 5 columns
     * @param line the source line from the test suite file
     * @param options Normalizer option bits to apply
     * @return true if the test passes
     */
    private boolean checkConformance(String[] field, String line, int options) throws Exception {
        boolean pass = true;
        StringBuffer buf = new StringBuffer(); // scratch

        for (int i = 0; i < 5; ++i) {
            String column = "(c" + (i + 1);
            if (i < 3) {
                // The canonical forms are only defined over c1..c3.
                pass &= checkForm("C", field[i], Normalizer.NFC, field[1], "c2!=C" + column, buf, options);
                pass &= checkForm("D", field[i], Normalizer.NFD, field[2], "c3!=D" + column, buf, options);
            }
            pass &= checkForm("KC", field[i], Normalizer.NFKC, field[3], "c4!=KC" + column, buf, options);
            pass &= checkForm("KD", field[i], Normalizer.NFKD, field[4], "c5!=KD" + column, buf, options);
        }

        // The expected columns must convert into each other; this does not
        // depend on the loop index, so it is checked once per line.
        cross(field[2] /*NFD String*/, field[1] /*NFC String*/, Normalizer.NFC);
        cross(field[1] /*NFC String*/, field[2] /*NFD String*/, Normalizer.NFD);
        cross(field[4] /*NFKD String*/, field[3] /*NFKC String*/, Normalizer.NFKC);
        cross(field[3] /*NFKC String*/, field[4] /*NFKD String*/, Normalizer.NFKD);

        compare(field[1], field[2]);
        compare(field[0], field[1]);
        compare(field[0], field[2]);

        // test quick checks
        if (Normalizer.NO == Normalizer.quickCheck(field[1], Normalizer.NFC, options)) {
            errln("Normalizer error: quickCheck(NFC(s), Normalizer.NFC) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.NFD, options)) {
            errln("Normalizer error: quickCheck(NFD(s), Normalizer.NFD) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[3], Normalizer.NFKC, options)) {
            errln("Normalizer error: quickCheck(NFKC(s), Normalizer.NFKC) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.NFKD, options)) {
            errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
            pass = false;
        }

        if (!Normalizer.isNormalized(field[1], Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
            pass = false;
        }
        if (!field[0].equals(field[1]) && Normalizer.isNormalized(field[0], Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(s, Normalizer.NFC) is TRUE");
            pass = false;
        }
        if (!Normalizer.isNormalized(field[3], Normalizer.NFKC, options)) {
            errln("Normalizer error: isNormalized(NFKC(s), Normalizer.NFKC) is false");
            pass = false;
        }
        if (!field[0].equals(field[3]) && Normalizer.isNormalized(field[0], Normalizer.NFKC, options)) {
            errln("Normalizer error: isNormalized(s, Normalizer.NFKC) is TRUE");
            pass = false;
        }
        // test api that takes a char[]
        if (!Normalizer.isNormalized(field[1].toCharArray(), 0, field[1].length(), Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
            pass = false;
        }
        // test api that takes a codepoint
        if (!Normalizer.isNormalized(UTF16.charAt(field[1], 0), Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
            pass = false;
        }

        // test FCD quick check and "makeFCD"
        String fcd = Normalizer.normalize(field[0], Normalizer.FCD);
        if (Normalizer.NO == Normalizer.quickCheck(fcd, Normalizer.FCD, options)) {
            errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
            pass = false;
        }
        // check FCD return length
        {
            char[] fcd2 = new char[fcd.length() * 2];
            char[] src = field[0].toCharArray();
            int fcdLen = Normalizer.normalize(src, 0, src.length, fcd2, fcd.length(), fcd2.length, Normalizer.FCD, 0);
            if (fcdLen != fcd.length()) {
                errln("makeFCD did not return the correct length");
            }
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.FCD, options)) {
            errln("Normalizer error: quickCheck(NFD(s), Normalizer.FCD) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.FCD, options)) {
            errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
            pass = false;
        }

        // Exercise the iterative API in FCD mode in both directions.
        // The results are not compared against anything.
        iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, +1, options);
        iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, -1, options);

        iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, +1, options);
        iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, -1, options);

        iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, +1, options);
        iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, -1, options);

        String nfdOfFcd = Normalizer.normalize(fcd, Normalizer.NFD);
        if (!nfdOfFcd.equals(field[2])) {
            errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
            pass = false;
        }

        if (!field[0].equals(field[2])) {
            // two strings that are canonically equivalent must test
            // equal under a canonical caseless match
            // see UAX #21 Case Mappings and Jitterbug 2021 and
            // Unicode Technical Committee meeting consensus 92-C31
            int rc = Normalizer.compare(field[0], field[2],
                    (options << Normalizer.COMPARE_NORM_OPTIONS_SHIFT) | Normalizer.COMPARE_IGNORE_CASE);
            if (rc != 0) {
                errln("Normalizer.compare(original, NFD, case-insensitive) returned " + rc + " instead of 0 for equal");
                pass = false;
            }
        }

        // Report the offending line last so that every check above can contribute.
        if (!pass) {
            errln("FAIL: " + line);
        }
        return pass;
    }

    /**
     * Normalizes {@code src} to one form through the one-shot API and through
     * the iterative API (String and StringCharacterIterator input, forward and
     * backward), comparing each result with {@code expected}.
     *
     * @param label    short form name used in failure messages, e.g. "KC"
     * @param src      string being normalized
     * @param mode     normalization form to apply
     * @param expected expected result
     * @param msg      description of the invariant being tested
     * @param buf      scratch buffer
     * @param options  Normalizer option bits
     * @return true if all five results equal {@code expected}
     */
    private boolean checkForm(String label, String src, Normalizer.Mode mode, String expected,
                              String msg, StringBuffer buf, int options) throws Exception {
        boolean pass = true;

        String out = Normalizer.normalize(src, mode, options);
        pass &= assertEqual(label, src, out, expected, msg);

        out = iterativeNorm(src, mode, buf, +1, options);
        pass &= assertEqual(label + "(+1)", src, out, expected, msg);

        out = iterativeNorm(src, mode, buf, -1, options);
        pass &= assertEqual(label + "(-1)", src, out, expected, msg);

        out = iterativeNorm(new StringCharacterIterator(src), mode, buf, +1, options);
        pass &= assertEqual(label + "(+1)", src, out, expected, msg);

        out = iterativeNorm(new StringCharacterIterator(src), mode, buf, -1, options);
        pass &= assertEqual(label + "(-1)", src, out, expected, msg);

        return pass;
    }

    // two strings that are canonically equivalent must test
    // equal under a canonical caseless match
    // see UAX #21 Case Mappings and Jitterbug 2021 and
    // Unicode Technical Committee meeting consensus 92-C31
    /**
     * Exercises the {@code Normalizer.compare} overloads taking code points,
     * a code point and a String, and char arrays, chosen by the UTF-16
     * lengths of the two arguments. A non-zero result is reported via errln.
     */
    private void compare(String s1, String s2) {
        if (s1.length() == 1 && s2.length() == 1) {
            if (Normalizer.compare(UTF16.charAt(s1, 0), UTF16.charAt(s2, 0), Normalizer.COMPARE_IGNORE_CASE) != 0) {
                errln("Normalizer.compare(int,int) failed for s1: "
                        + Utility.hex(s1) + " s2: " + Utility.hex(s2));
            }
        }
        if (s1.length() == 1 && s2.length() > 1) {
            if (Normalizer.compare(UTF16.charAt(s1, 0), s2, Normalizer.COMPARE_IGNORE_CASE) != 0) {
                errln("Normalizer.compare(int,String) failed for s1: "
                        + Utility.hex(s1) + " s2: " + Utility.hex(s2));
            }
        }
        if (s1.length() > 1 && s2.length() > 1) {
            // TODO: Re-enable this tests after UTC fixes UAX 21
            if (Normalizer.compare(s1.toCharArray(), s2.toCharArray(), Normalizer.COMPARE_IGNORE_CASE) != 0) {
                errln("Normalizer.compare(char[],char[]) failed for s1: "
                        + Utility.hex(s1) + " s2: " + Utility.hex(s2));
            }
        }
    }

    /**
     * Checks that normalizing {@code s1} in the given mode yields {@code s2};
     * reports an error otherwise.
     */
    private void cross(String s1, String s2, Normalizer.Mode mode) {
        String result = Normalizer.normalize(s1, mode);
        if (!result.equals(s2)) {
            errln("cross test failed s1: " + Utility.hex(s1) + " s2: "
                    + Utility.hex(s2));
        }
    }

    /**
     * Do a normalization using the iterative API in the given direction.
     * @param str the text to normalize
     * @param mode the normalization form
     * @param buf scratch buffer
     * @param dir either +1 or -1
     * @param options Normalizer option bits
     */
    private String iterativeNorm(String str, Normalizer.Mode mode,
                                 StringBuffer buf, int dir, int options) throws Exception {
        normalizer.setText(str);
        return iterate(mode, buf, dir, options);
    }

    /**
     * Do a normalization using the iterative API in the given direction.
     * @param str a Java StringCharacterIterator
     * @param mode the normalization form
     * @param buf scratch buffer
     * @param dir either +1 or -1
     * @param options Normalizer option bits
     */
    private String iterativeNorm(StringCharacterIterator str, Normalizer.Mode mode,
                                 StringBuffer buf, int dir, int options) throws Exception {
        normalizer.setText(str);
        return iterate(mode, buf, dir, options);
    }

    /**
     * Configures the shared normalizer (whose text has already been set) and
     * collects its output: forward from first() when {@code dir > 0},
     * otherwise backward from last(), prepending each code point.
     */
    private String iterate(Normalizer.Mode mode, StringBuffer buf, int dir, int options) {
        normalizer.setMode(mode);
        buf.setLength(0);
        normalizer.setOption(-1, false);      // reset all options
        normalizer.setOption(options, true);  // set desired options

        if (dir > 0) {
            for (int ch = normalizer.first(); ch != Normalizer.DONE; ch = normalizer.next()) {
                buf.append(UTF16.valueOf(ch));
            }
        } else {
            for (int ch = normalizer.last(); ch != Normalizer.DONE; ch = normalizer.previous()) {
                buf.insert(0, UTF16.valueOf(ch));
            }
        }
        return buf.toString();
    }

    /**
     * @param op name of normalization form, e.g., "KC"
     * @param s string being normalized
     * @param got value received
     * @param exp expected value
     * @param msg description of this test
     * @return true if got == exp
     */
    private boolean assertEqual(String op, String s, String got,
                                String exp, String msg) {
        if (exp.equals(got)) {
            return true;
        }
        errln(("      " + msg + ") " + op + "(" + s + ")=" + hex(got) +
               ", exp. " + hex(exp)));
        return false;
    }

    /**
     * Split a string into pieces based on the given delimiter
     * character.  Then, parse the resultant fields from hex into
     * characters.  That is, "0040 0400;0C00;0899" -> new String[] {
     * "\u0040\u0400", "\u0C00", "\u0899" }.  The output is assumed to
     * be of the proper length already, and exactly output.length
     * fields are parsed.  If there are too few an exception is
     * thrown.  If there are too many the extras are ignored.
     *
     * @param s the line to split
     * @param delimiter the field separator; every parsed field must be followed by one
     * @param output receives the parsed fields
     * @param buf scratch buffer
     * @throws IllegalArgumentException if a field is missing or empty
     */
    private static void hexsplit(String s, char delimiter,
                                 String[] output, StringBuffer buf) {
        int pos = 0;
        for (int i = 0; i < output.length; ++i) {
            int delim = s.indexOf(delimiter, pos);
            if (delim < 0) {
                throw new IllegalArgumentException("Missing field in " + s);
            }
            // Our field is from pos..delim-1.
            buf.setLength(0);
            String toHex = s.substring(pos, delim);

            int index = 0;
            int len = toHex.length();
            while (index < len) {
                if (toHex.charAt(index) == ' ') {
                    index++;
                    continue;
                }
                // A token runs up to the next space or to the end of the field.
                int spacePos = toHex.indexOf(' ', index);
                if (spacePos == -1) {
                    spacePos = len;
                }
                appendInt(buf, toHex.substring(index, spacePos), s);
                index = spacePos + 1;
            }

            if (buf.length() < 1) {
                throw new IllegalArgumentException("Empty field " + i + " in " + s);
            }
            output[i] = buf.toString();
            pos = delim + 1; // Skip over delim
        }
    }

    /**
     * Parses {@code strToHex} as a hexadecimal code point and appends it to
     * {@code buf} as UTF-16 (a surrogate pair above U+FFFF).
     *
     * @param buf destination buffer
     * @param strToHex hexadecimal digits of one code point
     * @param s the full source line, used only in the error message
     * @throws IllegalArgumentException if the value is outside 0..0x10FFFF
     *         (NumberFormatException, a subclass, if it is not valid hex)
     */
    public static void appendInt(StringBuffer buf, String strToHex, String s) {
        int hex = Integer.parseInt(strToHex, 16);
        if (hex < 0 || hex > 0x10FFFF) {
            throw new IllegalArgumentException("Out of range hex " +
                    hex + " in " + s);
        }
        buf.appendCodePoint(hex);
    }

    // Specific tests for debugging.  These are generally failures
    // taken from the conformance file, but culled out to make
    // debugging easier.  These can be eliminated without affecting
    // coverage.
    @Ignore
    @Test
    public void _hideTestCase6(/*int options*/) throws Exception {
        _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;", /*options*/ 0);
    }

    /** Parses one five-column line and runs the conformance checks on it. */
    private void _testOneLine(String line, int options) throws Exception {
        String[] fields = new String[5];
        StringBuffer buf = new StringBuffer();
        // Parse out the fields
        hexsplit(line, ';', fields, buf);
        checkConformance(fields, line, options);
    }

}