1/* 2 ******************************************************************************* 3 * Copyright (C) 2009-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7package com.ibm.icu.dev.test.text; 8 9import java.io.BufferedReader; 10import java.io.IOException; 11import java.io.Reader; 12import java.io.StringReader; 13import java.text.ParseException; 14import java.util.Arrays; 15import java.util.BitSet; 16import java.util.Comparator; 17import java.util.HashSet; 18import java.util.LinkedHashSet; 19import java.util.Locale; 20import java.util.Random; 21import java.util.Set; 22import java.util.regex.Matcher; 23import java.util.regex.Pattern; 24 25import com.ibm.icu.dev.test.TestFmwk; 26import com.ibm.icu.dev.test.TestUtil; 27import com.ibm.icu.dev.test.TestUtil.JavaVendor; 28import com.ibm.icu.impl.Utility; 29import com.ibm.icu.lang.UScript; 30import com.ibm.icu.text.IdentifierInfo; 31import com.ibm.icu.text.Normalizer2; 32import com.ibm.icu.text.SpoofChecker; 33import com.ibm.icu.text.SpoofChecker.CheckResult; 34import com.ibm.icu.text.SpoofChecker.RestrictionLevel; 35import com.ibm.icu.text.UnicodeSet; 36import com.ibm.icu.util.ULocale; 37 38public class SpoofCheckerTest extends TestFmwk { 39 40 public static void main(String[] args) throws Exception { 41 new SpoofCheckerTest().run(args); 42 } 43 44 /* 45 * Identifiers for verifying that spoof checking is minimally alive and working. 46 */ 47 char[] goodLatinChars = { (char) 0x75, (char) 0x7a }; 48 String goodLatin = new String(goodLatinChars); /* "uz", all ASCII */ 49 /* (not confusable) */ 50 char[] scMixedChars = { (char) 0x73, (char) 0x0441 }; 51 String scMixed = new String(scMixedChars); /* "sc", with Cyrillic 'c' */ 52 /* (mixed script, confusable */ 53 54 String scLatin = "sc"; /* "sc", plain ascii. */ 55 String goodCyrl = "\u0438\u043B"; // "Cyrillic small letter i and el" Plain lower case Cyrillic letters, no latin confusables 56 String goodGreek = "\u03c0\u03c6"; // "Greek small letter pi and phi" Plain lower case Greek letters 57 58 // Various 1 l I look-alikes 59 String lll_Latin_a = "lI1"; // small letter l, cap I, digit 1, all ASCII 60 // "\uFF29\u217C\u0196" Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA 61 String lll_Latin_b = "\uff29\u217c\u0196"; 62 String lll_Cyrl = "\u0406\u04C0\u0031"; // "\u0406\u04C01" 63 /* The skeleton transform for all of the 'lll' lookalikes is ascii lower case letter l. */ 64 String lll_Skel = "lll"; 65 66 String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han 67 68 69 /* 70 * Test basic constructor. 71 */ 72 public void TestUSpoof() { 73 SpoofChecker sc = new SpoofChecker.Builder().build(); 74 if (sc == null) { 75 errln("FAIL: null SpoofChecker"); 76 } 77 } 78 79 /* 80 * Test build from source rules. 81 */ 82 public void TestOpenFromSourceRules() { 83 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) { 84 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents 85 logln("Skip this test case because of the IBM Java 5 bug"); 86 return; 87 } 88 String fileName; 89 Reader confusables; 90 Reader confusablesWholeScript; 91 92 try { 93 SpoofChecker rsc = null; 94 95 fileName = "unicode/confusables.txt"; 96 confusables = TestUtil.getDataReader(fileName, "UTF-8"); 97 try { 98 fileName = "unicode/confusablesWholeScript.txt"; 99 confusablesWholeScript = TestUtil.getDataReader(fileName, "UTF-8"); 100 try { 101 rsc = new SpoofChecker.Builder().setData(confusables, confusablesWholeScript).build(); 102 } finally { 103 confusablesWholeScript.close(); 104 } 105 } finally { 106 confusables.close(); 107 } 108 109 if (rsc == null) { 110 errln("FAIL: null SpoofChecker"); 111 return; 112 } 113 // Check that newly built-from-rules SpoofChecker is able to function. 114 checkSkeleton(rsc, "TestOpenFromSourceRules"); 115 116 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 117 rsc.failsChecks("Hello", result); 118 119 // The checker we just built from source rules should be equivalent to the 120 // default checker created from prebuilt rules baked into the ICU data. 121 SpoofChecker defaultChecker = new SpoofChecker.Builder().build(); 122 assertTrue("Checker built from rules equals default", defaultChecker.equals(rsc)); 123 124 SpoofChecker optionChecker = new SpoofChecker.Builder(). 125 setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE).build(); 126 assertFalse("", optionChecker.equals(rsc)); 127 128 // Stub source data to build into a test SpoofChecker 129 String stubWSConfusables = 130 "# Stub Whole Script Confusable data\n" + 131 "0561 ; Armn; Cyrl; L # (ա) ARMENIAN SMALL LETTER AYB\n"; 132 133 String stubConfusables = 134 "# Stub confusables data\n" + 135 "05AD ; 0596 ; SL # ( ֭ → ֖ ) HEBREW ACCENT DEHI → HEBREW ACCENT TIPEHA #\n"; 136 137 // Verify that re-using a builder doesn't alter SpoofCheckers that were 138 // previously created by that builder. (The builder could modify data 139 // being used by the existing checker) 140 141 SpoofChecker.Builder builder = new SpoofChecker.Builder(); 142 SpoofChecker testChecker1 = builder.build(); 143 assertTrue("", testChecker1.equals(defaultChecker)); 144 145 builder.setData(new StringReader(stubConfusables), new StringReader(stubWSConfusables)); 146 builder.setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE); 147 builder.setChecks(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE); 148 Set<ULocale>allowedLocales = new HashSet<ULocale>(); 149 allowedLocales.add(ULocale.JAPANESE); 150 allowedLocales.add(ULocale.FRENCH); 151 builder.setAllowedLocales(allowedLocales); 152 SpoofChecker testChecker2 = builder.build(); 153 SpoofChecker testChecker3 = builder.build(); 154 155 assertTrue("", testChecker1.equals(defaultChecker)); 156 assertFalse("", testChecker2.equals(defaultChecker)); 157 assertTrue("", testChecker2.equals(testChecker3)); 158 159 } catch (java.io.IOException e) { 160 errln(e.toString()); 161 } catch (ParseException e) { 162 errln(e.toString()); 163 } 164 } 165 166 /* 167 * Set & Get Check Flags 168 */ 169 public void TestGetSetChecks1() { 170 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build(); 171 int t; 172 t = sc.getChecks(); 173 assertEquals("", SpoofChecker.ALL_CHECKS, t); 174 175 sc = new SpoofChecker.Builder().setChecks(0).build(); 176 t = sc.getChecks(); 177 assertEquals("", 0, t); 178 179 int checks = SpoofChecker.WHOLE_SCRIPT_CONFUSABLE | SpoofChecker.MIXED_SCRIPT_CONFUSABLE 180 | SpoofChecker.ANY_CASE; 181 sc = new SpoofChecker.Builder().setChecks(checks).build(); 182 t = sc.getChecks(); 183 assertEquals("", checks, t); 184 } 185 186 /* 187 * get & setAllowedChars 188 */ 189 public void TestGetSetAllowedChars() { 190 SpoofChecker sc = new SpoofChecker.Builder().build(); 191 UnicodeSet us; 192 UnicodeSet uset; 193 194 uset = sc.getAllowedChars(); 195 assertTrue("", uset.isFrozen()); 196 us = new UnicodeSet((int) 0x41, (int) 0x5A); /* [A-Z] */ 197 sc = new SpoofChecker.Builder().setAllowedChars(us).build(); 198 assertEquals("", us, sc.getAllowedChars()); 199 } 200 201 /* 202 * get & set Checks 203 */ 204 public void TestGetSetChecks() { 205 SpoofChecker sc = new SpoofChecker.Builder().build(); 206 int checks; 207 int checks2; 208 boolean checkResults; 209 210 checks = sc.getChecks(); 211 assertEquals("", SpoofChecker.ALL_CHECKS, checks); 212 213 checks &= ~(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.MIXED_SCRIPT_CONFUSABLE); 214 sc = new SpoofChecker.Builder().setChecks(checks).build(); 215 checks2 = sc.getChecks(); 216 assertEquals("", checks, checks2); 217 218 /* 219 * The checks that were disabled just above are the same ones that the "scMixed" test fails. So with those tests 220 * gone checking that Identifier should now succeed 221 */ 222 checkResults = sc.failsChecks(scMixed); 223 assertFalse("", checkResults); 224 } 225 226 /* 227 * AllowedLocales 228 */ 229 public void TestAllowedLocales() { 230 SpoofChecker sc = new SpoofChecker.Builder().build(); 231 Set<ULocale> allowedLocales = null; 232 Set<Locale> allowedJavaLocales = null; 233 boolean checkResults; 234 235 /* Default allowed locales list should be empty */ 236 allowedLocales = sc.getAllowedLocales(); 237 assertTrue("Empty allowed locales", allowedLocales.isEmpty()); 238 239 allowedJavaLocales = sc.getAllowedJavaLocales(); 240 assertTrue("Empty allowed Java locales", allowedJavaLocales.isEmpty()); 241 242 /* Allow en and ru, which should enable Latin and Cyrillic only to pass */ 243 ULocale enloc = new ULocale("en"); 244 ULocale ruloc = new ULocale("ru_RU"); 245 allowedLocales = new HashSet<ULocale>(); 246 allowedLocales.add(enloc); 247 allowedLocales.add(ruloc); 248 sc = new SpoofChecker.Builder().setAllowedLocales(allowedLocales).build(); 249 allowedLocales = sc.getAllowedLocales(); 250 assertTrue("en in allowed locales", allowedLocales.contains(enloc)); 251 assertTrue("ru_RU in allowed locales", allowedLocales.contains(ruloc)); 252 253 Locale frlocJ = new Locale("fr"); 254 allowedJavaLocales = new HashSet<Locale>(); 255 allowedJavaLocales.add(frlocJ); 256 sc = new SpoofChecker.Builder().setAllowedJavaLocales(allowedJavaLocales).build(); 257 assertFalse("no en in allowed Java locales", allowedJavaLocales.contains(new Locale("en"))); 258 assertTrue("fr in allowed Java locales", allowedJavaLocales.contains(frlocJ)); 259 260 /* 261 * Limit checks to SpoofChecker.CHAR_LIMIT. Some of the test data has whole script confusables also, which we 262 * don't want to see in this test. 263 */ 264 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); 265 266 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 267 checkResults = sc.failsChecks(goodLatin); 268 assertFalse("", checkResults); 269 270 checkResults = sc.failsChecks(goodGreek, result); 271 assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks); 272 273 checkResults = sc.failsChecks(goodCyrl); 274 assertFalse("", checkResults); 275 276 /* Reset with an empty locale list, which should allow all characters to pass */ 277 allowedLocales = new LinkedHashSet<ULocale>(); 278 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); 279 280 checkResults = sc.failsChecks(goodGreek); 281 assertFalse("", checkResults); 282 } 283 284 /* 285 * AllowedChars set/get the UnicodeSet of allowed characters. 286 */ 287 public void TestAllowedChars() { 288 SpoofChecker sc = new SpoofChecker.Builder().build(); 289 UnicodeSet set; 290 UnicodeSet tmpSet; 291 boolean checkResults; 292 293 /* By default, we should see no restriction; the UnicodeSet should allow all characters. */ 294 set = sc.getAllowedChars(); 295 tmpSet = new UnicodeSet(0, 0x10ffff); 296 assertEquals("", tmpSet, set); 297 298 /* Setting the allowed chars should enable the check. */ 299 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CHAR_LIMIT).build(); 300 301 /* Remove a character that is in our good Latin test identifier from the allowed chars set. */ 302 tmpSet.remove(goodLatin.charAt(1)); 303 sc = new SpoofChecker.Builder().setAllowedChars(tmpSet).build(); 304 305 /* Latin Identifier should now fail; other non-latin test cases should still be OK */ 306 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 307 checkResults = sc.failsChecks(goodLatin, result); 308 assertTrue("", checkResults); 309 assertEquals("", SpoofChecker.CHAR_LIMIT | SpoofChecker.RESTRICTION_LEVEL, result.checks); 310 311 checkResults = sc.failsChecks(goodGreek, result); 312 assertTrue("", checkResults); 313 assertEquals("", SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, result.checks); 314 } 315 316 public void TestCheck() { 317 SpoofChecker sc = new SpoofChecker.Builder().build(); 318 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 319 boolean checkResults; 320 321 result.position = 666; 322 checkResults = sc.failsChecks(goodLatin, result); 323 assertFalse("", checkResults); 324 assertEquals("", 0, result.position); 325 326 checkResults = sc.failsChecks(goodCyrl, result); 327 assertFalse("", checkResults); 328 329 result.position = 666; 330 checkResults = sc.failsChecks(scMixed, result); 331 assertTrue("", checkResults); 332 assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks); 333 assertEquals("", 0, result.position); 334 335 result.position = 666; 336 checkResults = sc.failsChecks(han_Hiragana, result); 337 assertFalse("", checkResults); 338 assertEquals("", 0, result.position); 339 assertEquals("", 0, result.checks); 340 } 341 342 public void TestAreConfusable1() { 343 SpoofChecker sc = new SpoofChecker.Builder().build(); 344 int checkResults; 345 checkResults = sc.areConfusable(scLatin, scMixed); 346 assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE, checkResults); 347 348 checkResults = sc.areConfusable(goodGreek, scLatin); 349 assertEquals("", 0, checkResults); 350 351 checkResults = sc.areConfusable(lll_Latin_a, lll_Latin_b); 352 assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults); 353 } 354 355 public void TestGetSkeleton() { 356 SpoofChecker sc = new SpoofChecker.Builder().build(); 357 String dest; 358 dest = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a); 359 assertEquals("", lll_Skel, dest); 360 } 361 362 /** 363 * IntlTestSpoof is the top level test class for the Unicode Spoof detection tests 364 */ 365 366 // Test the USpoofDetector API functions that require C++ 367 // The pure C part of the API, which is most of it, is tested in cintltst 368 /** 369 * IntlTestSpoof tests for USpoofDetector 370 */ 371 public void TestSpoofAPI() { 372 SpoofChecker sc = new SpoofChecker.Builder().build(); 373 String s = "xyz"; // Many latin ranges are whole-script confusable with other scripts. 374 // If this test starts failing, consult confusablesWholeScript.txt 375 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 376 result.position = 666; 377 boolean checkResults = sc.failsChecks(s, result); 378 assertFalse("", checkResults); 379 assertEquals("", 0, result.position); 380 381 sc = new SpoofChecker.Builder().build(); 382 String s1 = "cxs"; 383 String s2 = Utility.unescape("\\u0441\\u0445\\u0455"); // Cyrillic "cxs" 384 int checkResult = sc.areConfusable(s1, s2); 385 assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, checkResult); 386 387 sc = new SpoofChecker.Builder().build(); 388 s = "I1l0O"; 389 String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s); 390 assertEquals("", dest, "lllOO"); 391 } 392 393 public void TestSkeleton() { 394 SpoofChecker sc = new SpoofChecker.Builder().build(); 395 checkSkeleton(sc, "TestSkeleton"); 396 } 397 398 // testSkeleton. Spot check a number of confusable skeleton substitutions from the 399 // Unicode data file confusables.txt 400 // Test cases chosen for substitutions of various lengths, and 401 // membership in different mapping tables. 402 public void checkSkeleton(SpoofChecker sc, String testName) { 403 int ML = 0; 404 int SL = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE; 405 int MA = SpoofChecker.ANY_CASE; 406 int SA = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE; 407 408 checkSkeleton(sc, MA, "\\u02b9identifier'", "'identifier'", testName); 409 410 checkSkeleton(sc, SL, "nochange", "nochange", testName); 411 checkSkeleton(sc, SA, "nochange", "nochange", testName); 412 checkSkeleton(sc, ML, "nochange", "nochange", testName); 413 checkSkeleton(sc, MA, "nochange", "nochange", testName); 414 checkSkeleton(sc, MA, "love", "love", testName); 415 checkSkeleton(sc, MA, "1ove", "love", testName); // Digit 1 to letter l 416 checkSkeleton(sc, ML, "OOPS", "OOPS", testName); 417 checkSkeleton(sc, ML, "00PS", "OOPS", testName); 418 checkSkeleton(sc, MA, "OOPS", "OOPS", testName); 419 checkSkeleton(sc, MA, "00PS", "OOPS", testName); // Digit 0 to letter O 420 checkSkeleton(sc, SL, "\\u059c", "\\u0301", testName); 421 checkSkeleton(sc, SL, "\\u2A74", "\\u003A\\u003A\\u003D", testName); 422 checkSkeleton(sc, SL, "\\u247E", "(ll)", testName); 423 checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f", testName); 424 425 // 0C83 mapping existed in the ML and MA tables, did not exist in SL, SA (Original Unicode 7) 426 // mapping exists in all tables (ICU 55). 427 // 0C83 ; 0983 ; ML # KANNADA SIGN VISARGA to 428 checkSkeleton(sc, SL, "\\u0C83", "\\u0983", testName); 429 checkSkeleton(sc, SA, "\\u0C83", "\\u0983", testName); 430 checkSkeleton(sc, ML, "\\u0C83", "\\u0983", testName); 431 checkSkeleton(sc, MA, "\\u0C83", "\\u0983", testName); 432 433 // 0391 mappings existed only in MA and SA tables (Original Unicode 7). 434 // mappings exist in all tables (ICU 55) 435 checkSkeleton(sc, MA, "\\u0391", "A", testName); 436 checkSkeleton(sc, SA, "\\u0391", "A", testName); 437 checkSkeleton(sc, ML, "\\u0391", "A", testName); 438 checkSkeleton(sc, SL, "\\u0391", "A", testName); 439 440 // 13CF Mappings in all four tables, different in MA (Original Unicode 7). 441 // Mapping same in all tables (ICU 55) 442 checkSkeleton(sc, ML, "\\u13CF", "b", testName); 443 checkSkeleton(sc, MA, "\\u13CF", "b", testName); 444 checkSkeleton(sc, SL, "\\u13CF", "b", testName); 445 checkSkeleton(sc, SA, "\\u13CF", "b", testName); 446 447 // 0022 ; 0027 0027 ; 448 // all tables 449 checkSkeleton(sc, SL, "\"", "\\u0027\\u0027", testName); 450 checkSkeleton(sc, SA, "\"", "\\u0027\\u0027", testName); 451 checkSkeleton(sc, ML, "\"", "\\u0027\\u0027", testName); 452 checkSkeleton(sc, MA, "\"", "\\u0027\\u0027", testName); 453 454 } 455 456 // Internal function to run a single skeleton test case. 457 // 458 // Run a single confusable skeleton transformation test case. 459 // 460 void checkSkeleton(SpoofChecker sc, int type, String input, String expected, String testName) { 461 String uInput = Utility.unescape(input); 462 String uExpected = Utility.unescape(expected); 463 String actual; 464 actual = sc.getSkeleton(type, uInput); 465 Throwable t = new Throwable(); 466 int lineNumberOfTest = t.getStackTrace()[1].getLineNumber(); 467 468 assertEquals(testName + " test at line " + lineNumberOfTest + " : Expected (escaped): " + expected, uExpected, actual); 469 } 470 471 public void TestAreConfusable() { 472 SpoofChecker sc = new SpoofChecker.Builder().build(); 473 String s1 = "A long string that will overflow stack buffers. A long string that will overflow stack buffers. " 474 + "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "; 475 String s2 = "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " 476 + "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "; 477 assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2)); 478 } 479 480 public void TestInvisible() { 481 SpoofChecker sc = new SpoofChecker.Builder().build(); 482 String s = Utility.unescape("abcd\\u0301ef"); 483 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 484 result.position = -42; 485 assertFalse("", sc.failsChecks(s, result)); 486 assertEquals("", 0, result.checks); 487 assertEquals("", result.position, 0); 488 489 String s2 = Utility.unescape("abcd\\u0301\\u0302\\u0301ef"); 490 assertTrue("", sc.failsChecks(s2, result)); 491 assertEquals("", SpoofChecker.INVISIBLE, result.checks); 492 assertEquals("", 0, result.position); 493 494 // Two acute accents, one from the composed a with acute accent, \u00e1, 495 // and one separate. 496 result.position = -42; 497 String s3 = Utility.unescape("abcd\\u00e1\\u0301xyz"); 498 assertTrue("", sc.failsChecks(s3, result)); 499 assertEquals("", SpoofChecker.INVISIBLE, result.checks); 500 assertEquals("", 0, result.position); 501 } 502 503 public void TestRestrictionLevel() { 504 Object[][] tests = { 505 {"aγ♥", RestrictionLevel.UNRESTRICTIVE}, 506 {"a", RestrictionLevel.ASCII}, 507 {"γ", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE}, 508 {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE}, 509 {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE}, 510 {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE}, 511 }; 512 IdentifierInfo idInfo = new IdentifierInfo().setIdentifierProfile(SpoofChecker.RECOMMENDED); 513 CheckResult checkResult = new CheckResult(); 514 for (Object[] test : tests) { 515 String testString = (String) test[0]; 516 RestrictionLevel expectedLevel = (RestrictionLevel) test[1]; 517 idInfo.setIdentifier(testString); 518 assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel()); 519 for (RestrictionLevel levelSetInSpoofChecker : RestrictionLevel.values()) { 520 SpoofChecker sc = new SpoofChecker.Builder() 521 .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this 522 .setAllowedChars(SpoofChecker.RECOMMENDED) 523 .setRestrictionLevel(levelSetInSpoofChecker) 524 .build(); 525 boolean actualValue = sc.failsChecks(testString, checkResult); 526 527 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII 528 boolean expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString); 529 boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + levelSetInSpoofChecker, expectedFailure, actualValue); 530 if (!t) { // debugging 531 actualValue = sc.failsChecks(testString, checkResult); 532 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII 533 expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString); 534 } 535 } 536 } 537 } 538 539 public void TestMixedNumbers() { 540 Object[][] tests = { 541 {"1", "[0]"}, 542 {"१", "[०]"}, 543 {"1१", "[0०]"}, 544 {"١۱", "[٠۰]"}, 545 }; 546 IdentifierInfo idInfo = new IdentifierInfo(); 547 CheckResult checkResult = new CheckResult(); 548 for (Object[] test : tests) { 549 String testString = (String) test[0]; 550 UnicodeSet expected = new UnicodeSet((String)test[1]); 551 idInfo.setIdentifier(testString); 552 assertEquals("", expected, idInfo.getNumerics()); 553 554 SpoofChecker sc = new SpoofChecker.Builder() 555 .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this 556 .build(); 557 boolean actualValue = sc.failsChecks(testString, checkResult); 558 assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue); 559 } 560 } 561 562 public void TestIdentifierInfo() { 563// contains(BitSet, BitSet) 564 BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL); 565 BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL); 566 assertTrue("", IdentifierInfo.contains(bitset12, bitset2)); 567 assertTrue("", IdentifierInfo.contains(bitset12, bitset12)); 568 assertTrue("", !IdentifierInfo.contains(bitset2, bitset12)); 569 570 assertTrue("", IdentifierInfo.BITSET_COMPARATOR.compare( 571 IdentifierInfo.set(new BitSet(), UScript.ARABIC), 572 IdentifierInfo.set(new BitSet(), UScript.LATIN)) < 0); 573// displayAlternates(Collection<BitSet>) 574// displayScripts(BitSet) 575 String scriptString = IdentifierInfo.displayScripts(bitset12); 576 assertEquals("", "Hang Latn", scriptString); 577 Set<BitSet> alternates = new HashSet(Arrays.asList(bitset12, bitset2)); 578 String alternatesString = IdentifierInfo.displayAlternates(alternates); 579 assertEquals("", "Hang; Hang Latn", alternatesString); 580 581// parseAlternates(String) 582// parseScripts(String) 583 assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString)); 584 assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString)); 585 586 String[][] tests = { 587 // String, restriction-level, numerics, scripts, alternates, common-alternates 588 {"a♥", "UNRESTRICTIVE", "[]", "Latn", "", ""}, 589 {"a〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"}, 590 {"aー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"}, 591 {"aー〆ア", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, 592 {"アaー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, 593 {"a1١", "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"}, 594 {"a1١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""}, 595 {"١ー〆aア1१۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, 596 {"aアー〆1१١۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, 597 }; 598 for (String[] test : tests) { 599 String testString = test[0]; 600 IdentifierInfo idInfo = new IdentifierInfo(); 601 idInfo.setIdentifierProfile(SpoofChecker.RECOMMENDED); 602 idInfo.setIdentifier(testString); 603 assertEquals("Identifier " + testString, testString, idInfo.getIdentifier()); 604 605 RestrictionLevel restrictionLevel = RestrictionLevel.valueOf(test[1]); 606 assertEquals("RestrictionLevel " + testString, restrictionLevel, idInfo.getRestrictionLevel()); 607 608 UnicodeSet numerics = new UnicodeSet(test[2]); 609 assertEquals("Numerics " + testString, numerics, idInfo.getNumerics()); 610 611 BitSet scripts = IdentifierInfo.parseScripts(test[3]); 612 assertEquals("Scripts " + testString, scripts, idInfo.getScripts()); 613 614 Set<BitSet> alternates2 = IdentifierInfo.parseAlternates(test[4]); 615 assertEquals("Alternates " + testString, alternates2, idInfo.getAlternates()); 616 617 BitSet commonAlternates = IdentifierInfo.parseScripts(test[5]); 618 assertEquals("Common Alternates " + testString, commonAlternates, idInfo.getCommonAmongAlternates()); 619 } 620 621// TODO 622// getIdentifierProfile() 623// setIdentifierProfile(UnicodeSet) 624 } 625 626 public void TestComparator() { 627 Random random = new Random(0); 628 for (int i = 0; i < 100; ++i) { 629 BitSet[] items = new BitSet[random.nextInt(5)+3]; 630 for (int j = 0; j < items.length; ++j) { 631 items[j] = new BitSet(); 632 int countInBitset = random.nextInt(5); 633 for (int k = 0; k < countInBitset; ++k) { 634 items[j].set(random.nextInt(10)); 635 } 636 } 637 checkComparator(IdentifierInfo.BITSET_COMPARATOR, items); 638 } 639 } 640 641 // Dumb implementation for now 642 private <T> void checkComparator(Comparator<T> comparator, T... items) { 643 logln("Checking " + Arrays.asList(items)); 644 /* 645 * The relation is transitive: a < b and b < c implies a < c. We test here. 646 * The relation is trichotomous: exactly one of a < b, b < a and a = b is true. Guaranteed by comparator. 647 */ 648 for (int i = 0; i < items.length-2; ++i) { 649 T a = items[i]; 650 for (int j = i+1; j < items.length-1; ++j) { 651 T b = items[j]; 652 for (int k = j+1; k < items.length; ++k) { 653 T c = items[k]; 654 checkTransitivity(comparator, a, b, c); 655 checkTransitivity(comparator, a, c, b); 656 checkTransitivity(comparator, b, a, b); 657 checkTransitivity(comparator, b, c, a); 658 checkTransitivity(comparator, c, a, b); 659 checkTransitivity(comparator, c, b, a); 660 } 661 } 662 } 663 } 664 665 private <T> void checkTransitivity(Comparator<T> comparator, T a, T b, T c) { 666 int ab = comparator.compare(a,b); 667 int bc = comparator.compare(b,c); 668 int ca = comparator.compare(c,a); 669 if (!assertFalse("Transitive: " + a + ", " + b + ", " + c, 670 ab < 0 && bc < 0 && ca <= 0)) { 671 // for debugging 672 comparator.compare(a,b); 673 comparator.compare(b,c); 674 comparator.compare(c,a); 675 assertFalse("Transitive: " + a + ", " + b + ", " + c, 676 ab < 0 && bc < 0 && ca <= 0); 677 } 678 } 679 680 private String parseHex(String in) { 681 StringBuilder sb = new StringBuilder(); 682 for (String oneCharAsHexString : in.split("\\s+")) { 683 if (oneCharAsHexString.length() > 0) { 684 sb.appendCodePoint(Integer.parseInt(oneCharAsHexString, 16)); 685 } 686 } 687 return sb.toString(); 688 } 689 690 private String escapeString(String in) { 691 StringBuilder out = new StringBuilder(); 692 for (int i = 0; i < in.length(); i++) { 693 int c = in.codePointAt(i); 694 if (c <= 0x7f) { 695 out.append((char) c); 696 } else if (c <= 0xffff) { 697 out.append(String.format("\\u%04x", c)); 698 } else { 699 out.append(String.format("\\U%06x", c)); 700 i++; 701 } 702 } 703 return out.toString(); 704 } 705 706 // Verify that each item from the Unicode confusables.txt file 707 // transforms into the expected skeleton. 708 public void testConfData() { 709 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) { 710 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents 711 logln("Skip this test case because of the IBM Java 5 bug"); 712 return; 713 } 714 try { 715 // Read in the confusables.txt file. (Distributed by Unicode.org) 716 String fileName = "unicode/confusables.txt"; 717 BufferedReader confusablesRdr = TestUtil.getDataReader(fileName, "UTF-8"); 718 719 // Create a default spoof checker to use in this test. 720 SpoofChecker sc = new SpoofChecker.Builder().build(); 721 722 // Parse lines from the confusables.txt file. Example Line: 723 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... 724 // Lines have three fields. The hex fields can contain more than one character, 725 // and each character may be more than 4 digits (for supplemntals) 726 // This regular expression matches lines and splits the fields into capture groups. 727 // Capture group 1: map from chars 728 // 2: map to chars 729 // 3: table type, SL, ML, SA or MA 730 // 4: Comment Lines Only 731 // 5: Error Lines Only 732 Matcher parseLine = Pattern.compile( 733 "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)" 734 + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line 735 matcher(""); 736 Normalizer2 normalizer = Normalizer2.getNFDInstance(); 737 int lineNum = 0; 738 String inputLine; 739 while ((inputLine = confusablesRdr.readLine()) != null) { 740 lineNum++; 741 parseLine.reset(inputLine); 742 if (!parseLine.matches()) { 743 errln("Syntax error in confusable data file at line " + lineNum); 744 errln(inputLine); 745 break; 746 } 747 if (parseLine.group(4) != null) { 748 continue; // comment line 749 } 750 String from = parseHex(parseLine.group(1)); 751 752 if (!normalizer.isNormalized(from)) { 753 // The source character was not NFD. 754 // Skip this case; the first step in obtaining a skeleton is to NFD the input, 755 // so the mapping in this line of confusables.txt will never be applied. 756 continue; 757 } 758 759 String rawExpected = parseHex(parseLine.group(2)); 760 String expected = normalizer.normalize(rawExpected); 761 762 int skeletonType = 0; 763 String tableType = parseLine.group(3); 764 if (tableType.equals("SL")) { 765 skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE; 766 } else if (tableType.indexOf("SA") >= 0) { 767 skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE; 768 } else if (tableType.indexOf("ML") >= 0) { 769 skeletonType = 0; 770 } else if (tableType.indexOf("MA") >= 0) { 771 skeletonType = SpoofChecker.ANY_CASE; 772 } 773 774 String actual; 775 actual = sc.getSkeleton(skeletonType, from); 776 777 if (!actual.equals(expected)) { 778 errln("confusables.txt: " + lineNum + ": " + parseLine.group(0)); 779 errln("Actual: " + escapeString(actual)); 780 } 781 } 782 confusablesRdr.close(); 783 } catch (IOException e) { 784 errln(e.toString()); 785 } 786 } 787} 788