// SpoofCheckerTest.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/* 2 ******************************************************************************* 3 * Copyright (C) 2009-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7package com.ibm.icu.dev.test.text; 8 9import java.io.BufferedReader; 10import java.io.IOException; 11import java.io.Reader; 12import java.io.StringReader; 13import java.text.ParseException; 14import java.util.Arrays; 15import java.util.BitSet; 16import java.util.Comparator; 17import java.util.HashSet; 18import java.util.LinkedHashSet; 19import java.util.Locale; 20import java.util.Random; 21import java.util.Set; 22import java.util.regex.Matcher; 23import java.util.regex.Pattern; 24 25import com.ibm.icu.dev.test.TestFmwk; 26import com.ibm.icu.dev.test.TestUtil; 27import com.ibm.icu.dev.test.TestUtil.JavaVendor; 28import com.ibm.icu.impl.Utility; 29import com.ibm.icu.lang.UScript; 30import com.ibm.icu.text.IdentifierInfo; 31import com.ibm.icu.text.Normalizer2; 32import com.ibm.icu.text.SpoofChecker; 33import com.ibm.icu.text.SpoofChecker.CheckResult; 34import com.ibm.icu.text.SpoofChecker.RestrictionLevel; 35import com.ibm.icu.text.UnicodeSet; 36import com.ibm.icu.util.ULocale; 37 38public class SpoofCheckerTest extends TestFmwk { 39 40 public static void main(String[] args) throws Exception { 41 new SpoofCheckerTest().run(args); 42 } 43 44 /* 45 * Identifiers for verifying that spoof checking is minimally alive and working. 46 */ 47 char[] goodLatinChars = { (char) 0x75, (char) 0x7a }; 48 String goodLatin = new String(goodLatinChars); /* "uz", all ASCII */ 49 /* (not confusable) */ 50 char[] scMixedChars = { (char) 0x73, (char) 0x0441 }; 51 String scMixed = new String(scMixedChars); /* "sc", with Cyrillic 'c' */ 52 /* (mixed script, confusable */ 53 54 String scLatin = "sc"; /* "sc", plain ascii. 
*/ 55 String goodCyrl = "\u0438\u043B"; // "Cyrillic small letter i and el" Plain lower case Cyrillic letters, no latin confusables 56 String goodGreek = "\u03c0\u03c6"; // "Greek small letter pi and phi" Plain lower case Greek letters 57 58 // Various 1 l I look-alikes 59 String lll_Latin_a = "lI1"; // small letter l, cap I, digit 1, all ASCII 60 // "\uFF29\u217C\u0196" Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA 61 String lll_Latin_b = "\uff29\u217c\u0196"; 62 String lll_Cyrl = "\u0406\u04C0\u0031"; // "\u0406\u04C01" 63 /* The skeleton transform for all of the 'lll' lookalikes is ascii lower case letter l. */ 64 String lll_Skel = "lll"; 65 66 String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han 67 68 69 /* 70 * Test basic constructor. 71 */ 72 public void TestUSpoof() { 73 SpoofChecker sc = new SpoofChecker.Builder().build(); 74 if (sc == null) { 75 errln("FAIL: null SpoofChecker"); 76 } 77 } 78 79 /* 80 * Test build from source rules. 81 */ 82 public void TestOpenFromSourceRules() { 83 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) { 84 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents 85 logln("Skip this test case because of the IBM Java 5 bug"); 86 return; 87 } 88 String fileName; 89 Reader confusables; 90 Reader confusablesWholeScript; 91 92 try { 93 fileName = "unicode/confusables.txt"; 94 confusables = TestUtil.getDataReader(fileName, "UTF-8"); 95 fileName = "unicode/confusablesWholeScript.txt"; 96 confusablesWholeScript = TestUtil.getDataReader(fileName, "UTF-8"); 97 98 SpoofChecker rsc = new SpoofChecker.Builder().setData(confusables, confusablesWholeScript) 99 .build(); 100 if (rsc == null) { 101 errln("FAIL: null SpoofChecker"); 102 return; 103 } 104 // Check that newly built-from-rules SpoofChecker is able to function. 
105 checkSkeleton(rsc, "TestOpenFromSourceRules"); 106 107 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 108 rsc.failsChecks("Hello", result); 109 110 // The checker we just built from source rules should be equivalent to the 111 // default checker created from prebuilt rules baked into the ICU data. 112 SpoofChecker defaultChecker = new SpoofChecker.Builder().build(); 113 assertTrue("Checker built from rules equals default", defaultChecker.equals(rsc)); 114 115 SpoofChecker optionChecker = new SpoofChecker.Builder(). 116 setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE).build(); 117 assertFalse("", optionChecker.equals(rsc)); 118 119 // Stub source data to build into a test SpoofChecker 120 String stubWSConfusables = 121 "# Stub Whole Script Confusable data\n" + 122 "0561 ; Armn; Cyrl; L # (ա) ARMENIAN SMALL LETTER AYB\n"; 123 124 String stubConfusables = 125 "# Stub confusables data\n" + 126 "05AD ; 0596 ; SL # ( ֭ → ֖ ) HEBREW ACCENT DEHI → HEBREW ACCENT TIPEHA #\n"; 127 128 // Verify that re-using a builder doesn't alter SpoofCheckers that were 129 // previously created by that builder. 
(The builder could modify data 130 // being used by the existing checker) 131 132 SpoofChecker.Builder builder = new SpoofChecker.Builder(); 133 SpoofChecker testChecker1 = builder.build(); 134 assertTrue("", testChecker1.equals(defaultChecker)); 135 136 builder.setData(new StringReader(stubConfusables), new StringReader(stubWSConfusables)); 137 builder.setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE); 138 builder.setChecks(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE); 139 Set<ULocale>allowedLocales = new HashSet<ULocale>(); 140 allowedLocales.add(ULocale.JAPANESE); 141 allowedLocales.add(ULocale.FRENCH); 142 builder.setAllowedLocales(allowedLocales); 143 SpoofChecker testChecker2 = builder.build(); 144 SpoofChecker testChecker3 = builder.build(); 145 146 assertTrue("", testChecker1.equals(defaultChecker)); 147 assertFalse("", testChecker2.equals(defaultChecker)); 148 assertTrue("", testChecker2.equals(testChecker3)); 149 150 } catch (java.io.IOException e) { 151 errln(e.toString()); 152 } catch (ParseException e) { 153 errln(e.toString()); 154 } 155 } 156 157 /* 158 * Set & Get Check Flags 159 */ 160 public void TestGetSetChecks1() { 161 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build(); 162 int t; 163 t = sc.getChecks(); 164 assertEquals("", SpoofChecker.ALL_CHECKS, t); 165 166 sc = new SpoofChecker.Builder().setChecks(0).build(); 167 t = sc.getChecks(); 168 assertEquals("", 0, t); 169 170 int checks = SpoofChecker.WHOLE_SCRIPT_CONFUSABLE | SpoofChecker.MIXED_SCRIPT_CONFUSABLE 171 | SpoofChecker.ANY_CASE; 172 sc = new SpoofChecker.Builder().setChecks(checks).build(); 173 t = sc.getChecks(); 174 assertEquals("", checks, t); 175 } 176 177 /* 178 * get & setAllowedChars 179 */ 180 public void TestGetSetAllowedChars() { 181 SpoofChecker sc = new SpoofChecker.Builder().build(); 182 UnicodeSet us; 183 UnicodeSet uset; 184 185 uset = sc.getAllowedChars(); 186 assertTrue("", uset.isFrozen()); 187 us = new UnicodeSet((int) 0x41, (int) 
0x5A); /* [A-Z] */ 188 sc = new SpoofChecker.Builder().setAllowedChars(us).build(); 189 assertEquals("", us, sc.getAllowedChars()); 190 } 191 192 /* 193 * get & set Checks 194 */ 195 public void TestGetSetChecks() { 196 SpoofChecker sc = new SpoofChecker.Builder().build(); 197 int checks; 198 int checks2; 199 boolean checkResults; 200 201 checks = sc.getChecks(); 202 assertEquals("", SpoofChecker.ALL_CHECKS, checks); 203 204 checks &= ~(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.MIXED_SCRIPT_CONFUSABLE); 205 sc = new SpoofChecker.Builder().setChecks(checks).build(); 206 checks2 = sc.getChecks(); 207 assertEquals("", checks, checks2); 208 209 /* 210 * The checks that were disabled just above are the same ones that the "scMixed" test fails. So with those tests 211 * gone checking that Identifier should now succeed 212 */ 213 checkResults = sc.failsChecks(scMixed); 214 assertFalse("", checkResults); 215 } 216 217 /* 218 * AllowedLocales 219 */ 220 public void TestAllowedLocales() { 221 SpoofChecker sc = new SpoofChecker.Builder().build(); 222 Set<ULocale> allowedLocales = null; 223 Set<Locale> allowedJavaLocales = null; 224 boolean checkResults; 225 226 /* Default allowed locales list should be empty */ 227 allowedLocales = sc.getAllowedLocales(); 228 assertTrue("Empty allowed locales", allowedLocales.isEmpty()); 229 230 allowedJavaLocales = sc.getAllowedJavaLocales(); 231 assertTrue("Empty allowed Java locales", allowedJavaLocales.isEmpty()); 232 233 /* Allow en and ru, which should enable Latin and Cyrillic only to pass */ 234 ULocale enloc = new ULocale("en"); 235 ULocale ruloc = new ULocale("ru_RU"); 236 allowedLocales = new HashSet<ULocale>(); 237 allowedLocales.add(enloc); 238 allowedLocales.add(ruloc); 239 sc = new SpoofChecker.Builder().setAllowedLocales(allowedLocales).build(); 240 allowedLocales = sc.getAllowedLocales(); 241 assertTrue("en in allowed locales", allowedLocales.contains(enloc)); 242 assertTrue("ru_RU in allowed locales", 
allowedLocales.contains(ruloc)); 243 244 Locale frlocJ = new Locale("fr"); 245 allowedJavaLocales = new HashSet<Locale>(); 246 allowedJavaLocales.add(frlocJ); 247 sc = new SpoofChecker.Builder().setAllowedJavaLocales(allowedJavaLocales).build(); 248 assertFalse("no en in allowed Java locales", allowedJavaLocales.contains(new Locale("en"))); 249 assertTrue("fr in allowed Java locales", allowedJavaLocales.contains(frlocJ)); 250 251 /* 252 * Limit checks to SpoofChecker.CHAR_LIMIT. Some of the test data has whole script confusables also, which we 253 * don't want to see in this test. 254 */ 255 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); 256 257 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 258 checkResults = sc.failsChecks(goodLatin); 259 assertFalse("", checkResults); 260 261 checkResults = sc.failsChecks(goodGreek, result); 262 assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks); 263 264 checkResults = sc.failsChecks(goodCyrl); 265 assertFalse("", checkResults); 266 267 /* Reset with an empty locale list, which should allow all characters to pass */ 268 allowedLocales = new LinkedHashSet<ULocale>(); 269 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); 270 271 checkResults = sc.failsChecks(goodGreek); 272 assertFalse("", checkResults); 273 } 274 275 /* 276 * AllowedChars set/get the UnicodeSet of allowed characters. 277 */ 278 public void TestAllowedChars() { 279 SpoofChecker sc = new SpoofChecker.Builder().build(); 280 UnicodeSet set; 281 UnicodeSet tmpSet; 282 boolean checkResults; 283 284 /* By default, we should see no restriction; the UnicodeSet should allow all characters. */ 285 set = sc.getAllowedChars(); 286 tmpSet = new UnicodeSet(0, 0x10ffff); 287 assertEquals("", tmpSet, set); 288 289 /* Setting the allowed chars should enable the check. 
*/ 290 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CHAR_LIMIT).build(); 291 292 /* Remove a character that is in our good Latin test identifier from the allowed chars set. */ 293 tmpSet.remove(goodLatin.charAt(1)); 294 sc = new SpoofChecker.Builder().setAllowedChars(tmpSet).build(); 295 296 /* Latin Identifier should now fail; other non-latin test cases should still be OK */ 297 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 298 checkResults = sc.failsChecks(goodLatin, result); 299 assertTrue("", checkResults); 300 assertEquals("", SpoofChecker.CHAR_LIMIT | SpoofChecker.RESTRICTION_LEVEL, result.checks); 301 302 checkResults = sc.failsChecks(goodGreek, result); 303 assertTrue("", checkResults); 304 assertEquals("", SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, result.checks); 305 } 306 307 public void TestCheck() { 308 SpoofChecker sc = new SpoofChecker.Builder().build(); 309 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 310 boolean checkResults; 311 312 result.position = 666; 313 checkResults = sc.failsChecks(goodLatin, result); 314 assertFalse("", checkResults); 315 assertEquals("", 0, result.position); 316 317 checkResults = sc.failsChecks(goodCyrl, result); 318 assertFalse("", checkResults); 319 320 result.position = 666; 321 checkResults = sc.failsChecks(scMixed, result); 322 assertTrue("", checkResults); 323 assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks); 324 assertEquals("", 0, result.position); 325 326 result.position = 666; 327 checkResults = sc.failsChecks(han_Hiragana, result); 328 assertFalse("", checkResults); 329 assertEquals("", 0, result.position); 330 assertEquals("", 0, result.checks); 331 } 332 333 public void TestAreConfusable1() { 334 SpoofChecker sc = new SpoofChecker.Builder().build(); 335 int checkResults; 336 checkResults = sc.areConfusable(scLatin, scMixed); 337 assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE, 
checkResults); 338 339 checkResults = sc.areConfusable(goodGreek, scLatin); 340 assertEquals("", 0, checkResults); 341 342 checkResults = sc.areConfusable(lll_Latin_a, lll_Latin_b); 343 assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults); 344 } 345 346 public void TestGetSkeleton() { 347 SpoofChecker sc = new SpoofChecker.Builder().build(); 348 String dest; 349 dest = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a); 350 assertEquals("", lll_Skel, dest); 351 } 352 353 /** 354 * IntlTestSpoof is the top level test class for the Unicode Spoof detection tests 355 */ 356 357 // Test the USpoofDetector API functions that require C++ 358 // The pure C part of the API, which is most of it, is tested in cintltst 359 /** 360 * IntlTestSpoof tests for USpoofDetector 361 */ 362 public void TestSpoofAPI() { 363 SpoofChecker sc = new SpoofChecker.Builder().build(); 364 String s = "xyz"; // Many latin ranges are whole-script confusable with other scripts. 365 // If this test starts failing, consult confusablesWholeScript.txt 366 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 367 result.position = 666; 368 boolean checkResults = sc.failsChecks(s, result); 369 assertFalse("", checkResults); 370 assertEquals("", 0, result.position); 371 372 sc = new SpoofChecker.Builder().build(); 373 String s1 = "cxs"; 374 String s2 = Utility.unescape("\\u0441\\u0445\\u0455"); // Cyrillic "cxs" 375 int checkResult = sc.areConfusable(s1, s2); 376 assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, checkResult); 377 378 sc = new SpoofChecker.Builder().build(); 379 s = "I1l0O"; 380 String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s); 381 assertEquals("", dest, "lllOO"); 382 } 383 384 public void TestSkeleton() { 385 SpoofChecker sc = new SpoofChecker.Builder().build(); 386 checkSkeleton(sc, "TestSkeleton"); 387 } 388 389 // testSkeleton. 
Spot check a number of confusable skeleton substitutions from the 390 // Unicode data file confusables.txt 391 // Test cases chosen for substitutions of various lengths, and 392 // membership in different mapping tables. 393 public void checkSkeleton(SpoofChecker sc, String testName) { 394 int ML = 0; 395 int SL = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE; 396 int MA = SpoofChecker.ANY_CASE; 397 int SA = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE; 398 399 checkSkeleton(sc, SL, "\\u02b9identifier'", "'identifier'", testName); 400 401 checkSkeleton(sc, SL, "nochange", "nochange", testName); 402 checkSkeleton(sc, MA, "love", "love", testName); 403 checkSkeleton(sc, MA, "1ove", "love", testName); // Digit 1 to letter l 404 checkSkeleton(sc, ML, "OOPS", "OOPS", testName); 405 checkSkeleton(sc, ML, "00PS", "00PS", testName); // Digit 0 unchanged in lower case mode. 406 checkSkeleton(sc, MA, "OOPS", "OOPS", testName); 407 checkSkeleton(sc, MA, "00PS", "OOPS", testName); // Digit 0 to letter O in any case mode only 408 checkSkeleton(sc, SL, "\\u059c", "\\u0301", testName); 409 checkSkeleton(sc, SL, "\\u2A74", "\\u003A\\u003A\\u003D", testName); 410 checkSkeleton(sc, SL, "\\u247E", "\\u0028\\u006c\\u006c\\u0029", testName); // "(ll)" 411 checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647", testName); 412 413 // This mapping exists in the ML and MA tables, does not exist in SL, SA 414 // 0C83 ; 0983 ; ML # KANNADA SIGN VISARGA to 415 checkSkeleton(sc, SL, "\\u0C83", "\\u0C83", testName); 416 checkSkeleton(sc, SA, "\\u0C83", "\\u0C83", testName); 417 checkSkeleton(sc, ML, "\\u0C83", "\\u0983", testName); 418 checkSkeleton(sc, MA, "\\u0C83", "\\u0983", testName); 419 420 // 0391 ; 0041 ; MA # GREEK CAPITAL LETTER ALPHA to LATIN CAPITAL LETTER A 421 // This mapping exists only in the MA table. 
422 checkSkeleton(sc, MA, "\\u0391", "A", testName); 423 checkSkeleton(sc, SA, "\\u0391", "\\u0391", testName); 424 checkSkeleton(sc, ML, "\\u0391", "\\u0391", testName); 425 checkSkeleton(sc, SL, "\\u0391", "\\u0391", testName); 426 427 // 13CF ; 0062 ; MA # CHEROKEE LETTER SI to LATIN SMALL LETTER B 428 // This mapping exists in the ML and MA tables 429 checkSkeleton(sc, ML, "\\u13CF", "b", testName); 430 checkSkeleton(sc, MA, "\\u13CF", "b", testName); 431 checkSkeleton(sc, SL, "\\u13CF", "\\u13CF", testName); 432 checkSkeleton(sc, SA, "\\u13CF", "\\u13CF", testName); 433 434 // 0022 ; 0027 0027 ; 435 // all tables 436 checkSkeleton(sc, SL, "\"", "\\u0027\\u0027", testName); 437 checkSkeleton(sc, SA, "\"", "\\u0027\\u0027", testName); 438 checkSkeleton(sc, ML, "\"", "\\u0027\\u0027", testName); 439 checkSkeleton(sc, MA, "\"", "\\u0027\\u0027", testName); 440 441 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations. 442 // (in the C implementation) 443 checkSkeleton( 444 sc, 445 SL, 446 " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations." 447 + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 448 + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 449 + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.", 450 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 451 + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 452 + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 453 + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.", 454 testName); 455 456 } 457 458 // Internal function to run a single skeleton test case. 
459 // 460 // Run a single confusable skeleton transformation test case. 461 // 462 void checkSkeleton(SpoofChecker sc, int type, String input, String expected, String testName) { 463 String uInput = Utility.unescape(input); 464 String uExpected = Utility.unescape(expected); 465 String actual; 466 actual = sc.getSkeleton(type, uInput); 467 assertEquals(testName + ": Expected (escaped): " + expected, uExpected, actual); 468 } 469 470 public void TestAreConfusable() { 471 SpoofChecker sc = new SpoofChecker.Builder().build(); 472 String s1 = "A long string that will overflow stack buffers. A long string that will overflow stack buffers. " 473 + "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "; 474 String s2 = "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " 475 + "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "; 476 assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2)); 477 } 478 479 public void TestInvisible() { 480 SpoofChecker sc = new SpoofChecker.Builder().build(); 481 String s = Utility.unescape("abcd\\u0301ef"); 482 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 483 result.position = -42; 484 assertFalse("", sc.failsChecks(s, result)); 485 assertEquals("", 0, result.checks); 486 assertEquals("", result.position, 0); 487 488 String s2 = Utility.unescape("abcd\\u0301\\u0302\\u0301ef"); 489 assertTrue("", sc.failsChecks(s2, result)); 490 assertEquals("", SpoofChecker.INVISIBLE, result.checks); 491 assertEquals("", 0, result.position); 492 493 // Two acute accents, one from the composed a with acute accent, \u00e1, 494 // and one separate. 
495 result.position = -42; 496 String s3 = Utility.unescape("abcd\\u00e1\\u0301xyz"); 497 assertTrue("", sc.failsChecks(s3, result)); 498 assertEquals("", SpoofChecker.INVISIBLE, result.checks); 499 assertEquals("", 0, result.position); 500 } 501 502 public void TestRestrictionLevel() { 503 Object[][] tests = { 504 {"aγ♥", RestrictionLevel.UNRESTRICTIVE}, 505 {"a", RestrictionLevel.ASCII}, 506 {"γ", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE}, 507 {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE}, 508 {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE}, 509 {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE}, 510 }; 511 IdentifierInfo idInfo = new IdentifierInfo().setIdentifierProfile(SpoofChecker.RECOMMENDED); 512 CheckResult checkResult = new CheckResult(); 513 for (Object[] test : tests) { 514 String testString = (String) test[0]; 515 RestrictionLevel expectedLevel = (RestrictionLevel) test[1]; 516 idInfo.setIdentifier(testString); 517 assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel()); 518 for (RestrictionLevel levelSetInSpoofChecker : RestrictionLevel.values()) { 519 SpoofChecker sc = new SpoofChecker.Builder() 520 .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this 521 .setAllowedChars(SpoofChecker.RECOMMENDED) 522 .setRestrictionLevel(levelSetInSpoofChecker) 523 .build(); 524 boolean actualValue = sc.failsChecks(testString, checkResult); 525 526 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII 527 boolean expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString); 528 boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + levelSetInSpoofChecker, expectedFailure, actualValue); 529 if (!t) { // debugging 530 actualValue = sc.failsChecks(testString, checkResult); 531 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII 532 expectedFailure = 
expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString); 533 } 534 } 535 } 536 } 537 538 public void TestMixedNumbers() { 539 Object[][] tests = { 540 {"1", "[0]"}, 541 {"१", "[०]"}, 542 {"1१", "[0०]"}, 543 {"١۱", "[٠۰]"}, 544 }; 545 IdentifierInfo idInfo = new IdentifierInfo(); 546 CheckResult checkResult = new CheckResult(); 547 for (Object[] test : tests) { 548 String testString = (String) test[0]; 549 UnicodeSet expected = new UnicodeSet((String)test[1]); 550 idInfo.setIdentifier(testString); 551 assertEquals("", expected, idInfo.getNumerics()); 552 553 SpoofChecker sc = new SpoofChecker.Builder() 554 .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this 555 .build(); 556 boolean actualValue = sc.failsChecks(testString, checkResult); 557 assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue); 558 } 559 } 560 561 public void TestIdentifierInfo() { 562// contains(BitSet, BitSet) 563 BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL); 564 BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL); 565 assertTrue("", IdentifierInfo.contains(bitset12, bitset2)); 566 assertTrue("", IdentifierInfo.contains(bitset12, bitset12)); 567 assertTrue("", !IdentifierInfo.contains(bitset2, bitset12)); 568 569 assertTrue("", IdentifierInfo.BITSET_COMPARATOR.compare( 570 IdentifierInfo.set(new BitSet(), UScript.ARABIC), 571 IdentifierInfo.set(new BitSet(), UScript.LATIN)) < 0); 572// displayAlternates(Collection<BitSet>) 573// displayScripts(BitSet) 574 String scriptString = IdentifierInfo.displayScripts(bitset12); 575 assertEquals("", "Hang Latn", scriptString); 576 Set<BitSet> alternates = new HashSet(Arrays.asList(bitset12, bitset2)); 577 String alternatesString = IdentifierInfo.displayAlternates(alternates); 578 assertEquals("", "Hang; Hang Latn", alternatesString); 579 580// parseAlternates(String) 581// parseScripts(String) 582 
assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString)); 583 assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString)); 584 585 String[][] tests = { 586 // String, restriction-level, numerics, scripts, alternates, common-alternates 587 {"a♥", "UNRESTRICTIVE", "[]", "Latn", "", ""}, 588 {"a〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"}, 589 {"aー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"}, 590 {"aー〆ア", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, 591 {"アaー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, 592 {"a1١", "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"}, 593 {"a1١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""}, 594 {"١ー〆aア1१۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, 595 {"aアー〆1१١۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, 596 }; 597 for (String[] test : tests) { 598 String testString = test[0]; 599 IdentifierInfo idInfo = new IdentifierInfo(); 600 idInfo.setIdentifierProfile(SpoofChecker.RECOMMENDED); 601 idInfo.setIdentifier(testString); 602 assertEquals("Identifier " + testString, testString, idInfo.getIdentifier()); 603 604 RestrictionLevel restrictionLevel = RestrictionLevel.valueOf(test[1]); 605 assertEquals("RestrictionLevel " + testString, restrictionLevel, idInfo.getRestrictionLevel()); 606 607 UnicodeSet numerics = new UnicodeSet(test[2]); 608 assertEquals("Numerics " + testString, numerics, idInfo.getNumerics()); 609 610 BitSet scripts = IdentifierInfo.parseScripts(test[3]); 611 assertEquals("Scripts " + testString, scripts, idInfo.getScripts()); 612 613 Set<BitSet> alternates2 = IdentifierInfo.parseAlternates(test[4]); 614 assertEquals("Alternates " + testString, alternates2, idInfo.getAlternates()); 615 616 BitSet commonAlternates = IdentifierInfo.parseScripts(test[5]); 617 assertEquals("Common Alternates " + testString, commonAlternates, 
idInfo.getCommonAmongAlternates()); 618 } 619 620// TODO 621// getIdentifierProfile() 622// setIdentifierProfile(UnicodeSet) 623 } 624 625 public void TestComparator() { 626 Random random = new Random(0); 627 for (int i = 0; i < 100; ++i) { 628 BitSet[] items = new BitSet[random.nextInt(5)+3]; 629 for (int j = 0; j < items.length; ++j) { 630 items[j] = new BitSet(); 631 int countInBitset = random.nextInt(5); 632 for (int k = 0; k < countInBitset; ++k) { 633 items[j].set(random.nextInt(10)); 634 } 635 } 636 checkComparator(IdentifierInfo.BITSET_COMPARATOR, items); 637 } 638 } 639 640 // Dumb implementation for now 641 private <T> void checkComparator(Comparator<T> comparator, T... items) { 642 logln("Checking " + Arrays.asList(items)); 643 /* 644 * The relation is transitive: a < b and b < c implies a < c. We test here. 645 * The relation is trichotomous: exactly one of a < b, b < a and a = b is true. Guaranteed by comparator. 646 */ 647 for (int i = 0; i < items.length-2; ++i) { 648 T a = items[i]; 649 for (int j = i+1; j < items.length-1; ++j) { 650 T b = items[j]; 651 for (int k = j+1; k < items.length; ++k) { 652 T c = items[k]; 653 checkTransitivity(comparator, a, b, c); 654 checkTransitivity(comparator, a, c, b); 655 checkTransitivity(comparator, b, a, b); 656 checkTransitivity(comparator, b, c, a); 657 checkTransitivity(comparator, c, a, b); 658 checkTransitivity(comparator, c, b, a); 659 } 660 } 661 } 662 } 663 664 private <T> void checkTransitivity(Comparator<T> comparator, T a, T b, T c) { 665 int ab = comparator.compare(a,b); 666 int bc = comparator.compare(b,c); 667 int ca = comparator.compare(c,a); 668 if (!assertFalse("Transitive: " + a + ", " + b + ", " + c, 669 ab < 0 && bc < 0 && ca <= 0)) { 670 // for debugging 671 comparator.compare(a,b); 672 comparator.compare(b,c); 673 comparator.compare(c,a); 674 assertFalse("Transitive: " + a + ", " + b + ", " + c, 675 ab < 0 && bc < 0 && ca <= 0); 676 } 677 } 678 679 private String parseHex(String in) { 680 
StringBuilder sb = new StringBuilder(); 681 for (String oneCharAsHexString : in.split("\\s+")) { 682 if (oneCharAsHexString.length() > 0) { 683 sb.appendCodePoint(Integer.parseInt(oneCharAsHexString, 16)); 684 } 685 } 686 return sb.toString(); 687 } 688 689 private String escapeString(String in) { 690 StringBuilder out = new StringBuilder(); 691 for (int i = 0; i < in.length(); i++) { 692 int c = in.codePointAt(i); 693 if (c <= 0x7f) { 694 out.append((char) c); 695 } else if (c <= 0xffff) { 696 out.append(String.format("\\u%04x", c)); 697 } else { 698 out.append(String.format("\\U%06x", c)); 699 i++; 700 } 701 } 702 return out.toString(); 703 } 704 705 // Verify that each item from the Unicode confusables.txt file 706 // transforms into the expected skeleton. 707 public void testConfData() { 708 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) { 709 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents 710 logln("Skip this test case because of the IBM Java 5 bug"); 711 return; 712 } 713 try { 714 // Read in the confusables.txt file. (Distributed by Unicode.org) 715 String fileName = "unicode/confusables.txt"; 716 BufferedReader confusablesRdr = TestUtil.getDataReader(fileName, "UTF-8"); 717 718 // Create a default spoof checker to use in this test. 719 SpoofChecker sc = new SpoofChecker.Builder().build(); 720 721 // Parse lines from the confusables.txt file. Example Line: 722 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... 723 // Lines have three fields. The hex fields can contain more than one character, 724 // and each character may be more than 4 digits (for supplemntals) 725 // This regular expression matches lines and splits the fields into capture groups. 726 // Capture group 1: map from chars 727 // 2: map to chars 728 // 3: table type, SL, ML, SA or MA 729 // 4: Comment Lines Only 730 // 5: Error Lines Only 731 Matcher parseLine = Pattern.compile( 732 "\\ufeff?" 
+ "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)" 733 + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line 734 matcher(""); 735 Normalizer2 normalizer = Normalizer2.getNFDInstance(); 736 int lineNum = 0; 737 String inputLine; 738 while ((inputLine = confusablesRdr.readLine()) != null) { 739 lineNum++; 740 parseLine.reset(inputLine); 741 if (!parseLine.matches()) { 742 errln("Syntax error in confusable data file at line " + lineNum); 743 errln(inputLine); 744 break; 745 } 746 if (parseLine.group(4) != null) { 747 continue; // comment line 748 } 749 String from = parseHex(parseLine.group(1)); 750 751 if (!normalizer.isNormalized(from)) { 752 // The source character was not NFD. 753 // Skip this case; the first step in obtaining a skeleton is to NFD the input, 754 // so the mapping in this line of confusables.txt will never be applied. 755 continue; 756 } 757 758 String rawExpected = parseHex(parseLine.group(2)); 759 String expected = normalizer.normalize(rawExpected); 760 761 int skeletonType = 0; 762 String tableType = parseLine.group(3); 763 if (tableType.equals("SL")) { 764 skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE; 765 } else if (tableType.indexOf("SA") >= 0) { 766 skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE; 767 } else if (tableType.indexOf("ML") >= 0) { 768 skeletonType = 0; 769 } else if (tableType.indexOf("MA") >= 0) { 770 skeletonType = SpoofChecker.ANY_CASE; 771 } 772 773 String actual; 774 actual = sc.getSkeleton(skeletonType, from); 775 776 if (!actual.equals(expected)) { 777 errln("confusables.txt: " + lineNum + ": " + parseLine.group(0)); 778 errln("Actual: " + escapeString(actual)); 779 } 780 } 781 confusablesRdr.close(); 782 } catch (IOException e) { 783 errln(e.toString()); 784 } 785 } 786} 787