1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4******************************************************************************* 5* Copyright (C) 2010-2014, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8*/ 9package com.ibm.icu.dev.test.normalizer; 10 11import java.util.Collections; 12import java.util.EnumSet; 13import java.util.Map; 14import java.util.Set; 15import java.util.TreeMap; 16 17import org.junit.Test; 18 19import com.ibm.icu.dev.test.TestFmwk; 20import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus; 21import com.ibm.icu.text.IDNA; 22 23/** 24 * UTS #46 (IDNA2008) test. 25 * @author Markus Scherer 26 * @since 2010jul10 27 */ 28public class UTS46Test extends TestFmwk { 29 public UTS46Test() { 30 int commonOptions= 31 IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI| 32 IDNA.CHECK_CONTEXTJ|IDNA.CHECK_CONTEXTO; 33 trans=IDNA.getUTS46Instance(commonOptions); 34 nontrans=IDNA.getUTS46Instance(commonOptions| 35 IDNA.NONTRANSITIONAL_TO_ASCII|IDNA.NONTRANSITIONAL_TO_UNICODE); 36 } 37 38 @Test 39 public void TestAPI() { 40 StringBuilder result=new StringBuilder(); 41 IDNA.Info info=new IDNA.Info(); 42 String input="www.eXample.cOm"; 43 String expected="www.example.com"; 44 trans.nameToASCII(input, result, info); 45 if(info.hasErrors() || !UTF16Plus.equal(result, expected)) { 46 errln(String.format("T.nameToASCII(www.example.com) info.errors=%s result matches=%b", 47 info.getErrors(), UTF16Plus.equal(result, expected))); 48 } 49 input="xn--bcher.de-65a"; 50 expected="xn--bcher\uFFFDde-65a"; 51 nontrans.labelToASCII(input, result, info); 52 if( !info.getErrors().equals(EnumSet.of(IDNA.Error.LABEL_HAS_DOT, IDNA.Error.INVALID_ACE_LABEL)) || 53 !UTF16Plus.equal(result, expected) 54 ) { 55 errln(String.format("N.labelToASCII(label-with-dot) failed with errors %s", 56 info.getErrors())); 57 } 58 // Java API tests that are not parallel to C++ tests 59 // because the C++ specifics (error codes etc.) do not apply here. 60 String resultString=trans.nameToUnicode("fA\u00DF.de", result, info).toString(); 61 if(info.hasErrors() || !resultString.equals("fass.de")) { 62 errln(String.format("T.nameToUnicode(fA\u00DF.de) info.errors=%s result matches=%b", 63 info.getErrors(), resultString.equals("fass.de"))); 64 } 65 try { 66 nontrans.labelToUnicode(result, result, info); 67 errln("N.labelToUnicode(result, result) did not throw an Exception"); 68 } catch(Exception e) { 69 // as expected (should be an IllegalArgumentException, or an ICU version of it) 70 } 71 } 72 73 @Test 74 public void TestNotSTD3() { 75 IDNA not3=IDNA.getUTS46Instance(IDNA.CHECK_BIDI); 76 String input="\u0000A_2+2=4\n.e\u00DFen.net"; 77 StringBuilder result=new StringBuilder(); 78 IDNA.Info info=new IDNA.Info(); 79 if( !not3.nameToUnicode(input, result, info).toString().equals("\u0000a_2+2=4\n.essen.net") || 80 info.hasErrors() 81 ) { 82 errln(String.format("notSTD3.nameToUnicode(non-LDH ASCII) unexpected errors %s string %s", 83 info.getErrors(), prettify(result.toString()))); 84 } 85 // A space (BiDi class WS) is not allowed in a BiDi domain name. 86 input="a z.xn--4db.edu"; 87 not3.nameToASCII(input, result, info); 88 if(!UTF16Plus.equal(result, input) || !info.getErrors().equals(EnumSet.of(IDNA.Error.BIDI))) { 89 errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed"); 90 } 91 // Characters that are canonically equivalent to sequences with non-LDH ASCII. 92 input="a\u2260b\u226Ec\u226Fd"; 93 not3.nameToUnicode(input, result, info); 94 if(!UTF16Plus.equal(result, input) || info.hasErrors()) { 95 errln(String.format("notSTD3.nameToUnicode(equiv to non-LDH ASCII) unexpected errors %s string %s", 96 info.getErrors().toString(), prettify(result.toString()))); 97 } 98 } 99 100 private static final Map<String, IDNA.Error> errorNamesToErrors; 101 static { 102 errorNamesToErrors=new TreeMap<String, IDNA.Error>(); 103 errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL); 104 errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG); 105 errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG); 106 errorNamesToErrors.put("UIDNA_ERROR_LEADING_HYPHEN", IDNA.Error.LEADING_HYPHEN); 107 errorNamesToErrors.put("UIDNA_ERROR_TRAILING_HYPHEN", IDNA.Error.TRAILING_HYPHEN); 108 errorNamesToErrors.put("UIDNA_ERROR_HYPHEN_3_4", IDNA.Error.HYPHEN_3_4); 109 errorNamesToErrors.put("UIDNA_ERROR_LEADING_COMBINING_MARK", IDNA.Error.LEADING_COMBINING_MARK); 110 errorNamesToErrors.put("UIDNA_ERROR_DISALLOWED", IDNA.Error.DISALLOWED); 111 errorNamesToErrors.put("UIDNA_ERROR_PUNYCODE", IDNA.Error.PUNYCODE); 112 errorNamesToErrors.put("UIDNA_ERROR_LABEL_HAS_DOT", IDNA.Error.LABEL_HAS_DOT); 113 errorNamesToErrors.put("UIDNA_ERROR_INVALID_ACE_LABEL", IDNA.Error.INVALID_ACE_LABEL); 114 errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI); 115 errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ); 116 errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_PUNCTUATION", IDNA.Error.CONTEXTO_PUNCTUATION); 117 errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_DIGITS", IDNA.Error.CONTEXTO_DIGITS); 118 } 119 120 private static final class TestCase { 121 private TestCase() { 122 errors=EnumSet.noneOf(IDNA.Error.class); 123 } 124 private void set(String[] data) { 125 s=data[0]; 126 o=data[1]; 127 u=data[2]; 128 errors.clear(); 129 if(data[3].length()!=0) { 130 for(String e: data[3].split("\\|")) { 131 errors.add(errorNamesToErrors.get(e)); 132 } 133 } 134 } 135 // Input string and options string (Nontransitional/Transitional/Both). 136 private String s, o; 137 // Expected Unicode result string. 138 private String u; 139 private EnumSet<IDNA.Error> errors; 140 }; 141 142 private static final String testCases[][]={ 143 { "www.eXample.cOm", "B", // all ASCII 144 "www.example.com", "" }, 145 { "B\u00FCcher.de", "B", // u-umlaut 146 "b\u00FCcher.de", "" }, 147 { "\u00D6BB", "B", // O-umlaut 148 "\u00F6bb", "" }, 149 { "fa\u00DF.de", "N", // sharp s 150 "fa\u00DF.de", "" }, 151 { "fa\u00DF.de", "T", // sharp s 152 "fass.de", "" }, 153 { "XN--fA-hia.dE", "B", // sharp s in Punycode 154 "fa\u00DF.de", "" }, 155 { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "N", // Greek with final sigma 156 "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "" }, 157 { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "T", // Greek with final sigma 158 "\u03B2\u03CC\u03BB\u03BF\u03C3.com", "" }, 159 { "xn--nxasmm1c", "B", // Greek with final sigma in Punycode 160 "\u03B2\u03CC\u03BB\u03BF\u03C2", "" }, 161 { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "N", // "Sri" in "Sri Lanka" has a ZWJ 162 "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" }, 163 { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "T", // "Sri" in "Sri Lanka" has a ZWJ 164 "www.\u0DC1\u0DCA\u0DBB\u0DD3.com", "" }, 165 { "www.xn--10cl1a0b660p.com", "B", // "Sri" in Punycode 166 "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" }, 167 { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "N", // ZWNJ 168 "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "" }, 169 { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "T", // ZWNJ 170 "\u0646\u0627\u0645\u0647\u0627\u06CC", "" }, 171 { "xn--mgba3gch31f060k.com", "B", // ZWNJ in Punycode 172 "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC.com", "" }, 173 { "a.b\uFF0Ec\u3002d\uFF61", "B", 174 "a.b.c.d.", "" }, 175 { "U\u0308.xn--tda", "B", // U+umlaut.u-umlaut 176 "\u00FC.\u00FC", "" }, 177 { "xn--u-ccb", "B", // u+umlaut in Punycode 178 "xn--u-ccb\uFFFD", "UIDNA_ERROR_INVALID_ACE_LABEL" }, 179 { "a\u2488com", "B", // contains 1-dot 180 "a\uFFFDcom", "UIDNA_ERROR_DISALLOWED" }, 181 { "xn--a-ecp.ru", "B", // contains 1-dot in Punycode 182 "xn--a-ecp\uFFFD.ru", "UIDNA_ERROR_INVALID_ACE_LABEL" }, 183 { "xn--0.pt", "B", // invalid Punycode 184 "xn--0\uFFFD.pt", "UIDNA_ERROR_PUNYCODE" }, 185 { "xn--a.pt", "B", // U+0080 186 "xn--a\uFFFD.pt", "UIDNA_ERROR_INVALID_ACE_LABEL" }, 187 { "xn--a-\u00C4.pt", "B", // invalid Punycode 188 "xn--a-\u00E4.pt", "UIDNA_ERROR_PUNYCODE" }, 189 { "\u65E5\u672C\u8A9E\u3002\uFF2A\uFF30", "B", // Japanese with fullwidth ".jp" 190 "\u65E5\u672C\u8A9E.jp", "" }, 191 { "\u2615", "B", "\u2615", "" }, // Unicode 4.0 HOT BEVERAGE 192 // some characters are disallowed because they are canonically equivalent 193 // to sequences with non-LDH ASCII 194 { "a\u2260b\u226Ec\u226Fd", "B", 195 "a\uFFFDb\uFFFDc\uFFFDd", "UIDNA_ERROR_DISALLOWED" }, 196 // many deviation characters, test the special mapping code 197 { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+ 198 "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+ 199 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+ 200 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+ 201 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "N", 202 "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+ 203 "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+ 204 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+ 205 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+ 206 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", 207 "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_CONTEXTJ" }, 208 { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+ 209 "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+ 210 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+ 211 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+ 212 "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "T", 213 "1.assbcssssssssd"+ 214 "\u03C3\u03C3sssssssssssssssse"+ 215 "ssssssssssssssssssssx"+ 216 "ssssssssssssssssssssy"+ 217 "sssssssssssssss\u015Dssz", "UIDNA_ERROR_LABEL_TOO_LONG" }, 218 // "xn--bss" with deviation characters 219 { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "N", 220 "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "UIDNA_ERROR_CONTEXTJ" }, 221 { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "T", 222 "\u5919", "" }, 223 // "xn--bssffl" written as: 224 // 02E3 MODIFIER LETTER SMALL X 225 // 034F COMBINING GRAPHEME JOINER (ignored) 226 // 2115 DOUBLE-STRUCK CAPITAL N 227 // 200B ZERO WIDTH SPACE (ignored) 228 // FE63 SMALL HYPHEN-MINUS 229 // 00AD SOFT HYPHEN (ignored) 230 // FF0D FULLWIDTH HYPHEN-MINUS 231 // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored) 232 // 212C SCRIPT CAPITAL B 233 // FE00 VARIATION SELECTOR-1 (ignored) 234 // 017F LATIN SMALL LETTER LONG S 235 // 2064 INVISIBLE PLUS (ignored) 236 // 1D530 MATHEMATICAL FRAKTUR SMALL S 237 // E01EF VARIATION SELECTOR-256 (ignored) 238 // FB04 LATIN SMALL LIGATURE FFL 239 { "\u02E3\u034F\u2115\u200B\uFE63\u00AD\uFF0D\u180C"+ 240 "\u212C\uFE00\u017F\u2064"+"\uD835\uDD30\uDB40\uDDEF"/*1D530 E01EF*/+"\uFB04", "B", 241 "\u5921\u591E\u591C\u5919", "" }, 242 { "123456789012345678901234567890123456789012345678901234567890123."+ 243 "123456789012345678901234567890123456789012345678901234567890123."+ 244 "123456789012345678901234567890123456789012345678901234567890123."+ 245 "1234567890123456789012345678901234567890123456789012345678901", "B", 246 "123456789012345678901234567890123456789012345678901234567890123."+ 247 "123456789012345678901234567890123456789012345678901234567890123."+ 248 "123456789012345678901234567890123456789012345678901234567890123."+ 249 "1234567890123456789012345678901234567890123456789012345678901", "" }, 250 { "123456789012345678901234567890123456789012345678901234567890123."+ 251 "123456789012345678901234567890123456789012345678901234567890123."+ 252 "123456789012345678901234567890123456789012345678901234567890123."+ 253 "1234567890123456789012345678901234567890123456789012345678901.", "B", 254 "123456789012345678901234567890123456789012345678901234567890123."+ 255 "123456789012345678901234567890123456789012345678901234567890123."+ 256 "123456789012345678901234567890123456789012345678901234567890123."+ 257 "1234567890123456789012345678901234567890123456789012345678901.", "" }, 258 // Domain name >256 characters, forces slow path in UTF-8 processing. 259 { "123456789012345678901234567890123456789012345678901234567890123."+ 260 "123456789012345678901234567890123456789012345678901234567890123."+ 261 "123456789012345678901234567890123456789012345678901234567890123."+ 262 "123456789012345678901234567890123456789012345678901234567890123."+ 263 "12345678901234567890123456789012345678901234567890123456789012", "B", 264 "123456789012345678901234567890123456789012345678901234567890123."+ 265 "123456789012345678901234567890123456789012345678901234567890123."+ 266 "123456789012345678901234567890123456789012345678901234567890123."+ 267 "123456789012345678901234567890123456789012345678901234567890123."+ 268 "12345678901234567890123456789012345678901234567890123456789012", 269 "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, 270 { "123456789012345678901234567890123456789012345678901234567890123."+ 271 "123456789012345678901234567890123456789012345678901234567890123."+ 272 "123456789012345678901234567890123456789012345678901234567890123."+ 273 "123456789012345678901234567890123456789012345678901234567890123."+ 274 "1234567890123456789012345678901234567890123456789\u05D0", "B", 275 "123456789012345678901234567890123456789012345678901234567890123."+ 276 "123456789012345678901234567890123456789012345678901234567890123."+ 277 "123456789012345678901234567890123456789012345678901234567890123."+ 278 "123456789012345678901234567890123456789012345678901234567890123."+ 279 "1234567890123456789012345678901234567890123456789\u05D0", 280 "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG|UIDNA_ERROR_BIDI" }, 281 { "123456789012345678901234567890123456789012345678901234567890123."+ 282 "1234567890123456789012345678901234567890123456789012345678901234."+ 283 "123456789012345678901234567890123456789012345678901234567890123."+ 284 "123456789012345678901234567890123456789012345678901234567890", "B", 285 "123456789012345678901234567890123456789012345678901234567890123."+ 286 "1234567890123456789012345678901234567890123456789012345678901234."+ 287 "123456789012345678901234567890123456789012345678901234567890123."+ 288 "123456789012345678901234567890123456789012345678901234567890", 289 "UIDNA_ERROR_LABEL_TOO_LONG" }, 290 { "123456789012345678901234567890123456789012345678901234567890123."+ 291 "1234567890123456789012345678901234567890123456789012345678901234."+ 292 "123456789012345678901234567890123456789012345678901234567890123."+ 293 "123456789012345678901234567890123456789012345678901234567890.", "B", 294 "123456789012345678901234567890123456789012345678901234567890123."+ 295 "1234567890123456789012345678901234567890123456789012345678901234."+ 296 "123456789012345678901234567890123456789012345678901234567890123."+ 297 "123456789012345678901234567890123456789012345678901234567890.", 298 "UIDNA_ERROR_LABEL_TOO_LONG" }, 299 { "123456789012345678901234567890123456789012345678901234567890123."+ 300 "1234567890123456789012345678901234567890123456789012345678901234."+ 301 "123456789012345678901234567890123456789012345678901234567890123."+ 302 "1234567890123456789012345678901234567890123456789012345678901", "B", 303 "123456789012345678901234567890123456789012345678901234567890123."+ 304 "1234567890123456789012345678901234567890123456789012345678901234."+ 305 "123456789012345678901234567890123456789012345678901234567890123."+ 306 "1234567890123456789012345678901234567890123456789012345678901", 307 "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, 308 // label length 63: xn--1234567890123456789012345678901234567890123456789012345-9te 309 { "\u00E41234567890123456789012345678901234567890123456789012345", "B", 310 "\u00E41234567890123456789012345678901234567890123456789012345", "" }, 311 { "1234567890\u00E41234567890123456789012345678901234567890123456", "B", 312 "1234567890\u00E41234567890123456789012345678901234567890123456", "UIDNA_ERROR_LABEL_TOO_LONG" }, 313 { "123456789012345678901234567890123456789012345678901234567890123."+ 314 "1234567890\u00E4123456789012345678901234567890123456789012345."+ 315 "123456789012345678901234567890123456789012345678901234567890123."+ 316 "1234567890123456789012345678901234567890123456789012345678901", "B", 317 "123456789012345678901234567890123456789012345678901234567890123."+ 318 "1234567890\u00E4123456789012345678901234567890123456789012345."+ 319 "123456789012345678901234567890123456789012345678901234567890123."+ 320 "1234567890123456789012345678901234567890123456789012345678901", "" }, 321 { "123456789012345678901234567890123456789012345678901234567890123."+ 322 "1234567890\u00E4123456789012345678901234567890123456789012345."+ 323 "123456789012345678901234567890123456789012345678901234567890123."+ 324 "1234567890123456789012345678901234567890123456789012345678901.", "B", 325 "123456789012345678901234567890123456789012345678901234567890123."+ 326 "1234567890\u00E4123456789012345678901234567890123456789012345."+ 327 "123456789012345678901234567890123456789012345678901234567890123."+ 328 "1234567890123456789012345678901234567890123456789012345678901.", "" }, 329 { "123456789012345678901234567890123456789012345678901234567890123."+ 330 "1234567890\u00E4123456789012345678901234567890123456789012345."+ 331 "123456789012345678901234567890123456789012345678901234567890123."+ 332 "12345678901234567890123456789012345678901234567890123456789012", "B", 333 "123456789012345678901234567890123456789012345678901234567890123."+ 334 "1234567890\u00E4123456789012345678901234567890123456789012345."+ 335 "123456789012345678901234567890123456789012345678901234567890123."+ 336 "12345678901234567890123456789012345678901234567890123456789012", 337 "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, 338 { "123456789012345678901234567890123456789012345678901234567890123."+ 339 "1234567890\u00E41234567890123456789012345678901234567890123456."+ 340 "123456789012345678901234567890123456789012345678901234567890123."+ 341 "123456789012345678901234567890123456789012345678901234567890", "B", 342 "123456789012345678901234567890123456789012345678901234567890123."+ 343 "1234567890\u00E41234567890123456789012345678901234567890123456."+ 344 "123456789012345678901234567890123456789012345678901234567890123."+ 345 "123456789012345678901234567890123456789012345678901234567890", 346 "UIDNA_ERROR_LABEL_TOO_LONG" }, 347 { "123456789012345678901234567890123456789012345678901234567890123."+ 348 "1234567890\u00E41234567890123456789012345678901234567890123456."+ 349 "123456789012345678901234567890123456789012345678901234567890123."+ 350 "123456789012345678901234567890123456789012345678901234567890.", "B", 351 "123456789012345678901234567890123456789012345678901234567890123."+ 352 "1234567890\u00E41234567890123456789012345678901234567890123456."+ 353 "123456789012345678901234567890123456789012345678901234567890123."+ 354 "123456789012345678901234567890123456789012345678901234567890.", 355 "UIDNA_ERROR_LABEL_TOO_LONG" }, 356 { "123456789012345678901234567890123456789012345678901234567890123."+ 357 "1234567890\u00E41234567890123456789012345678901234567890123456."+ 358 "123456789012345678901234567890123456789012345678901234567890123."+ 359 "1234567890123456789012345678901234567890123456789012345678901", "B", 360 "123456789012345678901234567890123456789012345678901234567890123."+ 361 "1234567890\u00E41234567890123456789012345678901234567890123456."+ 362 "123456789012345678901234567890123456789012345678901234567890123."+ 363 "1234567890123456789012345678901234567890123456789012345678901", 364 "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, 365 // hyphen errors and empty-label errors 366 // Ticket #10883: ToUnicode also checks for empty labels. 367 { ".", "B", ".", "UIDNA_ERROR_EMPTY_LABEL" }, 368 { "\uFF0E", "B", ".", "UIDNA_ERROR_EMPTY_LABEL" }, 369 // "xn---q----jra"=="-q--a-umlaut-" 370 { "a.b..-q--a-.e", "B", "a.b..-q--a-.e", 371 "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+ 372 "UIDNA_ERROR_HYPHEN_3_4" }, 373 { "a.b..-q--\u00E4-.e", "B", "a.b..-q--\u00E4-.e", 374 "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+ 375 "UIDNA_ERROR_HYPHEN_3_4" }, 376 { "a.b..xn---q----jra.e", "B", "a.b..-q--\u00E4-.e", 377 "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+ 378 "UIDNA_ERROR_HYPHEN_3_4" }, 379 { "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" }, 380 { "a.xn--.c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" }, 381 { "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" }, 382 { "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" }, 383 { "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" }, 384 { "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" }, 385 { "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" }, 386 { "\u00E4.xn--.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" }, 387 { "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" }, 388 { "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" }, 389 { "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" }, 390 { "\u00E4.bc--de.f", "B", "\u00E4.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" }, 391 { "a.b.\u0308c.d", "B", "a.b.\uFFFDc.d", "UIDNA_ERROR_LEADING_COMBINING_MARK" }, 392 { "a.b.xn--c-bcb.d", "B", 393 "a.b.xn--c-bcb\uFFFD.d", "UIDNA_ERROR_LEADING_COMBINING_MARK|UIDNA_ERROR_INVALID_ACE_LABEL" }, 394 // BiDi 395 { "A0", "B", "a0", "" }, 396 { "0A", "B", "0a", "" }, // all-LTR is ok to start with a digit (EN) 397 { "0A.\u05D0", "B", // ASCII label does not start with L/R/AL 398 "0a.\u05D0", "UIDNA_ERROR_BIDI" }, 399 { "c.xn--0-eha.xn--4db", "B", // 2nd label does not start with L/R/AL 400 "c.0\u00FC.\u05D0", "UIDNA_ERROR_BIDI" }, 401 { "b-.\u05D0", "B", // label does not end with L/EN 402 "b-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" }, 403 { "d.xn----dha.xn--4db", "B", // 2nd label does not end with L/EN 404 "d.\u00FC-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" }, 405 { "a\u05D0", "B", "a\u05D0", "UIDNA_ERROR_BIDI" }, // first dir != last dir 406 { "\u05D0\u05C7", "B", "\u05D0\u05C7", "" }, 407 { "\u05D09\u05C7", "B", "\u05D09\u05C7", "" }, 408 { "\u05D0a\u05C7", "B", "\u05D0a\u05C7", "UIDNA_ERROR_BIDI" }, // first dir != last dir 409 { "\u05D0\u05EA", "B", "\u05D0\u05EA", "" }, 410 { "\u05D0\u05F3\u05EA", "B", "\u05D0\u05F3\u05EA", "" }, 411 { "a\u05D0Tz", "B", "a\u05D0tz", "UIDNA_ERROR_BIDI" }, // mixed dir 412 { "\u05D0T\u05EA", "B", "\u05D0t\u05EA", "UIDNA_ERROR_BIDI" }, // mixed dir 413 { "\u05D07\u05EA", "B", "\u05D07\u05EA", "" }, 414 { "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", "" }, // Arabic 7 in the middle 415 { "a7\u0667z", "B", "a7\u0667z", "UIDNA_ERROR_BIDI" }, // AN digit in LTR 416 { "\u05D07\u0667\u05EA", "B", // mixed EN/AN digits in RTL 417 "\u05D07\u0667\u05EA", "UIDNA_ERROR_BIDI" }, 418 // ZWJ 419 { "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", "" }, // Virama+ZWJ 420 { "\u0BB9\u200D", "N", "\u0BB9\u200D", "UIDNA_ERROR_CONTEXTJ" }, // no Virama 421 { "\u200D", "N", "\u200D", "UIDNA_ERROR_CONTEXTJ" }, // no Virama 422 // ZWNJ 423 { "\u0BB9\u0BCD\u200C", "N", "\u0BB9\u0BCD\u200C", "" }, // Virama+ZWNJ 424 { "\u0BB9\u200C", "N", "\u0BB9\u200C", "UIDNA_ERROR_CONTEXTJ" }, // no Virama 425 { "\u200C", "N", "\u200C", "UIDNA_ERROR_CONTEXTJ" }, // no Virama 426 { "\u0644\u0670\u200C\u06ED\u06EF", "N", // Joining types D T ZWNJ T R 427 "\u0644\u0670\u200C\u06ED\u06EF", "" }, 428 { "\u0644\u0670\u200C\u06EF", "N", // D T ZWNJ R 429 "\u0644\u0670\u200C\u06EF", "" }, 430 { "\u0644\u200C\u06ED\u06EF", "N", // D ZWNJ T R 431 "\u0644\u200C\u06ED\u06EF", "" }, 432 { "\u0644\u200C\u06EF", "N", // D ZWNJ R 433 "\u0644\u200C\u06EF", "" }, 434 { "\u0644\u0670\u200C\u06ED", "N", // D T ZWNJ T 435 "\u0644\u0670\u200C\u06ED", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" }, 436 { "\u06EF\u200C\u06EF", "N", // R ZWNJ R 437 "\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" }, 438 { "\u0644\u200C", "N", // D ZWNJ 439 "\u0644\u200C", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" }, 440 { "\u0660\u0661", "B", // Arabic-Indic Digits alone 441 "\u0660\u0661", "UIDNA_ERROR_BIDI" }, 442 { "\u06F0\u06F1", "B", // Extended Arabic-Indic Digits alone 443 "\u06F0\u06F1", "" }, 444 { "\u0660\u06F1", "B", // Mixed Arabic-Indic Digits 445 "\u0660\u06F1", "UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI" }, 446 // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters 447 // in their correct contexts, 448 // then each in incorrect context. 449 { "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "B", 450 "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "UIDNA_ERROR_BIDI" }, 451 { "l\u00B7", "B", 452 "l\u00B7", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, 453 { "\u00B7l", "B", 454 "\u00B7l", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, 455 { "\u0375", "B", 456 "\u0375", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, 457 { "\u03B1\u05F3", "B", 458 "\u03B1\u05F3", "UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI" }, 459 { "\u05F4", "B", 460 "\u05F4", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, 461 { "l\u30FB", "B", 462 "l\u30FB", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, 463 // { "", "B", 464 // "", "" }, 465 }; 466 467 @Test 468 public void TestSomeCases() { 469 StringBuilder aT=new StringBuilder(), uT=new StringBuilder(); 470 StringBuilder aN=new StringBuilder(), uN=new StringBuilder(); 471 IDNA.Info aTInfo=new IDNA.Info(), uTInfo=new IDNA.Info(); 472 IDNA.Info aNInfo=new IDNA.Info(), uNInfo=new IDNA.Info(); 473 474 StringBuilder aTuN=new StringBuilder(), uTaN=new StringBuilder(); 475 StringBuilder aNuN=new StringBuilder(), uNaN=new StringBuilder(); 476 IDNA.Info aTuNInfo=new IDNA.Info(), uTaNInfo=new IDNA.Info(); 477 IDNA.Info aNuNInfo=new IDNA.Info(), uNaNInfo=new IDNA.Info(); 478 479 StringBuilder aTL=new StringBuilder(), uTL=new StringBuilder(); 480 StringBuilder aNL=new StringBuilder(), uNL=new StringBuilder(); 481 IDNA.Info aTLInfo=new IDNA.Info(), uTLInfo=new IDNA.Info(); 482 IDNA.Info aNLInfo=new IDNA.Info(), uNLInfo=new IDNA.Info(); 483 484 EnumSet<IDNA.Error> uniErrors=EnumSet.noneOf(IDNA.Error.class); 485 486 TestCase testCase=new TestCase(); 487 int i; 488 for(i=0; i<testCases.length; ++i) { 489 testCase.set(testCases[i]); 490 String input=testCase.s; 491 String expected=testCase.u; 492 // ToASCII/ToUnicode, transitional/nontransitional 493 try { 494 trans.nameToASCII(input, aT, aTInfo); 495 trans.nameToUnicode(input, uT, uTInfo); 496 nontrans.nameToASCII(input, aN, aNInfo); 497 nontrans.nameToUnicode(input, uN, uNInfo); 498 } catch(Exception e) { 499 errln(String.format("first-level processing [%d/%s] %s - %s", 500 i, testCase.o, testCase.s, e)); 501 continue; 502 } 503 // ToUnicode does not set length-overflow errors. 504 uniErrors.clear(); 505 uniErrors.addAll(testCase.errors); 506 uniErrors.removeAll(lengthOverflowErrors); 507 char mode=testCase.o.charAt(0); 508 if(mode=='B' || mode=='N') { 509 if(!sameErrors(uNInfo, uniErrors)) { 510 errln(String.format("N.nameToUnicode([%d] %s) unexpected errors %s", 511 i, testCase.s, uNInfo.getErrors())); 512 continue; 513 } 514 if(!UTF16Plus.equal(uN, expected)) { 515 errln(String.format("N.nameToUnicode([%d] %s) unexpected string %s", 516 i, testCase.s, prettify(uN.toString()))); 517 continue; 518 } 519 if(!sameErrors(aNInfo, testCase.errors)) { 520 errln(String.format("N.nameToASCII([%d] %s) unexpected errors %s", 521 i, testCase.s, aNInfo.getErrors())); 522 continue; 523 } 524 } 525 if(mode=='B' || mode=='T') { 526 if(!sameErrors(uTInfo, uniErrors)) { 527 errln(String.format("T.nameToUnicode([%d] %s) unexpected errors %s", 528 i, testCase.s, uTInfo.getErrors())); 529 continue; 530 } 531 if(!UTF16Plus.equal(uT, expected)) { 532 errln(String.format("T.nameToUnicode([%d] %s) unexpected string %s", 533 i, testCase.s, prettify(uT.toString()))); 534 continue; 535 } 536 if(!sameErrors(aTInfo, testCase.errors)) { 537 errln(String.format("T.nameToASCII([%d] %s) unexpected errors %s", 538 i, testCase.s, aTInfo.getErrors())); 539 continue; 540 } 541 } 542 // ToASCII is all-ASCII if no severe errors 543 if(!hasCertainErrors(aNInfo, severeErrors) && !isASCII(aN)) { 544 errln(String.format("N.nameToASCII([%d] %s) (errors %s) result is not ASCII %s", 545 i, testCase.s, aNInfo.getErrors(), prettify(aN.toString()))); 546 continue; 547 } 548 if(!hasCertainErrors(aTInfo, severeErrors) && !isASCII(aT)) { 549 errln(String.format("T.nameToASCII([%d] %s) (errors %s) result is not ASCII %s", 550 i, testCase.s, aTInfo.getErrors(), prettify(aT.toString()))); 551 continue; 552 } 553 if(isVerbose()) { 554 char m= mode=='B' ? mode : 'N'; 555 logln(String.format("%c.nameToASCII([%d] %s) (errors %s) result string: %s", 556 m, i, testCase.s, aNInfo.getErrors(), prettify(aN.toString()))); 557 if(mode!='B') { 558 logln(String.format("T.nameToASCII([%d] %s) (errors %s) result string: %s", 559 i, testCase.s, aTInfo.getErrors(), prettify(aT.toString()))); 560 } 561 } 562 // second-level processing 563 try { 564 nontrans.nameToUnicode(aT, aTuN, aTuNInfo); 565 nontrans.nameToASCII(uT, uTaN, uTaNInfo); 566 nontrans.nameToUnicode(aN, aNuN, aNuNInfo); 567 nontrans.nameToASCII(uN, uNaN, uNaNInfo); 568 } catch(Exception e) { 569 errln(String.format("second-level processing [%d/%s] %s - %s", 570 i, testCase.o, testCase.s, e)); 571 continue; 572 } 573 if(!UTF16Plus.equal(aN, uNaN)) { 574 errln(String.format("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() "+ 575 "(errors %s) %s vs. %s", 576 i, testCase.s, aNInfo.getErrors(), 577 prettify(aN.toString()), prettify(uNaN.toString()))); 578 continue; 579 } 580 if(!UTF16Plus.equal(aT, uTaN)) { 581 errln(String.format("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() "+ 582 "(errors %s) %s vs. %s", 583 i, testCase.s, aNInfo.getErrors(), 584 prettify(aT.toString()), prettify(uTaN.toString()))); 585 continue; 586 } 587 if(!UTF16Plus.equal(uN, aNuN)) { 588 errln(String.format("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() "+ 589 "(errors %s) %s vs. %s", 590 i, testCase.s, uNInfo.getErrors(), prettify(uN.toString()), prettify(aNuN.toString()))); 591 continue; 592 } 593 if(!UTF16Plus.equal(uT, aTuN)) { 594 errln(String.format("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() "+ 595 "(errors %s) %s vs. %s", 596 i, testCase.s, uNInfo.getErrors(), 597 prettify(uT.toString()), prettify(aTuN.toString()))); 598 continue; 599 } 600 // labelToUnicode 601 try { 602 trans.labelToASCII(input, aTL, aTLInfo); 603 trans.labelToUnicode(input, uTL, uTLInfo); 604 nontrans.labelToASCII(input, aNL, aNLInfo); 605 nontrans.labelToUnicode(input, uNL, uNLInfo); 606 } catch(Exception e) { 607 errln(String.format("labelToXYZ processing [%d/%s] %s - %s", 608 i, testCase.o, testCase.s, e)); 609 continue; 610 } 611 if(aN.indexOf(".")<0) { 612 if(!UTF16Plus.equal(aN, aNL) || !sameErrors(aNInfo, aNLInfo)) { 613 errln(String.format("N.nameToASCII([%d] %s)!=N.labelToASCII() "+ 614 "(errors %s vs %s) %s vs. %s", 615 i, testCase.s, aNInfo.getErrors().toString(), aNLInfo.getErrors().toString(), 616 prettify(aN.toString()), prettify(aNL.toString()))); 617 continue; 618 } 619 } else { 620 if(!hasError(aNLInfo, IDNA.Error.LABEL_HAS_DOT)) { 621 errln(String.format("N.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT", 622 i, testCase.s, aNLInfo.getErrors())); 623 continue; 624 } 625 } 626 if(aT.indexOf(".")<0) { 627 if(!UTF16Plus.equal(aT, aTL) || !sameErrors(aTInfo, aTLInfo)) { 628 errln(String.format("T.nameToASCII([%d] %s)!=T.labelToASCII() "+ 629 "(errors %s vs %s) %s vs. %s", 630 i, testCase.s, aTInfo.getErrors().toString(), aTLInfo.getErrors().toString(), 631 prettify(aT.toString()), prettify(aTL.toString()))); 632 continue; 633 } 634 } else { 635 if(!hasError(aTLInfo, IDNA.Error.LABEL_HAS_DOT)) { 636 errln(String.format("T.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT", 637 i, testCase.s, aTLInfo.getErrors())); 638 continue; 639 } 640 } 641 if(uN.indexOf(".")<0) { 642 if(!UTF16Plus.equal(uN, uNL) || !sameErrors(uNInfo, uNLInfo)) { 643 errln(String.format("N.nameToUnicode([%d] %s)!=N.labelToUnicode() "+ 644 "(errors %s vs %s) %s vs. %s", 645 i, testCase.s, uNInfo.getErrors().toString(), uNLInfo.getErrors().toString(), 646 prettify(uN.toString()), prettify(uNL.toString()))); 647 continue; 648 } 649 } else { 650 if(!hasError(uNLInfo, IDNA.Error.LABEL_HAS_DOT)) { 651 errln(String.format("N.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT", 652 i, testCase.s, uNLInfo.getErrors())); 653 continue; 654 } 655 } 656 if(uT.indexOf(".")<0) { 657 if(!UTF16Plus.equal(uT, uTL) || !sameErrors(uTInfo, uTLInfo)) { 658 errln(String.format("T.nameToUnicode([%d] %s)!=T.labelToUnicode() "+ 659 "(errors %s vs %s) %s vs. %s", 660 i, testCase.s, uTInfo.getErrors().toString(), uTLInfo.getErrors().toString(), 661 prettify(uT.toString()), prettify(uTL.toString()))); 662 continue; 663 } 664 } else { 665 if(!hasError(uTLInfo, IDNA.Error.LABEL_HAS_DOT)) { 666 errln(String.format("T.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT", 667 i, testCase.s, uTLInfo.getErrors())); 668 continue; 669 } 670 } 671 // Differences between transitional and nontransitional processing 672 if(mode=='B') { 673 if( aNInfo.isTransitionalDifferent() || 674 aTInfo.isTransitionalDifferent() || 675 uNInfo.isTransitionalDifferent() || 676 uTInfo.isTransitionalDifferent() || 677 aNLInfo.isTransitionalDifferent() || 678 aTLInfo.isTransitionalDifferent() || 679 uNLInfo.isTransitionalDifferent() || 680 uTLInfo.isTransitionalDifferent() 681 ) { 682 errln(String.format("B.process([%d] %s) isTransitionalDifferent()", i, testCase.s)); 683 continue; 684 } 685 if( !UTF16Plus.equal(aN, aT) || !UTF16Plus.equal(uN, uT) || 686 !UTF16Plus.equal(aNL, aTL) || !UTF16Plus.equal(uNL, uTL) || 687 !sameErrors(aNInfo, aTInfo) || !sameErrors(uNInfo, uTInfo) || 688 !sameErrors(aNLInfo, aTLInfo) || !sameErrors(uNLInfo, uTLInfo) 689 ) { 690 errln(String.format("N.process([%d] %s) vs. T.process() different errors or result strings", 691 i, testCase.s)); 692 continue; 693 } 694 } else { 695 if( !aNInfo.isTransitionalDifferent() || 696 !aTInfo.isTransitionalDifferent() || 697 !uNInfo.isTransitionalDifferent() || 698 !uTInfo.isTransitionalDifferent() || 699 !aNLInfo.isTransitionalDifferent() || 700 !aTLInfo.isTransitionalDifferent() || 701 !uNLInfo.isTransitionalDifferent() || 702 !uTLInfo.isTransitionalDifferent() 703 ) { 704 errln(String.format("%s.process([%d] %s) !isTransitionalDifferent()", 705 testCase.o, i, testCase.s)); 706 continue; 707 } 708 if( UTF16Plus.equal(aN, aT) || UTF16Plus.equal(uN, uT) || 709 UTF16Plus.equal(aNL, aTL) || UTF16Plus.equal(uNL, uTL) 710 ) { 711 errln(String.format("N.process([%d] %s) vs. T.process() same result strings", 712 i, testCase.s)); 713 continue; 714 } 715 } 716 } 717 } 718 719 private final IDNA trans, nontrans; 720 721 private static final EnumSet<IDNA.Error> severeErrors=EnumSet.of( 722 IDNA.Error.LEADING_COMBINING_MARK, 723 IDNA.Error.DISALLOWED, 724 IDNA.Error.PUNYCODE, 725 IDNA.Error.LABEL_HAS_DOT, 726 IDNA.Error.INVALID_ACE_LABEL); 727 private static final EnumSet<IDNA.Error> lengthOverflowErrors=EnumSet.of( 728 IDNA.Error.LABEL_TOO_LONG, 729 IDNA.Error.DOMAIN_NAME_TOO_LONG); 730 731 private boolean hasError(IDNA.Info info, IDNA.Error error) { 732 return info.getErrors().contains(error); 733 } 734 // assumes that certainErrors is not empty 735 private boolean hasCertainErrors(Set<IDNA.Error> errors, Set<IDNA.Error> certainErrors) { 736 return !errors.isEmpty() && !Collections.disjoint(errors, certainErrors); 737 } 738 private boolean hasCertainErrors(IDNA.Info info, Set<IDNA.Error> certainErrors) { 739 return hasCertainErrors(info.getErrors(), certainErrors); 740 } 741 private boolean sameErrors(Set<IDNA.Error> a, Set<IDNA.Error> b) { 742 return a.equals(b); 743 } 744 private boolean sameErrors(IDNA.Info a, IDNA.Info b) { 745 return sameErrors(a.getErrors(), b.getErrors()); 746 } 747 private boolean sameErrors(IDNA.Info a, Set<IDNA.Error> b) { 748 return sameErrors(a.getErrors(), b); 749 } 750 751 private static boolean 752 isASCII(CharSequence str) { 753 int length=str.length(); 754 for(int i=0; i<length; ++i) { 755 if(str.charAt(i)>=0x80) { 756 return false; 757 } 758 } 759 return true; 760 } 761} 762