1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2010-2014, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*/
9package com.ibm.icu.dev.test.normalizer;
10
11import java.util.Collections;
12import java.util.EnumSet;
13import java.util.Map;
14import java.util.Set;
15import java.util.TreeMap;
16
17import org.junit.Test;
18
19import com.ibm.icu.dev.test.TestFmwk;
20import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
21import com.ibm.icu.text.IDNA;
22
23/**
24 * UTS #46 (IDNA2008) test.
25 * @author Markus Scherer
26 * @since 2010jul10
27 */
28public class UTS46Test extends TestFmwk {
29    public UTS46Test() {
30        int commonOptions=
31            IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|
32            IDNA.CHECK_CONTEXTJ|IDNA.CHECK_CONTEXTO;
33        trans=IDNA.getUTS46Instance(commonOptions);
34        nontrans=IDNA.getUTS46Instance(commonOptions|
35                                       IDNA.NONTRANSITIONAL_TO_ASCII|IDNA.NONTRANSITIONAL_TO_UNICODE);
36    }
37
38    @Test
39    public void TestAPI() {
40        StringBuilder result=new StringBuilder();
41        IDNA.Info info=new IDNA.Info();
42        String input="www.eXample.cOm";
43        String expected="www.example.com";
44        trans.nameToASCII(input, result, info);
45        if(info.hasErrors() || !UTF16Plus.equal(result, expected)) {
46            errln(String.format("T.nameToASCII(www.example.com) info.errors=%s result matches=%b",
47                                info.getErrors(), UTF16Plus.equal(result, expected)));
48        }
49        input="xn--bcher.de-65a";
50        expected="xn--bcher\uFFFDde-65a";
51        nontrans.labelToASCII(input, result, info);
52        if( !info.getErrors().equals(EnumSet.of(IDNA.Error.LABEL_HAS_DOT, IDNA.Error.INVALID_ACE_LABEL)) ||
53            !UTF16Plus.equal(result, expected)
54        ) {
55            errln(String.format("N.labelToASCII(label-with-dot) failed with errors %s",
56                                info.getErrors()));
57        }
58        // Java API tests that are not parallel to C++ tests
59        // because the C++ specifics (error codes etc.) do not apply here.
60        String resultString=trans.nameToUnicode("fA\u00DF.de", result, info).toString();
61        if(info.hasErrors() || !resultString.equals("fass.de")) {
62            errln(String.format("T.nameToUnicode(fA\u00DF.de) info.errors=%s result matches=%b",
63                                info.getErrors(), resultString.equals("fass.de")));
64        }
65        try {
66            nontrans.labelToUnicode(result, result, info);
67            errln("N.labelToUnicode(result, result) did not throw an Exception");
68        } catch(Exception e) {
69            // as expected (should be an IllegalArgumentException, or an ICU version of it)
70        }
71    }
72
73    @Test
74    public void TestNotSTD3() {
75        IDNA not3=IDNA.getUTS46Instance(IDNA.CHECK_BIDI);
76        String input="\u0000A_2+2=4\n.e\u00DFen.net";
77        StringBuilder result=new StringBuilder();
78        IDNA.Info info=new IDNA.Info();
79        if( !not3.nameToUnicode(input, result, info).toString().equals("\u0000a_2+2=4\n.essen.net") ||
80            info.hasErrors()
81        ) {
82            errln(String.format("notSTD3.nameToUnicode(non-LDH ASCII) unexpected errors %s string %s",
83                                info.getErrors(), prettify(result.toString())));
84        }
85        // A space (BiDi class WS) is not allowed in a BiDi domain name.
86        input="a z.xn--4db.edu";
87        not3.nameToASCII(input, result, info);
88        if(!UTF16Plus.equal(result, input) || !info.getErrors().equals(EnumSet.of(IDNA.Error.BIDI))) {
89            errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed");
90        }
91        // Characters that are canonically equivalent to sequences with non-LDH ASCII.
92        input="a\u2260b\u226Ec\u226Fd";
93        not3.nameToUnicode(input, result, info);
94        if(!UTF16Plus.equal(result, input) || info.hasErrors()) {
95            errln(String.format("notSTD3.nameToUnicode(equiv to non-LDH ASCII) unexpected errors %s string %s",
96                                info.getErrors().toString(), prettify(result.toString())));
97        }
98    }
99
100    private static final Map<String, IDNA.Error> errorNamesToErrors;
101    static {
102        errorNamesToErrors=new TreeMap<String, IDNA.Error>();
103        errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL);
104        errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG);
105        errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);
106        errorNamesToErrors.put("UIDNA_ERROR_LEADING_HYPHEN", IDNA.Error.LEADING_HYPHEN);
107        errorNamesToErrors.put("UIDNA_ERROR_TRAILING_HYPHEN", IDNA.Error.TRAILING_HYPHEN);
108        errorNamesToErrors.put("UIDNA_ERROR_HYPHEN_3_4", IDNA.Error.HYPHEN_3_4);
109        errorNamesToErrors.put("UIDNA_ERROR_LEADING_COMBINING_MARK", IDNA.Error.LEADING_COMBINING_MARK);
110        errorNamesToErrors.put("UIDNA_ERROR_DISALLOWED", IDNA.Error.DISALLOWED);
111        errorNamesToErrors.put("UIDNA_ERROR_PUNYCODE", IDNA.Error.PUNYCODE);
112        errorNamesToErrors.put("UIDNA_ERROR_LABEL_HAS_DOT", IDNA.Error.LABEL_HAS_DOT);
113        errorNamesToErrors.put("UIDNA_ERROR_INVALID_ACE_LABEL", IDNA.Error.INVALID_ACE_LABEL);
114        errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI);
115        errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ);
116        errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_PUNCTUATION", IDNA.Error.CONTEXTO_PUNCTUATION);
117        errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_DIGITS", IDNA.Error.CONTEXTO_DIGITS);
118    }
119
120    private static final class TestCase {
121        private TestCase() {
122            errors=EnumSet.noneOf(IDNA.Error.class);
123        }
124        private void set(String[] data) {
125            s=data[0];
126            o=data[1];
127            u=data[2];
128            errors.clear();
129            if(data[3].length()!=0) {
130                for(String e: data[3].split("\\|")) {
131                    errors.add(errorNamesToErrors.get(e));
132                }
133            }
134        }
135        // Input string and options string (Nontransitional/Transitional/Both).
136        private String s, o;
137        // Expected Unicode result string.
138        private String u;
139        private EnumSet<IDNA.Error> errors;
140    };
141
142    private static final String testCases[][]={
143        { "www.eXample.cOm", "B",  // all ASCII
144          "www.example.com", "" },
145        { "B\u00FCcher.de", "B",  // u-umlaut
146          "b\u00FCcher.de", "" },
147        { "\u00D6BB", "B",  // O-umlaut
148          "\u00F6bb", "" },
149        { "fa\u00DF.de", "N",  // sharp s
150          "fa\u00DF.de", "" },
151        { "fa\u00DF.de", "T",  // sharp s
152          "fass.de", "" },
153        { "XN--fA-hia.dE", "B",  // sharp s in Punycode
154          "fa\u00DF.de", "" },
155        { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "N",  // Greek with final sigma
156          "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "" },
157        { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "T",  // Greek with final sigma
158          "\u03B2\u03CC\u03BB\u03BF\u03C3.com", "" },
159        { "xn--nxasmm1c", "B",  // Greek with final sigma in Punycode
160          "\u03B2\u03CC\u03BB\u03BF\u03C2", "" },
161        { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "N",  // "Sri" in "Sri Lanka" has a ZWJ
162          "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" },
163        { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "T",  // "Sri" in "Sri Lanka" has a ZWJ
164          "www.\u0DC1\u0DCA\u0DBB\u0DD3.com", "" },
165        { "www.xn--10cl1a0b660p.com", "B",  // "Sri" in Punycode
166          "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" },
167        { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "N",  // ZWNJ
168          "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "" },
169        { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "T",  // ZWNJ
170          "\u0646\u0627\u0645\u0647\u0627\u06CC", "" },
171        { "xn--mgba3gch31f060k.com", "B",  // ZWNJ in Punycode
172          "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC.com", "" },
173        { "a.b\uFF0Ec\u3002d\uFF61", "B",
174          "a.b.c.d.", "" },
175        { "U\u0308.xn--tda", "B",  // U+umlaut.u-umlaut
176          "\u00FC.\u00FC", "" },
177        { "xn--u-ccb", "B",  // u+umlaut in Punycode
178          "xn--u-ccb\uFFFD", "UIDNA_ERROR_INVALID_ACE_LABEL" },
179        { "a\u2488com", "B",  // contains 1-dot
180          "a\uFFFDcom", "UIDNA_ERROR_DISALLOWED" },
181        { "xn--a-ecp.ru", "B",  // contains 1-dot in Punycode
182          "xn--a-ecp\uFFFD.ru", "UIDNA_ERROR_INVALID_ACE_LABEL" },
183        { "xn--0.pt", "B",  // invalid Punycode
184          "xn--0\uFFFD.pt", "UIDNA_ERROR_PUNYCODE" },
185        { "xn--a.pt", "B",  // U+0080
186          "xn--a\uFFFD.pt", "UIDNA_ERROR_INVALID_ACE_LABEL" },
187        { "xn--a-\u00C4.pt", "B",  // invalid Punycode
188          "xn--a-\u00E4.pt", "UIDNA_ERROR_PUNYCODE" },
189        { "\u65E5\u672C\u8A9E\u3002\uFF2A\uFF30", "B",  // Japanese with fullwidth ".jp"
190          "\u65E5\u672C\u8A9E.jp", "" },
191        { "\u2615", "B", "\u2615", "" },  // Unicode 4.0 HOT BEVERAGE
192        // some characters are disallowed because they are canonically equivalent
193        // to sequences with non-LDH ASCII
194        { "a\u2260b\u226Ec\u226Fd", "B",
195          "a\uFFFDb\uFFFDc\uFFFDd", "UIDNA_ERROR_DISALLOWED" },
196        // many deviation characters, test the special mapping code
197        { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
198          "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
199          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
200          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
201          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "N",
202          "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
203          "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
204          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
205          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
206          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz",
207          "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_CONTEXTJ" },
208        { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
209          "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
210          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
211          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
212          "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "T",
213          "1.assbcssssssssd"+
214          "\u03C3\u03C3sssssssssssssssse"+
215          "ssssssssssssssssssssx"+
216          "ssssssssssssssssssssy"+
217          "sssssssssssssss\u015Dssz", "UIDNA_ERROR_LABEL_TOO_LONG" },
218        // "xn--bss" with deviation characters
219        { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "N",
220          "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "UIDNA_ERROR_CONTEXTJ" },
221        { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "T",
222          "\u5919", "" },
223        // "xn--bssffl" written as:
224        // 02E3 MODIFIER LETTER SMALL X
225        // 034F COMBINING GRAPHEME JOINER (ignored)
226        // 2115 DOUBLE-STRUCK CAPITAL N
227        // 200B ZERO WIDTH SPACE (ignored)
228        // FE63 SMALL HYPHEN-MINUS
229        // 00AD SOFT HYPHEN (ignored)
230        // FF0D FULLWIDTH HYPHEN-MINUS
231        // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored)
232        // 212C SCRIPT CAPITAL B
233        // FE00 VARIATION SELECTOR-1 (ignored)
234        // 017F LATIN SMALL LETTER LONG S
235        // 2064 INVISIBLE PLUS (ignored)
236        // 1D530 MATHEMATICAL FRAKTUR SMALL S
237        // E01EF VARIATION SELECTOR-256 (ignored)
238        // FB04 LATIN SMALL LIGATURE FFL
239        { "\u02E3\u034F\u2115\u200B\uFE63\u00AD\uFF0D\u180C"+
240          "\u212C\uFE00\u017F\u2064"+"\uD835\uDD30\uDB40\uDDEF"/*1D530 E01EF*/+"\uFB04", "B",
241          "\u5921\u591E\u591C\u5919", "" },
242        { "123456789012345678901234567890123456789012345678901234567890123."+
243          "123456789012345678901234567890123456789012345678901234567890123."+
244          "123456789012345678901234567890123456789012345678901234567890123."+
245          "1234567890123456789012345678901234567890123456789012345678901", "B",
246          "123456789012345678901234567890123456789012345678901234567890123."+
247          "123456789012345678901234567890123456789012345678901234567890123."+
248          "123456789012345678901234567890123456789012345678901234567890123."+
249          "1234567890123456789012345678901234567890123456789012345678901", "" },
250        { "123456789012345678901234567890123456789012345678901234567890123."+
251          "123456789012345678901234567890123456789012345678901234567890123."+
252          "123456789012345678901234567890123456789012345678901234567890123."+
253          "1234567890123456789012345678901234567890123456789012345678901.", "B",
254          "123456789012345678901234567890123456789012345678901234567890123."+
255          "123456789012345678901234567890123456789012345678901234567890123."+
256          "123456789012345678901234567890123456789012345678901234567890123."+
257          "1234567890123456789012345678901234567890123456789012345678901.", "" },
258        // Domain name >256 characters, forces slow path in UTF-8 processing.
259        { "123456789012345678901234567890123456789012345678901234567890123."+
260          "123456789012345678901234567890123456789012345678901234567890123."+
261          "123456789012345678901234567890123456789012345678901234567890123."+
262          "123456789012345678901234567890123456789012345678901234567890123."+
263          "12345678901234567890123456789012345678901234567890123456789012", "B",
264          "123456789012345678901234567890123456789012345678901234567890123."+
265          "123456789012345678901234567890123456789012345678901234567890123."+
266          "123456789012345678901234567890123456789012345678901234567890123."+
267          "123456789012345678901234567890123456789012345678901234567890123."+
268          "12345678901234567890123456789012345678901234567890123456789012",
269          "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
270        { "123456789012345678901234567890123456789012345678901234567890123."+
271          "123456789012345678901234567890123456789012345678901234567890123."+
272          "123456789012345678901234567890123456789012345678901234567890123."+
273          "123456789012345678901234567890123456789012345678901234567890123."+
274          "1234567890123456789012345678901234567890123456789\u05D0", "B",
275          "123456789012345678901234567890123456789012345678901234567890123."+
276          "123456789012345678901234567890123456789012345678901234567890123."+
277          "123456789012345678901234567890123456789012345678901234567890123."+
278          "123456789012345678901234567890123456789012345678901234567890123."+
279          "1234567890123456789012345678901234567890123456789\u05D0",
280          "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG|UIDNA_ERROR_BIDI" },
281        { "123456789012345678901234567890123456789012345678901234567890123."+
282          "1234567890123456789012345678901234567890123456789012345678901234."+
283          "123456789012345678901234567890123456789012345678901234567890123."+
284          "123456789012345678901234567890123456789012345678901234567890", "B",
285          "123456789012345678901234567890123456789012345678901234567890123."+
286          "1234567890123456789012345678901234567890123456789012345678901234."+
287          "123456789012345678901234567890123456789012345678901234567890123."+
288          "123456789012345678901234567890123456789012345678901234567890",
289          "UIDNA_ERROR_LABEL_TOO_LONG" },
290        { "123456789012345678901234567890123456789012345678901234567890123."+
291          "1234567890123456789012345678901234567890123456789012345678901234."+
292          "123456789012345678901234567890123456789012345678901234567890123."+
293          "123456789012345678901234567890123456789012345678901234567890.", "B",
294          "123456789012345678901234567890123456789012345678901234567890123."+
295          "1234567890123456789012345678901234567890123456789012345678901234."+
296          "123456789012345678901234567890123456789012345678901234567890123."+
297          "123456789012345678901234567890123456789012345678901234567890.",
298          "UIDNA_ERROR_LABEL_TOO_LONG" },
299        { "123456789012345678901234567890123456789012345678901234567890123."+
300          "1234567890123456789012345678901234567890123456789012345678901234."+
301          "123456789012345678901234567890123456789012345678901234567890123."+
302          "1234567890123456789012345678901234567890123456789012345678901", "B",
303          "123456789012345678901234567890123456789012345678901234567890123."+
304          "1234567890123456789012345678901234567890123456789012345678901234."+
305          "123456789012345678901234567890123456789012345678901234567890123."+
306          "1234567890123456789012345678901234567890123456789012345678901",
307          "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
308        // label length 63: xn--1234567890123456789012345678901234567890123456789012345-9te
309        { "\u00E41234567890123456789012345678901234567890123456789012345", "B",
310          "\u00E41234567890123456789012345678901234567890123456789012345", "" },
311        { "1234567890\u00E41234567890123456789012345678901234567890123456", "B",
312          "1234567890\u00E41234567890123456789012345678901234567890123456", "UIDNA_ERROR_LABEL_TOO_LONG" },
313        { "123456789012345678901234567890123456789012345678901234567890123."+
314          "1234567890\u00E4123456789012345678901234567890123456789012345."+
315          "123456789012345678901234567890123456789012345678901234567890123."+
316          "1234567890123456789012345678901234567890123456789012345678901", "B",
317          "123456789012345678901234567890123456789012345678901234567890123."+
318          "1234567890\u00E4123456789012345678901234567890123456789012345."+
319          "123456789012345678901234567890123456789012345678901234567890123."+
320          "1234567890123456789012345678901234567890123456789012345678901", "" },
321        { "123456789012345678901234567890123456789012345678901234567890123."+
322          "1234567890\u00E4123456789012345678901234567890123456789012345."+
323          "123456789012345678901234567890123456789012345678901234567890123."+
324          "1234567890123456789012345678901234567890123456789012345678901.", "B",
325          "123456789012345678901234567890123456789012345678901234567890123."+
326          "1234567890\u00E4123456789012345678901234567890123456789012345."+
327          "123456789012345678901234567890123456789012345678901234567890123."+
328          "1234567890123456789012345678901234567890123456789012345678901.", "" },
329        { "123456789012345678901234567890123456789012345678901234567890123."+
330          "1234567890\u00E4123456789012345678901234567890123456789012345."+
331          "123456789012345678901234567890123456789012345678901234567890123."+
332          "12345678901234567890123456789012345678901234567890123456789012", "B",
333          "123456789012345678901234567890123456789012345678901234567890123."+
334          "1234567890\u00E4123456789012345678901234567890123456789012345."+
335          "123456789012345678901234567890123456789012345678901234567890123."+
336          "12345678901234567890123456789012345678901234567890123456789012",
337          "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
338        { "123456789012345678901234567890123456789012345678901234567890123."+
339          "1234567890\u00E41234567890123456789012345678901234567890123456."+
340          "123456789012345678901234567890123456789012345678901234567890123."+
341          "123456789012345678901234567890123456789012345678901234567890", "B",
342          "123456789012345678901234567890123456789012345678901234567890123."+
343          "1234567890\u00E41234567890123456789012345678901234567890123456."+
344          "123456789012345678901234567890123456789012345678901234567890123."+
345          "123456789012345678901234567890123456789012345678901234567890",
346          "UIDNA_ERROR_LABEL_TOO_LONG" },
347        { "123456789012345678901234567890123456789012345678901234567890123."+
348          "1234567890\u00E41234567890123456789012345678901234567890123456."+
349          "123456789012345678901234567890123456789012345678901234567890123."+
350          "123456789012345678901234567890123456789012345678901234567890.", "B",
351          "123456789012345678901234567890123456789012345678901234567890123."+
352          "1234567890\u00E41234567890123456789012345678901234567890123456."+
353          "123456789012345678901234567890123456789012345678901234567890123."+
354          "123456789012345678901234567890123456789012345678901234567890.",
355          "UIDNA_ERROR_LABEL_TOO_LONG" },
356        { "123456789012345678901234567890123456789012345678901234567890123."+
357          "1234567890\u00E41234567890123456789012345678901234567890123456."+
358          "123456789012345678901234567890123456789012345678901234567890123."+
359          "1234567890123456789012345678901234567890123456789012345678901", "B",
360          "123456789012345678901234567890123456789012345678901234567890123."+
361          "1234567890\u00E41234567890123456789012345678901234567890123456."+
362          "123456789012345678901234567890123456789012345678901234567890123."+
363          "1234567890123456789012345678901234567890123456789012345678901",
364          "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
365        // hyphen errors and empty-label errors
366        // Ticket #10883: ToUnicode also checks for empty labels.
367        { ".", "B", ".", "UIDNA_ERROR_EMPTY_LABEL" },
368        { "\uFF0E", "B", ".", "UIDNA_ERROR_EMPTY_LABEL" },
369        // "xn---q----jra"=="-q--a-umlaut-"
370        { "a.b..-q--a-.e", "B", "a.b..-q--a-.e",
371          "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
372          "UIDNA_ERROR_HYPHEN_3_4" },
373        { "a.b..-q--\u00E4-.e", "B", "a.b..-q--\u00E4-.e",
374          "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
375          "UIDNA_ERROR_HYPHEN_3_4" },
376        { "a.b..xn---q----jra.e", "B", "a.b..-q--\u00E4-.e",
377          "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
378          "UIDNA_ERROR_HYPHEN_3_4" },
379        { "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
380        { "a.xn--.c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
381        { "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
382        { "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
383        { "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
384        { "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
385        { "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
386        { "\u00E4.xn--.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
387        { "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
388        { "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
389        { "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
390        { "\u00E4.bc--de.f", "B", "\u00E4.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
391        { "a.b.\u0308c.d", "B", "a.b.\uFFFDc.d", "UIDNA_ERROR_LEADING_COMBINING_MARK" },
392        { "a.b.xn--c-bcb.d", "B",
393          "a.b.xn--c-bcb\uFFFD.d", "UIDNA_ERROR_LEADING_COMBINING_MARK|UIDNA_ERROR_INVALID_ACE_LABEL" },
394        // BiDi
395        { "A0", "B", "a0", "" },
396        { "0A", "B", "0a", "" },  // all-LTR is ok to start with a digit (EN)
397        { "0A.\u05D0", "B",  // ASCII label does not start with L/R/AL
398          "0a.\u05D0", "UIDNA_ERROR_BIDI" },
399        { "c.xn--0-eha.xn--4db", "B",  // 2nd label does not start with L/R/AL
400          "c.0\u00FC.\u05D0", "UIDNA_ERROR_BIDI" },
401        { "b-.\u05D0", "B",  // label does not end with L/EN
402          "b-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" },
403        { "d.xn----dha.xn--4db", "B",  // 2nd label does not end with L/EN
404          "d.\u00FC-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" },
405        { "a\u05D0", "B", "a\u05D0", "UIDNA_ERROR_BIDI" },  // first dir != last dir
406        { "\u05D0\u05C7", "B", "\u05D0\u05C7", "" },
407        { "\u05D09\u05C7", "B", "\u05D09\u05C7", "" },
408        { "\u05D0a\u05C7", "B", "\u05D0a\u05C7", "UIDNA_ERROR_BIDI" },  // first dir != last dir
409        { "\u05D0\u05EA", "B", "\u05D0\u05EA", "" },
410        { "\u05D0\u05F3\u05EA", "B", "\u05D0\u05F3\u05EA", "" },
411        { "a\u05D0Tz", "B", "a\u05D0tz", "UIDNA_ERROR_BIDI" },  // mixed dir
412        { "\u05D0T\u05EA", "B", "\u05D0t\u05EA", "UIDNA_ERROR_BIDI" },  // mixed dir
413        { "\u05D07\u05EA", "B", "\u05D07\u05EA", "" },
414        { "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", "" },  // Arabic 7 in the middle
415        { "a7\u0667z", "B", "a7\u0667z", "UIDNA_ERROR_BIDI" },  // AN digit in LTR
416        { "\u05D07\u0667\u05EA", "B",  // mixed EN/AN digits in RTL
417          "\u05D07\u0667\u05EA", "UIDNA_ERROR_BIDI" },
418        // ZWJ
419        { "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", "" },  // Virama+ZWJ
420        { "\u0BB9\u200D", "N", "\u0BB9\u200D", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
421        { "\u200D", "N", "\u200D", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
422        // ZWNJ
423        { "\u0BB9\u0BCD\u200C", "N", "\u0BB9\u0BCD\u200C", "" },  // Virama+ZWNJ
424        { "\u0BB9\u200C", "N", "\u0BB9\u200C", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
425        { "\u200C", "N", "\u200C", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
426        { "\u0644\u0670\u200C\u06ED\u06EF", "N",  // Joining types D T ZWNJ T R
427          "\u0644\u0670\u200C\u06ED\u06EF", "" },
428        { "\u0644\u0670\u200C\u06EF", "N",  // D T ZWNJ R
429          "\u0644\u0670\u200C\u06EF", "" },
430        { "\u0644\u200C\u06ED\u06EF", "N",  // D ZWNJ T R
431          "\u0644\u200C\u06ED\u06EF", "" },
432        { "\u0644\u200C\u06EF", "N",  // D ZWNJ R
433          "\u0644\u200C\u06EF", "" },
434        { "\u0644\u0670\u200C\u06ED", "N",  // D T ZWNJ T
435          "\u0644\u0670\u200C\u06ED", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" },
436        { "\u06EF\u200C\u06EF", "N",  // R ZWNJ R
437          "\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" },
438        { "\u0644\u200C", "N",  // D ZWNJ
439          "\u0644\u200C", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" },
440        { "\u0660\u0661", "B",  // Arabic-Indic Digits alone
441          "\u0660\u0661", "UIDNA_ERROR_BIDI" },
442        { "\u06F0\u06F1", "B",  // Extended Arabic-Indic Digits alone
443          "\u06F0\u06F1", "" },
444        { "\u0660\u06F1", "B",  // Mixed Arabic-Indic Digits
445          "\u0660\u06F1", "UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI" },
446        // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
447        // in their correct contexts,
448        // then each in incorrect context.
449        { "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "B",
450          "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "UIDNA_ERROR_BIDI" },
451        { "l\u00B7", "B",
452          "l\u00B7", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
453        { "\u00B7l", "B",
454          "\u00B7l", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
455        { "\u0375", "B",
456          "\u0375", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
457        { "\u03B1\u05F3", "B",
458          "\u03B1\u05F3", "UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI" },
459        { "\u05F4", "B",
460          "\u05F4", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
461        { "l\u30FB", "B",
462          "l\u30FB", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
463        // { "", "B",
464        //   "", "" },
465    };
466
467    @Test
468    public void TestSomeCases() {
469        StringBuilder aT=new StringBuilder(), uT=new StringBuilder();
470        StringBuilder aN=new StringBuilder(), uN=new StringBuilder();
471        IDNA.Info aTInfo=new IDNA.Info(), uTInfo=new IDNA.Info();
472        IDNA.Info aNInfo=new IDNA.Info(), uNInfo=new IDNA.Info();
473
474        StringBuilder aTuN=new StringBuilder(), uTaN=new StringBuilder();
475        StringBuilder aNuN=new StringBuilder(), uNaN=new StringBuilder();
476        IDNA.Info aTuNInfo=new IDNA.Info(), uTaNInfo=new IDNA.Info();
477        IDNA.Info aNuNInfo=new IDNA.Info(), uNaNInfo=new IDNA.Info();
478
479        StringBuilder aTL=new StringBuilder(), uTL=new StringBuilder();
480        StringBuilder aNL=new StringBuilder(), uNL=new StringBuilder();
481        IDNA.Info aTLInfo=new IDNA.Info(), uTLInfo=new IDNA.Info();
482        IDNA.Info aNLInfo=new IDNA.Info(), uNLInfo=new IDNA.Info();
483
484        EnumSet<IDNA.Error> uniErrors=EnumSet.noneOf(IDNA.Error.class);
485
486        TestCase testCase=new TestCase();
487        int i;
488        for(i=0; i<testCases.length; ++i) {
489            testCase.set(testCases[i]);
490            String input=testCase.s;
491            String expected=testCase.u;
492            // ToASCII/ToUnicode, transitional/nontransitional
493            try {
494                trans.nameToASCII(input, aT, aTInfo);
495                trans.nameToUnicode(input, uT, uTInfo);
496                nontrans.nameToASCII(input, aN, aNInfo);
497                nontrans.nameToUnicode(input, uN, uNInfo);
498            } catch(Exception e) {
499                errln(String.format("first-level processing [%d/%s] %s - %s",
500                                    i, testCase.o, testCase.s, e));
501                continue;
502            }
503            // ToUnicode does not set length-overflow errors.
504            uniErrors.clear();
505            uniErrors.addAll(testCase.errors);
506            uniErrors.removeAll(lengthOverflowErrors);
507            char mode=testCase.o.charAt(0);
508            if(mode=='B' || mode=='N') {
509                if(!sameErrors(uNInfo, uniErrors)) {
510                    errln(String.format("N.nameToUnicode([%d] %s) unexpected errors %s",
511                                        i, testCase.s, uNInfo.getErrors()));
512                    continue;
513                }
514                if(!UTF16Plus.equal(uN, expected)) {
515                    errln(String.format("N.nameToUnicode([%d] %s) unexpected string %s",
516                                        i, testCase.s, prettify(uN.toString())));
517                    continue;
518                }
519                if(!sameErrors(aNInfo, testCase.errors)) {
520                    errln(String.format("N.nameToASCII([%d] %s) unexpected errors %s",
521                                        i, testCase.s, aNInfo.getErrors()));
522                    continue;
523                }
524            }
525            if(mode=='B' || mode=='T') {
526                if(!sameErrors(uTInfo, uniErrors)) {
527                    errln(String.format("T.nameToUnicode([%d] %s) unexpected errors %s",
528                                        i, testCase.s, uTInfo.getErrors()));
529                    continue;
530                }
531                if(!UTF16Plus.equal(uT, expected)) {
532                    errln(String.format("T.nameToUnicode([%d] %s) unexpected string %s",
533                                        i, testCase.s, prettify(uT.toString())));
534                    continue;
535                }
536                if(!sameErrors(aTInfo, testCase.errors)) {
537                    errln(String.format("T.nameToASCII([%d] %s) unexpected errors %s",
538                                        i, testCase.s, aTInfo.getErrors()));
539                    continue;
540                }
541            }
542            // ToASCII is all-ASCII if no severe errors
543            if(!hasCertainErrors(aNInfo, severeErrors) && !isASCII(aN)) {
544                errln(String.format("N.nameToASCII([%d] %s) (errors %s) result is not ASCII %s",
545                                    i, testCase.s, aNInfo.getErrors(), prettify(aN.toString())));
546                continue;
547            }
548            if(!hasCertainErrors(aTInfo, severeErrors) && !isASCII(aT)) {
549                errln(String.format("T.nameToASCII([%d] %s) (errors %s) result is not ASCII %s",
550                                    i, testCase.s, aTInfo.getErrors(), prettify(aT.toString())));
551                continue;
552            }
553            if(isVerbose()) {
554                char m= mode=='B' ? mode : 'N';
555                logln(String.format("%c.nameToASCII([%d] %s) (errors %s) result string: %s",
556                                    m, i, testCase.s, aNInfo.getErrors(), prettify(aN.toString())));
557                if(mode!='B') {
558                    logln(String.format("T.nameToASCII([%d] %s) (errors %s) result string: %s",
559                                        i, testCase.s, aTInfo.getErrors(), prettify(aT.toString())));
560                }
561            }
562            // second-level processing
563            try {
564                nontrans.nameToUnicode(aT, aTuN, aTuNInfo);
565                nontrans.nameToASCII(uT, uTaN, uTaNInfo);
566                nontrans.nameToUnicode(aN, aNuN, aNuNInfo);
567                nontrans.nameToASCII(uN, uNaN, uNaNInfo);
568            } catch(Exception e) {
569                errln(String.format("second-level processing [%d/%s] %s - %s",
570                                    i, testCase.o, testCase.s, e));
571                continue;
572            }
573            if(!UTF16Plus.equal(aN, uNaN)) {
574                errln(String.format("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() "+
575                                    "(errors %s) %s vs. %s",
576                                    i, testCase.s, aNInfo.getErrors(),
577                                    prettify(aN.toString()), prettify(uNaN.toString())));
578                continue;
579            }
580            if(!UTF16Plus.equal(aT, uTaN)) {
581                errln(String.format("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() "+
582                                    "(errors %s) %s vs. %s",
583                                    i, testCase.s, aNInfo.getErrors(),
584                                    prettify(aT.toString()), prettify(uTaN.toString())));
585                continue;
586            }
587            if(!UTF16Plus.equal(uN, aNuN)) {
588                errln(String.format("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() "+
589                                    "(errors %s) %s vs. %s",
590                                    i, testCase.s, uNInfo.getErrors(), prettify(uN.toString()), prettify(aNuN.toString())));
591                continue;
592            }
593            if(!UTF16Plus.equal(uT, aTuN)) {
594                errln(String.format("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() "+
595                                    "(errors %s) %s vs. %s",
596                                    i, testCase.s, uNInfo.getErrors(),
597                                    prettify(uT.toString()), prettify(aTuN.toString())));
598                continue;
599            }
600            // labelToUnicode
601            try {
602                trans.labelToASCII(input, aTL, aTLInfo);
603                trans.labelToUnicode(input, uTL, uTLInfo);
604                nontrans.labelToASCII(input, aNL, aNLInfo);
605                nontrans.labelToUnicode(input, uNL, uNLInfo);
606            } catch(Exception e) {
607                errln(String.format("labelToXYZ processing [%d/%s] %s - %s",
608                                    i, testCase.o, testCase.s, e));
609                continue;
610            }
611            if(aN.indexOf(".")<0) {
612                if(!UTF16Plus.equal(aN, aNL) || !sameErrors(aNInfo, aNLInfo)) {
613                    errln(String.format("N.nameToASCII([%d] %s)!=N.labelToASCII() "+
614                                        "(errors %s vs %s) %s vs. %s",
615                                        i, testCase.s, aNInfo.getErrors().toString(), aNLInfo.getErrors().toString(),
616                                        prettify(aN.toString()), prettify(aNL.toString())));
617                    continue;
618                }
619            } else {
620                if(!hasError(aNLInfo, IDNA.Error.LABEL_HAS_DOT)) {
621                    errln(String.format("N.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
622                                        i, testCase.s, aNLInfo.getErrors()));
623                    continue;
624                }
625            }
626            if(aT.indexOf(".")<0) {
627                if(!UTF16Plus.equal(aT, aTL) || !sameErrors(aTInfo, aTLInfo)) {
628                    errln(String.format("T.nameToASCII([%d] %s)!=T.labelToASCII() "+
629                                        "(errors %s vs %s) %s vs. %s",
630                                        i, testCase.s, aTInfo.getErrors().toString(), aTLInfo.getErrors().toString(),
631                                        prettify(aT.toString()), prettify(aTL.toString())));
632                    continue;
633                }
634            } else {
635                if(!hasError(aTLInfo, IDNA.Error.LABEL_HAS_DOT)) {
636                    errln(String.format("T.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
637                                        i, testCase.s, aTLInfo.getErrors()));
638                    continue;
639                }
640            }
641            if(uN.indexOf(".")<0) {
642                if(!UTF16Plus.equal(uN, uNL) || !sameErrors(uNInfo, uNLInfo)) {
643                    errln(String.format("N.nameToUnicode([%d] %s)!=N.labelToUnicode() "+
644                                        "(errors %s vs %s) %s vs. %s",
645                                        i, testCase.s, uNInfo.getErrors().toString(), uNLInfo.getErrors().toString(),
646                                        prettify(uN.toString()), prettify(uNL.toString())));
647                    continue;
648                }
649            } else {
650                if(!hasError(uNLInfo, IDNA.Error.LABEL_HAS_DOT)) {
651                    errln(String.format("N.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
652                                        i, testCase.s, uNLInfo.getErrors()));
653                    continue;
654                }
655            }
656            if(uT.indexOf(".")<0) {
657                if(!UTF16Plus.equal(uT, uTL) || !sameErrors(uTInfo, uTLInfo)) {
658                    errln(String.format("T.nameToUnicode([%d] %s)!=T.labelToUnicode() "+
659                                        "(errors %s vs %s) %s vs. %s",
660                                        i, testCase.s, uTInfo.getErrors().toString(), uTLInfo.getErrors().toString(),
661                                        prettify(uT.toString()), prettify(uTL.toString())));
662                    continue;
663                }
664            } else {
665                if(!hasError(uTLInfo, IDNA.Error.LABEL_HAS_DOT)) {
666                    errln(String.format("T.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
667                                        i, testCase.s, uTLInfo.getErrors()));
668                    continue;
669                }
670            }
671            // Differences between transitional and nontransitional processing
672            if(mode=='B') {
673                if( aNInfo.isTransitionalDifferent() ||
674                    aTInfo.isTransitionalDifferent() ||
675                    uNInfo.isTransitionalDifferent() ||
676                    uTInfo.isTransitionalDifferent() ||
677                    aNLInfo.isTransitionalDifferent() ||
678                    aTLInfo.isTransitionalDifferent() ||
679                    uNLInfo.isTransitionalDifferent() ||
680                    uTLInfo.isTransitionalDifferent()
681                ) {
682                    errln(String.format("B.process([%d] %s) isTransitionalDifferent()", i, testCase.s));
683                    continue;
684                }
685                if( !UTF16Plus.equal(aN, aT) || !UTF16Plus.equal(uN, uT) ||
686                    !UTF16Plus.equal(aNL, aTL) || !UTF16Plus.equal(uNL, uTL) ||
687                    !sameErrors(aNInfo, aTInfo) || !sameErrors(uNInfo, uTInfo) ||
688                    !sameErrors(aNLInfo, aTLInfo) || !sameErrors(uNLInfo, uTLInfo)
689                ) {
690                    errln(String.format("N.process([%d] %s) vs. T.process() different errors or result strings",
691                                        i, testCase.s));
692                    continue;
693                }
694            } else {
695                if( !aNInfo.isTransitionalDifferent() ||
696                    !aTInfo.isTransitionalDifferent() ||
697                    !uNInfo.isTransitionalDifferent() ||
698                    !uTInfo.isTransitionalDifferent() ||
699                    !aNLInfo.isTransitionalDifferent() ||
700                    !aTLInfo.isTransitionalDifferent() ||
701                    !uNLInfo.isTransitionalDifferent() ||
702                    !uTLInfo.isTransitionalDifferent()
703                ) {
704                    errln(String.format("%s.process([%d] %s) !isTransitionalDifferent()",
705                                        testCase.o, i, testCase.s));
706                    continue;
707                }
708                if( UTF16Plus.equal(aN, aT) || UTF16Plus.equal(uN, uT) ||
709                    UTF16Plus.equal(aNL, aTL) || UTF16Plus.equal(uNL, uTL)
710                ) {
711                    errln(String.format("N.process([%d] %s) vs. T.process() same result strings",
712                                        i, testCase.s));
713                    continue;
714                }
715            }
716        }
717    }
718
719    private final IDNA trans, nontrans;
720
721    private static final EnumSet<IDNA.Error> severeErrors=EnumSet.of(
722        IDNA.Error.LEADING_COMBINING_MARK,
723        IDNA.Error.DISALLOWED,
724        IDNA.Error.PUNYCODE,
725        IDNA.Error.LABEL_HAS_DOT,
726        IDNA.Error.INVALID_ACE_LABEL);
727    private static final EnumSet<IDNA.Error> lengthOverflowErrors=EnumSet.of(
728            IDNA.Error.LABEL_TOO_LONG,
729            IDNA.Error.DOMAIN_NAME_TOO_LONG);
730
731    private boolean hasError(IDNA.Info info, IDNA.Error error) {
732        return info.getErrors().contains(error);
733    }
734    // assumes that certainErrors is not empty
735    private boolean hasCertainErrors(Set<IDNA.Error> errors, Set<IDNA.Error> certainErrors) {
736        return !errors.isEmpty() && !Collections.disjoint(errors, certainErrors);
737    }
738    private boolean hasCertainErrors(IDNA.Info info, Set<IDNA.Error> certainErrors) {
739        return hasCertainErrors(info.getErrors(), certainErrors);
740    }
741    private boolean sameErrors(Set<IDNA.Error> a, Set<IDNA.Error> b) {
742        return a.equals(b);
743    }
744    private boolean sameErrors(IDNA.Info a, IDNA.Info b) {
745        return sameErrors(a.getErrors(), b.getErrors());
746    }
747    private boolean sameErrors(IDNA.Info a, Set<IDNA.Error> b) {
748        return sameErrors(a.getErrors(), b);
749    }
750
751    private static boolean
752    isASCII(CharSequence str) {
753        int length=str.length();
754        for(int i=0; i<length; ++i) {
755            if(str.charAt(i)>=0x80) {
756                return false;
757            }
758        }
759        return true;
760    }
761}
762