17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2010-2014, International Business Machines 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl; 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.EnumSet; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacter; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacterCategory; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacterDirection; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UScript; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.IDNA; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.Normalizer2; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.StringPrepParseException; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ICUException; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG: 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// The domain name length limit is 255 octets in an internal DNS representation 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// where the last ("root") label is the empty label 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// represented by length byte 0 alone. 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// In a conventional string, this translates to 253 characters, or 254 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// if there is a trailing dot for the root label. 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * UTS #46 (IDNA2008) implementation. 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @author Markus Scherer 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @since 2010jul09 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class UTS46 extends IDNA { 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public UTS46(int options) { 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.options=options; 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) { 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return process(label, true, true, dest, info); 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) { 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return process(label, true, false, dest, info); 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) { 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert process(name, false, true, dest, info); 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) && 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isASCIIString(dest) && 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (dest.length()>254 || dest.charAt(253)!='.') 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addError(info, Error.DOMAIN_NAME_TOO_LONG); 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest; 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) { 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return process(name, false, false, dest, info); 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final Normalizer2 uts46Norm2= 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert final int options; 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Severe errors which usually result in a U+FFFD replacement character in the result string. 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final EnumSet<Error> severeErrors=EnumSet.of( 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Error.LEADING_COMBINING_MARK, 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Error.DISALLOWED, 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Error.PUNYCODE, 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Error.LABEL_HAS_DOT, 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Error.INVALID_ACE_LABEL); 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isASCIIString(CharSequence dest) { 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length=dest.length(); 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=0; i<length; ++i) { 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(dest.charAt(i)>0x7f) { 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UTS #46 data for ASCII characters. 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and passes through all other ASCII characters. 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If USE_STD3_RULES is set, then non-LDH characters are disallowed 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // using this data. 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The ASCII fastpath also uses this data. 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Values: -1=disallowed 0==valid 1==mapped (lowercase) 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final byte asciiData[]={ 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 002D..002E; valid # HYPHEN-MINUS..FULL STOP 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 0030..0039; valid # DIGIT ZERO..DIGIT NINE 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private StringBuilder 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert process(CharSequence src, 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isLabel, boolean toASCII, 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder dest, 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Info info) { 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // uts46Norm2.normalize() would do all of this error checking and setup, 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but with the ASCII fastpath we do not always call it, and do not 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // call it first. 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(dest==src) { 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException(); 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Arguments are fine, reset output values. 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.delete(0, 0x7fffffff); 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert resetInfo(info); 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int srcLength=src.length(); 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(srcLength==0) { 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addError(info, Error.EMPTY_LABEL); 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest; 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ASCII fastpath 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelStart=0; 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i; 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(i=0;; ++i) { 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i==srcLength) { 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(toASCII) { 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((i-labelStart)>63) { 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_TOO_LONG); 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // There is a trailing dot if labelStart==i. 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!isLabel && i>=254 && (i>254 || labelStart<i)) { 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addError(info, Error.DOMAIN_NAME_TOO_LONG); 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert promoteAndResetLabelErrors(info); 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest; 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c=src.charAt(i); 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c>0x7f) { 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cData=asciiData[c]; 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cData>0) { 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.append((char)(c+0x20)); // Lowercase an uppercase ASCII letter. 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(cData<0 && disallowNonLDHDot) { 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; // Replacing with U+FFFD can be complicated for toASCII. 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.append(c); 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c=='-') { // hyphen 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i==(labelStart+3) && src.charAt(i-1)=='-') { 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // "??--..." is Punycode or forbidden. 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++i; // '-' was copied to dest already 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i==labelStart) { 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // label starts with "-" 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LEADING_HYPHEN); 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((i+1)==srcLength || src.charAt(i+1)=='.') { 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // label ends with "-" 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.TRAILING_HYPHEN); 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c=='.') { // dot 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isLabel) { 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Replacing with U+FFFD can be complicated for toASCII. 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++i; // '.' was copied to dest already 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i==labelStart) { 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.EMPTY_LABEL); 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(toASCII && (i-labelStart)>63) { 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_TOO_LONG); 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert promoteAndResetLabelErrors(info); 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelStart=i+1; 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert promoteAndResetLabelErrors(info); 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert processUnicode(src, labelStart, i, isLabel, toASCII, dest, info); 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( isBiDi(info) && !hasCertainErrors(info, severeErrors) && 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart))) 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addError(info, Error.BIDI); 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest; 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private StringBuilder 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert processUnicode(CharSequence src, 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelStart, int mappingStart, 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isLabel, boolean toASCII, 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder dest, 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Info info) { 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(mappingStart==0) { 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert uts46Norm2.normalize(src, dest); 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length())); 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean doMapDevChars= 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 : 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (options&NONTRANSITIONAL_TO_UNICODE)==0; 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int destLength=dest.length(); 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelLimit=labelStart; 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(labelLimit<destLength) { 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c=dest.charAt(labelLimit); 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c=='.' && !isLabel) { 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelLength=labelLimit-labelStart; 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newLength=processLabel(dest, labelStart, labelLength, 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert toASCII, info); 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert promoteAndResetLabelErrors(info); 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destLength+=newLength-labelLength; 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelLimit=labelStart+=newLength+1; 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) { 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setTransitionalDifferent(info); 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(doMapDevChars) { 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destLength=mapDevChars(dest, labelStart, labelLimit); 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not increment labelLimit in case c was removed. 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All deviation characters have been mapped, no need to check for them again. 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert doMapDevChars=false; 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++labelLimit; 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++labelLimit; 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok) 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but not an empty label elsewhere nor a completely empty domain name. 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0. 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(0==labelStart || labelStart<labelLimit) { 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info); 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert promoteAndResetLabelErrors(info); 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest; 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // returns the new dest.length() 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mapDevChars(StringBuilder dest, int labelStart, int mappingStart) { 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length=dest.length(); 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean didMapDevChars=false; 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=mappingStart; i<length;) { 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c=dest.charAt(i); 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch(c) { 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 0xdf: 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Map sharp s to ss. 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert didMapDevChars=true; 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.setCharAt(i++, 's'); 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.insert(i++, 's'); 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++length; 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 0x3c2: // Map final sigma to nonfinal sigma. 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert didMapDevChars=true; 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.setCharAt(i++, '\u03c3'); 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 0x200c: // Ignore/remove ZWNJ. 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case 0x200d: // Ignore/remove ZWJ. 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert didMapDevChars=true; 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.delete(i, i+1); 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --length; 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++i; 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(didMapDevChars) { 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Mapping deviation characters might have resulted in an un-NFC string. 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We could use either the NFC or the UTS #46 normalizer. 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file. 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String normalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length())); 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.replace(labelStart, 0x7fffffff, normalized); 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest.length(); 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return length; 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Some non-ASCII characters are equivalent to sequences with 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // non-LDH ASCII characters. To find them: 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt) 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isNonASCIIDisallowedSTD3Valid(int c) { 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c==0x2260 || c==0x226E || c==0x226F; 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Replace the label in dest with the label string, if the label was modified. 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If label==dest then the label was modified in-place and labelLength 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // is the new label length, different from label.length(). 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If label!=dest then labelLength==label.length(). 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Returns labelLength (= the new label length). 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength, 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharSequence label, int labelLength) { 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(label!=dest) { 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.delete(destLabelStart, destLabelStart+destLabelLength).insert(destLabelStart, label); 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or dest.replace(destLabelStart, destLabelStart+destLabelLength, label.toString()); 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // which would create a String rather than moving characters in the StringBuilder. 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return labelLength; 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // returns the new label length 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert processLabel(StringBuilder dest, 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelStart, int labelLength, 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean toASCII, 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Info info) { 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder fromPunycode; 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder labelString; 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int destLabelStart=labelStart; 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int destLabelLength=labelLength; 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean wasPunycode; 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( labelLength>=4 && 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' && 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-' 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Label starts with "xn--", try to un-Punycode it. 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wasPunycode=true; 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null); 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch (StringPrepParseException e) { 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.PUNYCODE); 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return markBadACELabel(dest, labelStart, labelLength, toASCII, info); 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check for NFC, and for characters that are not 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // valid or deviation characters according to the normalizer. 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If there is something wrong, then the string will change. 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note that the normalizer passes through non-LDH ASCII and deviation characters. 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Deviation characters are ok in Punycode even in transitional processing. 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isValid=uts46Norm2.isNormalized(fromPunycode); 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!isValid) { 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.INVALID_ACE_LABEL); 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return markBadACELabel(dest, labelStart, labelLength, toASCII, info); 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelString=fromPunycode; 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelStart=0; 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelLength=fromPunycode.length(); 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wasPunycode=false; 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelString=dest; 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Validity check 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(labelLength==0) { 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.EMPTY_LABEL); 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // labelLength>0 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') { 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // label starts with "??--" 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.HYPHEN_3_4); 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(labelString.charAt(labelStart)=='-') { 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // label starts with "-" 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LEADING_HYPHEN); 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(labelString.charAt(labelStart+labelLength-1)=='-') { 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // label ends with "-" 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.TRAILING_HYPHEN); 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If the label was not a Punycode label, then it was the result of 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // mapping, normalization and label segmentation. 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If the label was in Punycode, then we mapped it again above 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and checked its validity. 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Now we handle the STD3 restriction to LDH characters (if set) 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and we look for U+FFFD which indicates disallowed characters 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // in a non-Punycode label or U+FFFD itself in a Punycode label. 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We also check for dots which can come from the input to a single-label function. 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Ok to cast away const because we own the UnicodeString. 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i=labelStart; 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit=labelStart+labelLength; 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char oredChars=0; 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c=labelString.charAt(i); 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<=0x7f) { 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c=='.') { 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_HAS_DOT); 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelString.setCharAt(i, '\ufffd'); 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(disallowNonLDHDot && asciiData[c]<0) { 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.DISALLOWED); 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelString.setCharAt(i, '\ufffd'); 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert oredChars|=c; 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.DISALLOWED); 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelString.setCharAt(i, '\ufffd'); 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c==0xfffd) { 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.DISALLOWED); 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++i; 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(i<limit); 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check for a leading combining mark after other validity checks 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here. 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c; 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD. 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=labelString.codePointAt(labelStart); 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) { 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LEADING_COMBINING_MARK); 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelString.setCharAt(labelStart, '\ufffd'); 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c>0xffff) { 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Remove c's trail surrogate. 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelString.deleteCharAt(labelStart+1); 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --labelLength; 4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(labelString==dest) { 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --destLabelLength; 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!hasCertainLabelErrors(info, severeErrors)) { 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do contextual checks only if we do not have U+FFFD from a severe error 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // because U+FFFD can make these checks fail. 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) { 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkLabelBiDi(labelString, labelStart, labelLength, info); 4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert !isLabelOkContextJ(labelString, labelStart, labelLength) 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.CONTEXTJ); 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkLabelContextO(labelString, labelStart, labelLength, info); 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(toASCII) { 4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(wasPunycode) { 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Leave a Punycode label unchanged if it has no severe errors. 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(destLabelLength>63) { 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_TOO_LONG); 4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return destLabelLength; 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(oredChars>=0x80) { 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Contains non-ASCII characters. 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder punycode; 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null); 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch (StringPrepParseException e) { 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUException(e); // unexpected 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert punycode.insert(0, "xn--"); 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(punycode.length()>63) { 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_TOO_LONG); 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return replaceLabel(dest, destLabelStart, destLabelLength, 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert punycode, punycode.length()); 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // all-ASCII label 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(labelLength>63) { 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_TOO_LONG); 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If a Punycode label has severe errors, 4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then leave it but make sure it does not look valid. 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(wasPunycode) { 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.INVALID_ACE_LABEL); 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info); 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert markBadACELabel(StringBuilder dest, 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelStart, int labelLength, 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean toASCII, Info info) { 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isASCII=true; 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean onlyLDH=true; 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i=labelStart+4; // After the initial "xn--". 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit=labelStart+labelLength; 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c=dest.charAt(i); 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<=0x7f) { 4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c=='.') { 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_HAS_DOT); 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.setCharAt(i, '\ufffd'); 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isASCII=onlyLDH=false; 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(asciiData[c]<0) { 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert onlyLDH=false; 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(disallowNonLDHDot) { 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.setCharAt(i, '\ufffd'); 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isASCII=false; 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isASCII=onlyLDH=false; 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(++i<limit); 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(onlyLDH) { 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.insert(labelStart+labelLength, '\ufffd'); 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++labelLength; 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(toASCII && isASCII && labelLength>63) { 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.LABEL_TOO_LONG); 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return labelLength; 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT); 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int R_AL_MASK= 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.RIGHT_TO_LEFT)| 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC); 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int L_R_AL_MASK=L_MASK|R_AL_MASK; 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER); 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int EN_AN_MASK= 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.EUROPEAN_NUMBER)| 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.ARABIC_NUMBER); 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER); 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int ES_CS_ET_ON_BN_NSM_MASK= 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)| 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)| 5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)| 5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.OTHER_NEUTRAL)| 5347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)| 5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK); 5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 5387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We scan the whole label and check both for whether it contains RTL characters 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and whether it passes the BiDi Rule. 5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find 5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // that a domain name is a BiDi domain name (has an RTL label) only after 5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // processing several earlier labels. 5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void 5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) { 5467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // IDNA2008 BiDi rule 5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Get the directionality of the first character. 5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c; 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i=labelStart; 5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointAt(label, i); 5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert i+=Character.charCount(c); 5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c)); 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 1. The first character must be a character with BIDI property L, R 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or AL. If it has the R or AL property, it is an RTL label; if it 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // has the L property, it is an LTR label. 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstMask&~L_R_AL_MASK)!=0) { 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setNotOkBiDi(info); 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Get the directionality of the last non-NSM character. 5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastMask; 5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelLimit=labelStart+labelLength; 5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 5637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i>=labelLimit) { 5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastMask=firstMask; 5657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointBefore(label, labelLimit); 5687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelLimit-=Character.charCount(c); 5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int dir=UBiDiProps.INSTANCE.getClass(c); 5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) { 5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastMask=U_MASK(dir); 5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3. In an RTL label, the end of the label must be a character with 5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // BIDI property R, AL, EN or AN, followed by zero or more 5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // characters with BIDI property NSM. 5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 6. In an LTR label, the end of the label must be a character with 5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // BIDI property L or EN, followed by zero or more characters with 5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // BIDI property NSM. 5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( (firstMask&L_MASK)!=0 ? 5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (lastMask&~L_EN_MASK)!=0 : 5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (lastMask&~R_AL_EN_AN_MASK)!=0 5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setNotOkBiDi(info); 5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Get the directionalities of the intervening characters. 5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int mask=0; 5897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(i<labelLimit) { 5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointAt(label, i); 5917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert i+=Character.charCount(c); 5927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mask|=U_MASK(UBiDiProps.INSTANCE.getClass(c)); 5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstMask&L_MASK)!=0) { 5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 5. In an LTR label, only characters with the BIDI properties L, EN, 5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ES, CS, ET, ON, BN and NSM are allowed. 5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 5987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setNotOkBiDi(info); 5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 2. In an RTL label, only characters with the BIDI properties R, AL, 6027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. 6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setNotOkBiDi(info); 6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 4. In an RTL label, if an EN is present, no AN may be present, and 6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // vice versa. 6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((mask&EN_AN_MASK)==EN_AN_MASK) { 6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setNotOkBiDi(info); 6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // An RTL label is a label that contains at least one character of type 6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // R, AL or AN. [...] 6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // A "BIDI domain name" is a domain name that contains at least one RTL 6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // label. [...] 6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The following rule, consisting of six conditions, applies to labels 6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // in BIDI domain names. 6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) { 6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setBiDi(info); 6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Special code for the ASCII prefix of a BiDi domain name. 6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The ASCII prefix is all-LTR. 6257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // IDNA2008 BiDi rule, parts relevant to ASCII labels: 6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 1. The first character must be a character with BIDI property L [...] 6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 5. In an LTR label, only characters with the BIDI properties L, EN, 6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ES, CS, ET, ON, BN and NSM are allowed. 6307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 6. In an LTR label, the end of the label must be a character with 6317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // BIDI property L or EN [...] 6327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UTF-16 version, called for mapped ASCII prefix. 6347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Cannot contain uppercase A-Z. 6357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // s[length-1] must be the trailing dot. 6367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean 6377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isASCIIOkBiDi(CharSequence s, int length) { 6387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelStart=0; 6397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=0; i<length; ++i) { 6407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c=s.charAt(i); 6417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c=='.') { // dot 6427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i>labelStart) { 6437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=s.charAt(i-1); 6447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) { 6457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Last character in the label is not an L or EN. 6467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 6477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert labelStart=i+1; 6507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(i==labelStart) { 6517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!('a'<=c && c<='z')) { 6527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // First character in the label is not an L. 6537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 6547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 6577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Intermediate character in the label is a B, S or WS. 6587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean 6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) { 6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // [IDNA2008-Tables] 6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelLimit=labelStart+labelLength; 6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=labelStart; i<labelLimit; ++i) { 6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(label.charAt(i)==0x200c) { 6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.1. ZERO WIDTH NON-JOINER 6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // False; 6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i==labelStart) { 6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c; 6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int j=i; 6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointBefore(label, j); 6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert j-=Character.charCount(c); 6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(uts46Norm2.getCombiningClass(c)==9) { 6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // check precontext (Joining_Type:{L,D})(Joining_Type:T)* 6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c); 6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(type==UCharacter.JoiningType.TRANSPARENT) { 6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(j==0) { 6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointBefore(label, j); 6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert j-=Character.charCount(c); 6977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) { 6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; // precontext fulfilled 6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) 7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(j=i+1;;) { 7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(j==labelLimit) { 7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointAt(label, j); 7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert j+=Character.charCount(c); 7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c); 7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(type==UCharacter.JoiningType.TRANSPARENT) { 7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // just skip this character 7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) { 7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; // postcontext fulfilled 7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(label.charAt(i)==0x200d) { 7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.2. ZERO WIDTH JOINER (U+200D) 7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // False; 7237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i==labelStart) { 7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointBefore(label, i); 7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(uts46Norm2.getCombiningClass(c)!=9) { 7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void 7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) { 7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int labelEnd=labelStart+labelLength-1; // inclusive 7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int arabicDigits=0; // -1 for 066x, +1 for 06Fx 7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=labelStart; i<=labelEnd; ++i) { 7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=label.charAt(i); 7427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<0xb7) { 7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ASCII fastpath 7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c<=0x6f9) { 7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c==0xb7) { 7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.3. MIDDLE DOT (U+00B7) 7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // False; 7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If Before(cp) .eq. U+006C And 7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // After(cp) .eq. U+006C Then True; 7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!(labelStart<i && label.charAt(i-1)=='l' && 7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert i<labelEnd && label.charAt(i+1)=='l')) { 7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.CONTEXTO_PUNCTUATION); 7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c==0x375) { 7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) 7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // False; 7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If Script(After(cp)) .eq. Greek Then True; 7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!(i<labelEnd && 7617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) { 7627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.CONTEXTO_PUNCTUATION); 7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c==0x5f3 || c==0x5f4) { 7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) 7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 7677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // False; 7687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If Script(Before(cp)) .eq. Hebrew Then True; 7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) 7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // False; 7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If Script(Before(cp)) .eq. Hebrew Then True; 7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!(labelStart<i && 7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) { 7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.CONTEXTO_PUNCTUATION); 7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(0x660<=c /* && c<=0x6f9 */) { 7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) 7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // True; 7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For All Characters: 7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If cp .in. 06F0..06F9 Then False; 7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // End For; 7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) 7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // True; 7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For All Characters: 7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If cp .in. 0660..0669 Then False; 7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // End For; 7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<=0x669) { 7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(arabicDigits>0) { 7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.CONTEXTO_DIGITS); 7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert arabicDigits=-1; 7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(0x6f0<=c) { 7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(arabicDigits<0) { 7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.CONTEXTO_DIGITS); 8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert arabicDigits=1; 8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c==0x30fb) { 8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) 8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rule Set: 8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // False; 8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For All Characters: 8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; 8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // End For; 8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int j=labelStart;; j+=Character.charCount(c)) { 8127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(j>labelEnd) { 8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addLabelError(info, Error.CONTEXTO_PUNCTUATION); 8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointAt(label, j); 8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int script=UScript.getScript(c); 8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) { 8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: make public(?) -- in C, these are public in uchar.h 8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int U_MASK(int x) { 8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 1<<x; 8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int U_GET_GC_MASK(int c) { 8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (1<<UCharacter.getType(c)); 8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int U_GC_M_MASK= 8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterCategory.NON_SPACING_MARK)| 8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterCategory.ENCLOSING_MARK)| 8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert U_MASK(UCharacterCategory.COMBINING_SPACING_MARK); 8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 838