/* ******************************************************************************* * Copyright (C) 2010-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* */ package com.ibm.icu.impl; import java.util.EnumSet; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacterCategory; import com.ibm.icu.lang.UCharacterDirection; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.IDNA; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.StringPrepParseException; import com.ibm.icu.util.ICUException; // Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG: // // The domain name length limit is 255 octets in an internal DNS representation // where the last ("root") label is the empty label // represented by length byte 0 alone. // In a conventional string, this translates to 253 characters, or 254 // if there is a trailing dot for the root label. /** * UTS #46 (IDNA2008) implementation. * @author Markus Scherer * @since 2010jul09 */ public final class UTS46 extends IDNA { public UTS46(int options) { this.options=options; } @Override public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) { return process(label, true, true, dest, info); } @Override public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) { return process(label, true, false, dest, info); } @Override public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) { process(name, false, true, dest, info); if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) && isASCIIString(dest) && (dest.length()>254 || dest.charAt(253)!='.') ) { addError(info, Error.DOMAIN_NAME_TOO_LONG); } return dest; } @Override public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) { return process(name, false, false, dest, info); } private static final Normalizer2 uts46Norm2= Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm final int options; // Severe errors which usually result in a U+FFFD replacement character in the result string. private static final EnumSet severeErrors=EnumSet.of( Error.LEADING_COMBINING_MARK, Error.DISALLOWED, Error.PUNYCODE, Error.LABEL_HAS_DOT, Error.INVALID_ACE_LABEL); private static boolean isASCIIString(CharSequence dest) { int length=dest.length(); for(int i=0; i0x7f) { return false; } } return true; } // UTS #46 data for ASCII characters. // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase // and passes through all other ASCII characters. // If USE_STD3_RULES is set, then non-LDH characters are disallowed // using this data. // The ASCII fastpath also uses this data. // Values: -1=disallowed 0==valid 1==mapped (lowercase) private static final byte asciiData[]={ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 002D..002E; valid # HYPHEN-MINUS..FULL STOP -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, // 0030..0039; valid # DIGIT ZERO..DIGIT NINE 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 }; private StringBuilder process(CharSequence src, boolean isLabel, boolean toASCII, StringBuilder dest, Info info) { // uts46Norm2.normalize() would do all of this error checking and setup, // but with the ASCII fastpath we do not always call it, and do not // call it first. if(dest==src) { throw new IllegalArgumentException(); } // Arguments are fine, reset output values. dest.delete(0, 0x7fffffff); resetInfo(info); int srcLength=src.length(); if(srcLength==0) { addError(info, Error.EMPTY_LABEL); return dest; } // ASCII fastpath boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; int labelStart=0; int i; for(i=0;; ++i) { if(i==srcLength) { if(toASCII) { if((i-labelStart)>63) { addLabelError(info, Error.LABEL_TOO_LONG); } // There is a trailing dot if labelStart==i. if(!isLabel && i>=254 && (i>254 || labelStart0x7f) { break; } int cData=asciiData[c]; if(cData>0) { dest.append((char)(c+0x20)); // Lowercase an uppercase ASCII letter. } else if(cData<0 && disallowNonLDHDot) { break; // Replacing with U+FFFD can be complicated for toASCII. } else { dest.append(c); if(c=='-') { // hyphen if(i==(labelStart+3) && src.charAt(i-1)=='-') { // "??--..." is Punycode or forbidden. ++i; // '-' was copied to dest already break; } if(i==labelStart) { // label starts with "-" addLabelError(info, Error.LEADING_HYPHEN); } if((i+1)==srcLength || src.charAt(i+1)=='.') { // label ends with "-" addLabelError(info, Error.TRAILING_HYPHEN); } } else if(c=='.') { // dot if(isLabel) { // Replacing with U+FFFD can be complicated for toASCII. ++i; // '.' was copied to dest already break; } if(i==labelStart) { addLabelError(info, Error.EMPTY_LABEL); } if(toASCII && (i-labelStart)>63) { addLabelError(info, Error.LABEL_TOO_LONG); } promoteAndResetLabelErrors(info); labelStart=i+1; } } } promoteAndResetLabelErrors(info); processUnicode(src, labelStart, i, isLabel, toASCII, dest, info); if( isBiDi(info) && !hasCertainErrors(info, severeErrors) && (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart))) ) { addError(info, Error.BIDI); } return dest; } private StringBuilder processUnicode(CharSequence src, int labelStart, int mappingStart, boolean isLabel, boolean toASCII, StringBuilder dest, Info info) { if(mappingStart==0) { uts46Norm2.normalize(src, dest); } else { uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length())); } boolean doMapDevChars= toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 : (options&NONTRANSITIONAL_TO_UNICODE)==0; int destLength=dest.length(); int labelLimit=labelStart; while(labelLimit=0x200c)) { setTransitionalDifferent(info); if(doMapDevChars) { destLength=mapDevChars(dest, labelStart, labelLimit); // Do not increment labelLimit in case c was removed. // All deviation characters have been mapped, no need to check for them again. doMapDevChars=false; } else { ++labelLimit; } } else { ++labelLimit; } } // Permit an empty label at the end (0=4 && dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' && dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-' ) { // Label starts with "xn--", try to un-Punycode it. wasPunycode=true; try { fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null); } catch (StringPrepParseException e) { addLabelError(info, Error.PUNYCODE); return markBadACELabel(dest, labelStart, labelLength, toASCII, info); } // Check for NFC, and for characters that are not // valid or deviation characters according to the normalizer. // If there is something wrong, then the string will change. // Note that the normalizer passes through non-LDH ASCII and deviation characters. // Deviation characters are ok in Punycode even in transitional processing. // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. boolean isValid=uts46Norm2.isNormalized(fromPunycode); if(!isValid) { addLabelError(info, Error.INVALID_ACE_LABEL); return markBadACELabel(dest, labelStart, labelLength, toASCII, info); } labelString=fromPunycode; labelStart=0; labelLength=fromPunycode.length(); } else { wasPunycode=false; labelString=dest; } // Validity check if(labelLength==0) { addLabelError(info, Error.EMPTY_LABEL); return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); } // labelLength>0 if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') { // label starts with "??--" addLabelError(info, Error.HYPHEN_3_4); } if(labelString.charAt(labelStart)=='-') { // label starts with "-" addLabelError(info, Error.LEADING_HYPHEN); } if(labelString.charAt(labelStart+labelLength-1)=='-') { // label ends with "-" addLabelError(info, Error.TRAILING_HYPHEN); } // If the label was not a Punycode label, then it was the result of // mapping, normalization and label segmentation. // If the label was in Punycode, then we mapped it again above // and checked its validity. // Now we handle the STD3 restriction to LDH characters (if set) // and we look for U+FFFD which indicates disallowed characters // in a non-Punycode label or U+FFFD itself in a Punycode label. // We also check for dots which can come from the input to a single-label function. // Ok to cast away const because we own the UnicodeString. int i=labelStart; int limit=labelStart+labelLength; char oredChars=0; // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; do { char c=labelString.charAt(i); if(c<=0x7f) { if(c=='.') { addLabelError(info, Error.LABEL_HAS_DOT); labelString.setCharAt(i, '\ufffd'); } else if(disallowNonLDHDot && asciiData[c]<0) { addLabelError(info, Error.DISALLOWED); labelString.setCharAt(i, '\ufffd'); } } else { oredChars|=c; if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { addLabelError(info, Error.DISALLOWED); labelString.setCharAt(i, '\ufffd'); } else if(c==0xfffd) { addLabelError(info, Error.DISALLOWED); } } ++i; } while(i0xffff) { // Remove c's trail surrogate. labelString.deleteCharAt(labelStart+1); --labelLength; if(labelString==dest) { --destLabelLength; } } } if(!hasCertainLabelErrors(info, severeErrors)) { // Do contextual checks only if we do not have U+FFFD from a severe error // because U+FFFD can make these checks fail. if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) { checkLabelBiDi(labelString, labelStart, labelLength, info); } if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && !isLabelOkContextJ(labelString, labelStart, labelLength) ) { addLabelError(info, Error.CONTEXTJ); } if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { checkLabelContextO(labelString, labelStart, labelLength, info); } if(toASCII) { if(wasPunycode) { // Leave a Punycode label unchanged if it has no severe errors. if(destLabelLength>63) { addLabelError(info, Error.LABEL_TOO_LONG); } return destLabelLength; } else if(oredChars>=0x80) { // Contains non-ASCII characters. StringBuilder punycode; try { punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null); } catch (StringPrepParseException e) { throw new ICUException(e); // unexpected } punycode.insert(0, "xn--"); if(punycode.length()>63) { addLabelError(info, Error.LABEL_TOO_LONG); } return replaceLabel(dest, destLabelStart, destLabelLength, punycode, punycode.length()); } else { // all-ASCII label if(labelLength>63) { addLabelError(info, Error.LABEL_TOO_LONG); } } } } else { // If a Punycode label has severe errors, // then leave it but make sure it does not look valid. if(wasPunycode) { addLabelError(info, Error.INVALID_ACE_LABEL); return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info); } } return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); } private int markBadACELabel(StringBuilder dest, int labelStart, int labelLength, boolean toASCII, Info info) { boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; boolean isASCII=true; boolean onlyLDH=true; int i=labelStart+4; // After the initial "xn--". int limit=labelStart+labelLength; do { char c=dest.charAt(i); if(c<=0x7f) { if(c=='.') { addLabelError(info, Error.LABEL_HAS_DOT); dest.setCharAt(i, '\ufffd'); isASCII=onlyLDH=false; } else if(asciiData[c]<0) { onlyLDH=false; if(disallowNonLDHDot) { dest.setCharAt(i, '\ufffd'); isASCII=false; } } } else { isASCII=onlyLDH=false; } } while(++i63) { addLabelError(info, Error.LABEL_TOO_LONG); } } return labelLength; } private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT); private static final int R_AL_MASK= U_MASK(UCharacterDirection.RIGHT_TO_LEFT)| U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC); private static final int L_R_AL_MASK=L_MASK|R_AL_MASK; private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER); private static final int EN_AN_MASK= U_MASK(UCharacterDirection.EUROPEAN_NUMBER)| U_MASK(UCharacterDirection.ARABIC_NUMBER); private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER); private static final int ES_CS_ET_ON_BN_NSM_MASK= U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)| U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)| U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)| U_MASK(UCharacterDirection.OTHER_NEUTRAL)| U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)| U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK); private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; // We scan the whole label and check both for whether it contains RTL characters // and whether it passes the BiDi Rule. // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find // that a domain name is a BiDi domain name (has an RTL label) only after // processing several earlier labels. private void checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) { // IDNA2008 BiDi rule // Get the directionality of the first character. int c; int i=labelStart; c=Character.codePointAt(label, i); i+=Character.charCount(c); int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c)); // 1. The first character must be a character with BIDI property L, R // or AL. If it has the R or AL property, it is an RTL label; if it // has the L property, it is an LTR label. if((firstMask&~L_R_AL_MASK)!=0) { setNotOkBiDi(info); } // Get the directionality of the last non-NSM character. int lastMask; int labelLimit=labelStart+labelLength; for(;;) { if(i>=labelLimit) { lastMask=firstMask; break; } c=Character.codePointBefore(label, labelLimit); labelLimit-=Character.charCount(c); int dir=UBiDiProps.INSTANCE.getClass(c); if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) { lastMask=U_MASK(dir); break; } } // 3. In an RTL label, the end of the label must be a character with // BIDI property R, AL, EN or AN, followed by zero or more // characters with BIDI property NSM. // 6. In an LTR label, the end of the label must be a character with // BIDI property L or EN, followed by zero or more characters with // BIDI property NSM. if( (firstMask&L_MASK)!=0 ? (lastMask&~L_EN_MASK)!=0 : (lastMask&~R_AL_EN_AN_MASK)!=0 ) { setNotOkBiDi(info); } // Get the directionalities of the intervening characters. int mask=0; while(ilabelStart) { c=s.charAt(i-1); if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) { // Last character in the label is not an L or EN. return false; } } labelStart=i+1; } else if(i==labelStart) { if(!('a'<=c && c<='z')) { // First character in the label is not an L. return false; } } else { if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { // Intermediate character in the label is a B, S or WS. return false; } } } return true; } private boolean isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) { // [IDNA2008-Tables] // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER int labelLimit=labelStart+labelLength; for(int i=labelStart; i0) { addLabelError(info, Error.CONTEXTO_DIGITS); } arabicDigits=-1; } else if(0x6f0<=c) { if(arabicDigits<0) { addLabelError(info, Error.CONTEXTO_DIGITS); } arabicDigits=1; } } } else if(c==0x30fb) { // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) // Rule Set: // False; // For All Characters: // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; // End For; for(int j=labelStart;; j+=Character.charCount(c)) { if(j>labelEnd) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); break; } c=Character.codePointAt(label, j); int script=UScript.getScript(c); if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) { break; } } } } } // TODO: make public(?) -- in C, these are public in uchar.h private static int U_MASK(int x) { return 1<