// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2003-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import java.io.IOException;
import java.io.InputStream;
import java.lang.ref.WeakReference;
import java.nio.ByteBuffer;

import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.StringPrepDataReader;
import com.ibm.icu.impl.UBiDiProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.VersionInfo;

/**
 * StringPrep API implements the StringPrep framework as described by
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
 * StringPrep prepares Unicode strings for use in network protocols.
 * Profiles of StringPrep are sets of rules and data according to which the
 * Unicode Strings are prepared. Each profile contains tables which describe
 * how a code point should be treated. The tables are broadly classified into
 * <ul>
 *     <li> Unassigned Table: Contains code points that are unassigned
 *          in the Unicode Version supported by StringPrep. Currently
 *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
 * </ul>
 *
 * The procedure for preparing Unicode strings:
 * <ol>
 *      <li> Map: For each character in the input, check if it has a mapping
 *           and, if so, replace it with its mapping. </li>
 *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
 *           normalization. </li>
 *      <li> Prohibit: Check for any characters that are not allowed in the
 *           output.  If any are found, return an error.</li>
 *      <li> Check bidi: Possibly check for right-to-left characters, and if
 *           any are found, make sure that the whole string satisfies the
 *           requirements for bidirectional strings.  If the string does not
 *           satisfy the requirements for bidirectional strings, return an
 *           error.  </li>
 * </ol>
 * @author Ram Viswanadha
 * @stable ICU 2.8
 */
public final class StringPrep {
    /**
     * Option to prohibit processing of unassigned code points in the input
     *
     * @see   #prepare
     * @stable ICU 2.8
     */
    public static final int DEFAULT = 0x0000;

    /**
     * Option to allow processing of unassigned code points in the input
     *
     * @see   #prepare
     * @stable ICU 2.8
     */
    public static final int ALLOW_UNASSIGNED = 0x0001;

    /**
     * Profile type: RFC3491 Nameprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3491_NAMEPREP = 0;

    /**
     * Profile type: RFC3530 nfs4_cs_prep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_CS_PREP = 1;

    /**
     * Profile type: RFC3530 nfs4_cs_prep with case insensitive option
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_CS_PREP_CI = 2;

    /**
     * Profile type: RFC3530 nfs4_cis_prep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_CIS_PREP = 3;

    /**
     * Profile type: RFC3530 nfs4_mixed_prep for prefix
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4;

    /**
     * Profile type: RFC3530 nfs4_mixed_prep for suffix
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5;

    /**
     * Profile type: RFC3722 iSCSI
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3722_ISCSI = 6;

    /**
     * Profile type: RFC3920 XMPP Nodeprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3920_NODEPREP = 7;

    /**
     * Profile type: RFC3920 XMPP Resourceprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3920_RESOURCEPREP = 8;

    /**
     * Profile type: RFC4011 Policy MIB Stringprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4011_MIB = 9;

    /**
     * Profile type: RFC4013 SASLprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4013_SASLPREP = 10;

    /**
     * Profile type: RFC4505 trace
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4505_TRACE = 11;

    /**
     * Profile type: RFC4518 LDAP
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4518_LDAP = 12;

    /**
     * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix
     * matching rules
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4518_LDAP_CI = 13;

    // Last available profile
    private static final int MAX_PROFILE = RFC4518_LDAP_CI;

    // Profile names must be aligned to profile type definitions:
    // PROFILE_NAMES[profile] + ".spp" is the data file loaded by getInstance(int).
    private static final String[] PROFILE_NAMES = {
        "rfc3491",      /* RFC3491_NAMEPREP */
        "rfc3530cs",    /* RFC3530_NFS4_CS_PREP */
        "rfc3530csci",  /* RFC3530_NFS4_CS_PREP_CI */
        "rfc3491",      /* RFC3530_NFS4_CIS_PREP */
        "rfc3530mixp",  /* RFC3530_NFS4_MIXED_PREP_PREFIX */
        "rfc3491",      /* RFC3530_NFS4_MIXED_PREP_SUFFIX */
        "rfc3722",      /* RFC3722_ISCSI */
        "rfc3920node",  /* RFC3920_NODEPREP */
        "rfc3920res",   /* RFC3920_RESOURCEPREP */
        "rfc4011",      /* RFC4011_MIB */
        "rfc4013",      /* RFC4013_SASLPREP */
        "rfc4505",      /* RFC4505_TRACE */
        "rfc4518",      /* RFC4518_LDAP */
        "rfc4518ci",    /* RFC4518_LDAP_CI */
    };

    // One weakly-referenced instance per profile type; guarded by synchronized (CACHE).
    @SuppressWarnings({"unchecked", "rawtypes"})
    private static final WeakReference<StringPrep>[] CACHE =
            (WeakReference<StringPrep>[]) new WeakReference[MAX_PROFILE + 1];

    // Code point types decoded from a trie word by getValues()
    private static final int UNASSIGNED = 0x0000;
    private static final int MAP = 0x0001;
    private static final int PROHIBITED = 0x0002;
    private static final int DELETE = 0x0003;
    private static final int TYPE_LIMIT = 0x0004;

    // Bits of indexes[OPTIONS]
    private static final int NORMALIZATION_ON = 0x0001;
    private static final int CHECK_BIDI_ON = 0x0002;

    // Trie words at or above this value encode (type + TYPE_THRESHOLD)
    private static final int TYPE_THRESHOLD = 0xFFF0;
    // (trieWord >> 2) equal to this value marks a code point that is deleted
    private static final int MAX_INDEX_VALUE = 0x3FBF;     /*16139*/
    //private static final int MAX_INDEX_TOP_LENGTH = 0x0003;

    /* indexes[] value names */
//  private static final int INDEX_TRIE_SIZE                  = 0; /* number of bytes in normalization trie */
    private static final int INDEX_MAPPING_DATA_SIZE          = 1; /* The array that contains the mapping   */
    private static final int NORM_CORRECTNS_LAST_UNI_VERSION  = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
    private static final int ONE_UCHAR_MAPPING_INDEX_START    = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
    private static final int TWO_UCHARS_MAPPING_INDEX_START   = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
    private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
    private static final int FOUR_UCHARS_MAPPING_INDEX_START  = 6;
    private static final int OPTIONS                          = 7; /* Bit set of options to turn on in the profile */
    private static final int INDEX_TOP                        = 16; /* changing this requires a new formatVersion */


    // CharTrie implementation for reading the trie data
    private final CharTrie sprepTrie;
    // Indexes read from the data file
    private final int[] indexes;
    // mapping data read from the data file
    private final char[] mappingData;
    // the version of Unicode supported by the data file
    private final VersionInfo sprepUniVer;
    // the Unicode version of last entry in the
    // NormalizationCorrections.txt file if normalization
    // is turned on
    private final VersionInfo normCorrVer;
    // Option to turn on Normalization
    private final boolean doNFKC;
    // Option to turn on checking for BiDi rules
    private final boolean checkBiDi;
    // bidi properties; null unless checkBiDi is set
    private final UBiDiProps bdp;

    /** Looks up the 16-bit trie word stored for the given code point. */
    private char getCodePointValue(int ch) {
        return sprepTrie.getCodePointValue(ch);
    }

    /**
     * Unpacks a version stored as four bytes in one int, most significant
     * byte first (major, minor, milli, micro).
     */
    private static VersionInfo getVersionInfo(int comp) {
        int micro = comp & 0xFF;
        int milli = (comp >> 8) & 0xFF;
        int minor = (comp >> 16) & 0xFF;
        int major = (comp >> 24) & 0xFF;
        return VersionInfo.getInstance(major, minor, milli, micro);
    }

    /**
     * Converts a four-element version byte array (major, minor, milli, micro)
     * into a VersionInfo.
     *
     * @return the version, or null if the array does not have exactly 4 elements
     */
    private static VersionInfo getVersionInfo(byte[] version) {
        if (version.length != 4) {
            return null;
        }
        // Mask each byte so that it is read as an unsigned value (0..255); a
        // plain (int) cast would sign-extend components >= 0x80 to negative
        // numbers. This also matches getVersionInfo(int), which masks each field.
        return VersionInfo.getInstance(
                version[0] & 0xFF, version[1] & 0xFF, version[2] & 0xFF, version[3] & 0xFF);
    }

    /**
     * Creates an StringPrep object after reading the input stream.
     * The object does not hold a reference to the input stream, so the stream can be
     * closed after the method returns.
     *
     * @param inputStream The stream for reading the StringPrep profile binary
     * @throws IOException An exception occurs when I/O of the inputstream is invalid
     * @stable ICU 2.8
     */
    public StringPrep(InputStream inputStream) throws IOException {
        // TODO: Add a public constructor that takes ByteBuffer directly.
        this(ICUBinary.getByteBufferFromInputStreamAndCloseStream(inputStream));
    }

    /**
     * Reads a StringPrep profile from the given buffer: the indexes, then the
     * trie, then the mapping data. The reads are sequential on the same
     * buffer, so their order must not be changed.
     *
     * @throws IOException if normalization is turned on in the profile and the
     *         Unicode version reported by UCharacter is older than both the
     *         profile's Unicode version and its normalization-corrections version
     */
    private StringPrep(ByteBuffer bytes) throws IOException {
        StringPrepDataReader reader = new StringPrepDataReader(bytes);

        // read the indexes
        indexes = reader.readIndexes(INDEX_TOP);

        sprepTrie = new CharTrie(bytes, null);

        // indexes[INDEX_MAPPING_DATA_SIZE] stores the size of mappingData in bytes;
        // load the rest of the data and initialize the data members
        mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE] / 2);

        // get the options
        doNFKC = (indexes[OPTIONS] & NORMALIZATION_ON) > 0;
        checkBiDi = (indexes[OPTIONS] & CHECK_BIDI_ON) > 0;
        sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
        normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
        VersionInfo normUniVer = UCharacter.getUnicodeVersion();
        if (normUniVer.compareTo(sprepUniVer) < 0      /* runtime Unicode version is older than the one of the SPREP file */
                && normUniVer.compareTo(normCorrVer) < 0 /* ... and older than the one of the NormalizationCorrections.txt file */
                && doNFKC                                /* ... and normalization is turned on */
        ) {
            throw new IOException("Normalization Correction version not supported");
        }

        // BiDi properties are only needed when the profile asks for the BiDi check
        bdp = checkBiDi ? UBiDiProps.INSTANCE : null;
    }

    /**
     * Gets a StringPrep instance for the specified profile
     *
     * @param profile The profile passed to find the StringPrep instance.
     * @return the (possibly cached) StringPrep instance for the profile
     * @throws IllegalArgumentException if profile is negative or greater than
     *         the last defined profile type
     * @throws ICUUncheckedIOException if reading the profile data fails with an
     *         IOException
     * @stable ICU 4.2
     */
    public static StringPrep getInstance(int profile) {
        if (profile < 0 || profile > MAX_PROFILE) {
            throw new IllegalArgumentException("Bad profile type");
        }

        StringPrep instance = null;

        // A StringPrep instance is immutable.  We use a single instance
        // per type and store it in the internal cache.
        synchronized (CACHE) {
            WeakReference<StringPrep> ref = CACHE[profile];
            if (ref != null) {
                // may be null again if the referent has been collected
                instance = ref.get();
            }

            if (instance == null) {
                ByteBuffer bytes = ICUBinary.getRequiredData(PROFILE_NAMES[profile] + ".spp");
                if (bytes != null) {
                    try {
                        instance = new StringPrep(bytes);
                    } catch (IOException e) {
                        throw new ICUUncheckedIOException(e);
                    }
                }
                if (instance != null) {
                    CACHE[profile] = new WeakReference<StringPrep>(instance);
                }
            }
        }
        return instance;
    }

    /** Mutable holder for the fields decoded from one trie word by getValues(). */
    private static final class Values {
        boolean isIndex;
        int value;
        int type;

        public void reset() {
            isIndex = false;
            value = 0;
            type = -1;
        }
    }

    /**
     * Decodes a trie word into {@code values}.
     * <ul>
     *   <li>0: type TYPE_LIMIT (no data; the code point is copied as is)</li>
     *   <li>&gt;= TYPE_THRESHOLD: type is {@code trieWord - TYPE_THRESHOLD};
     *       value stays 0</li>
     *   <li>otherwise: type MAP, where bit 1 tells whether the upper 14 bits
     *       are an index into the mapping data or a signed delta; except that
     *       an upper-14-bit value of MAX_INDEX_VALUE means type DELETE</li>
     * </ul>
     */
    private static void getValues(char trieWord, Values values) {
        values.reset();
        if (trieWord == 0) {
            /*
             * Initial value stored in the mapping table
             * just return TYPE_LIMIT .. so that
             * the source codepoint is copied to the destination
             */
            values.type = TYPE_LIMIT;
        } else if (trieWord >= TYPE_THRESHOLD) {
            values.type = (trieWord - TYPE_THRESHOLD);
        } else {
            /* get the type */
            values.type = MAP;
            /* ascertain if the value is index or delta */
            if ((trieWord & 0x02) > 0) {
                values.isIndex = true;
                values.value = trieWord >> 2; // drop the lower 2 bits (unsigned)
            } else {
                values.isIndex = false;
                // sign-extend the 16-bit word, then drop the lower 2 bits
                values.value = ((short) trieWord) >> 2;
            }

            if ((trieWord >> 2) == MAX_INDEX_VALUE) {
                values.type = DELETE;
                values.isIndex = false;
                values.value = 0;
            }
        }
    }

    /**
     * Step 1 of the preparation: applies the profile's mapping table to every
     * code point returned by {@code iter}.
     *
     * @param iter    iterator over the source text
     * @param options bit set of options; only ALLOW_UNASSIGNED is examined
     * @return a new buffer with the mapped text
     * @throws StringPrepParseException with UNASSIGNED_ERROR if an unassigned
     *         code point is found and ALLOW_UNASSIGNED is not set
     */
    private StringBuffer map(UCharacterIterator iter, int options)
                            throws StringPrepParseException {

        Values val = new Values();
        StringBuffer dest = new StringBuffer();
        boolean allowUnassigned = (options & ALLOW_UNASSIGNED) > 0;
        int ch;

        while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {

            getValues(getCodePointValue(ch), val);

            // check if the source codepoint is unassigned
            if (val.type == UNASSIGNED && !allowUnassigned) {
                throw new StringPrepParseException("An unassigned code point was found in the input",
                                         StringPrepParseException.UNASSIGNED_ERROR,
                                         iter.getText(), iter.getIndex());
            } else if (val.type == MAP) {
                if (val.isIndex) {
                    int index = val.value;
                    int length;
                    // The index ranges tell how many UChars the mapping has;
                    // beyond the last range the length is stored in front of the mapping.
                    if (index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                             index < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
                        length = 1;
                    } else if (index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                             index < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
                        length = 2;
                    } else if (index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                             index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
                        length = 3;
                    } else {
                        length = mappingData[index++];
                    }
                    /* copy mapping to destination */
                    dest.append(mappingData, index, length);
                    continue;
                }
                // the value is a delta to subtract from the code point
                ch -= val.value;
            } else if (val.type == DELETE) {
                // just consume the codepoint and continue
                continue;
            }
            //copy the source into destination
            UTF16.append(dest, ch);
        }

        return dest;
    }

    /** Step 2 of the preparation: NFKC normalization with the UNICODE_3_2 option. */
    private StringBuffer normalize(StringBuffer src) {
        return new StringBuffer(
            Normalizer.normalize(
                src.toString(),
                Normalizer.NFKC,
                Normalizer.UNICODE_3_2));
    }

    /** Tells whether a BiDi class is "R" or "AL" (a "RandALCat" character in RFC 3454 terms). */
    private static boolean isRandALCat(int direction) {
        return direction == UCharacterDirection.RIGHT_TO_LEFT
                || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
    }

    /*
    boolean isLabelSeparator(int ch){
        int result = getCodePointValue(ch);
        if( (result & 0x07)  == LABEL_SEPARATOR){
            return true;
        }
        return false;
    }
    */
     /*
       1) Map -- For each character in the input, check if it has a mapping
          and, if so, replace it with its mapping.

       2) Normalize -- Possibly normalize the result of step 1 using Unicode
          normalization.

       3) Prohibit -- Check for any characters that are not allowed in the
          output.  If any are found, return an error.

       4) Check bidi -- Possibly check for right-to-left characters, and if
          any are found, make sure that the whole string satisfies the
          requirements for bidirectional strings.  If the string does not
          satisfy the requirements for bidirectional strings, return an
          error.
          [Unicode3.2] defines several bidirectional categories; each character
           has one bidirectional category assigned to it.  For the purposes of
           the requirements below, an "RandALCat character" is a character that
           has Unicode bidirectional categories "R" or "AL"; an "LCat character"
           is a character that has Unicode bidirectional category "L".  Note
           that there are many characters which fall in neither of the above
           definitions; Latin digits (<U+0030> through <U+0039>) are examples of
           this because they have bidirectional category "EN".

           In any profile that specifies bidirectional character handling, all
           three of the following requirements MUST be met:

           1) The characters in section 5.8 MUST be prohibited.

           2) If a string contains any RandALCat character, the string MUST NOT
              contain any LCat character.

           3) If a string contains any RandALCat character, a RandALCat
              character MUST be the first character of the string, and a
              RandALCat character MUST be the last character of the string.
    */
    /**
     * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
     * checks for prohibited and BiDi characters in the order defined by RFC 3454
     * depending on the options specified in the profile.
     *
     * @param src           A UCharacterIterator object containing the source string
     * @param options       A bit set of options:
     *   <ul>
     *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
     *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points in the input
     *          as normal Unicode code points.</li>
     *   </ul>
     * @return StringBuffer A StringBuffer containing the output
     * @throws StringPrepParseException An exception occurs when parsing a string is invalid:
     *         an unassigned code point is found while not allowed, a prohibited
     *         code point is found, or (if the profile checks BiDi) the BiDi
     *         requirements are not met.
     * @stable ICU 2.8
     */
    public StringBuffer prepare(UCharacterIterator src, int options)
                        throws StringPrepParseException {

        // map
        StringBuffer mapOut = map(src, options);

        // normalize, if the profile asks for it
        StringBuffer normOut = doNFKC ? normalize(mapOut) : mapOut;

        int ch;
        UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
        Values val = new Values();
        // CHAR_DIRECTION_COUNT serves as the "not seen yet" marker. After the
        // loop, direction holds the BiDi class of the last code point.
        int direction = UCharacterDirection.CHAR_DIRECTION_COUNT;
        int firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
        int rtlPos = -1, ltrPos = -1;
        boolean rightToLeft = false, leftToRight = false;

        while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
            getValues(getCodePointValue(ch), val);

            if (val.type == PROHIBITED) {
                // Report where the offending code point was found, like the
                // unassigned-code-point error in map(). val.value cannot be used
                // as the position: getValues() resets it to 0 and never sets it
                // for PROHIBITED entries, so the error always pointed at offset 0.
                throw new StringPrepParseException("A prohibited code point was found in the input",
                                         StringPrepParseException.PROHIBITED_ERROR,
                                         iter.getText(), iter.getIndex());
            }

            if (checkBiDi) {
                direction = bdp.getClass(ch);
                if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
                    firstCharDir = direction;
                }
                if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
                    leftToRight = true;
                    ltrPos = iter.getIndex() - 1;
                }
                if (isRandALCat(direction)) {
                    rightToLeft = true;
                    rtlPos = iter.getIndex() - 1;
                }
            }
        }

        if (checkBiDi) {
            // satisfy 2: no LCat character together with a RandALCat character
            if (leftToRight && rightToLeft) {
                throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
                                         StringPrepParseException.CHECK_BIDI_ERROR, iter.getText(),
                                         (rtlPos > ltrPos) ? rtlPos : ltrPos);
            }

            // satisfy 3: with any RandALCat character present, both the first
            // and the last character must be RandALCat
            if (rightToLeft && !(isRandALCat(firstCharDir) && isRandALCat(direction))) {
                throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
                                         StringPrepParseException.CHECK_BIDI_ERROR, iter.getText(),
                                         (rtlPos > ltrPos) ? rtlPos : ltrPos);
            }
        }
        return normOut;
    }

    /**
     * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC),
     * checks for prohibited and BiDi characters in the order defined by RFC 3454
     * depending on the options specified in the profile.
     *
     * @param src           A string
     * @param options       A bit set of options:
     *   <ul>
     *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
     *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points in the input
     *          as normal Unicode code points.</li>
     *   </ul>
     * @return String A String containing the output
     * @throws StringPrepParseException An exception when parsing or preparing a string is invalid.
     * @stable ICU 4.2
     */
    public String prepare(String src, int options)
        throws StringPrepParseException {
        StringBuffer result = prepare(UCharacterIterator.getInstance(src), options);
        return result.toString();
    }
}