1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/** 3 ******************************************************************************* 4 * Copyright (C) 1996-2016, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ******************************************************************************* 7 */ 8 9package android.icu.text; 10 11/** 12 * <p> 13 * Standalone utility class providing UTF16 character conversions and indexing conversions. 14 * </p> 15 * <p> 16 * Code that uses strings alone rarely need modification. By design, UTF-16 does not allow overlap, 17 * so searching for strings is a safe operation. Similarly, concatenation is always safe. 18 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the 19 * values for start and end are on those boundaries, since they arose from operations like 20 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>. 21 * </p> 22 * <strong>Examples:</strong> 23 * <p> 24 * The following examples illustrate use of some of these methods. 
*
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i < s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i >= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 *
 * <strong>Notes:</strong>
 * <ul>
 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 * </li>
 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
 * check for validity if desired.
</li> 69 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then 70 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It 71 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4, 72 * 5.5). </li> 73 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the 74 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceeding small 75 * percentage of all the text in the world, the singleton case should always be optimized for. </li> 76 * </ul> 77 * 78 * @author Mark Davis, with help from Markus Scherer 79 * @hide Only a subset of ICU is exposed in Android 80 */ 81 82public final class UTF16 { 83 // public variables --------------------------------------------------- 84 85 /** 86 * Value returned in {@link #bounds(String, int) bounds()}. 87 * These values are chosen specifically so that it actually represents the position of the 88 * character [offset16 - (value >> 2), offset16 + (value & 3)] 89 */ 90 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2, 91 TRAIL_SURROGATE_BOUNDARY = 5; 92 93 /** 94 * The lowest Unicode code point value. 95 */ 96 public static final int CODEPOINT_MIN_VALUE = 0; 97 98 /** 99 * The highest Unicode code point value (scalar value) according to the Unicode Standard. 
100 */ 101 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 102 103 /** 104 * The minimum value for Supplementary code points 105 */ 106 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 107 108 /** 109 * Lead surrogate minimum value 110 */ 111 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 112 113 /** 114 * Trail surrogate minimum value 115 */ 116 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 117 118 /** 119 * Lead surrogate maximum value 120 */ 121 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 122 123 /** 124 * Trail surrogate maximum value 125 */ 126 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 127 128 /** 129 * Surrogate minimum value 130 */ 131 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 132 133 /** 134 * Maximum surrogate value 135 */ 136 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE; 137 138 /** 139 * Lead surrogate bitmask 140 */ 141 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; 142 143 /** 144 * Trail surrogate bitmask 145 */ 146 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; 147 148 /** 149 * Surrogate bitmask 150 */ 151 private static final int SURROGATE_BITMASK = 0xFFFFF800; 152 153 /** 154 * Lead surrogate bits 155 */ 156 private static final int LEAD_SURROGATE_BITS = 0xD800; 157 158 /** 159 * Trail surrogate bits 160 */ 161 private static final int TRAIL_SURROGATE_BITS = 0xDC00; 162 163 /** 164 * Surrogate bits 165 */ 166 private static final int SURROGATE_BITS = 0xD800; 167 168 // constructor -------------------------------------------------------- 169 170 // /CLOVER:OFF 171 /** 172 * Prevent instance from being created. 173 */ 174 private UTF16() { 175 } 176 177 // /CLOVER:ON 178 // public method ------------------------------------------------------ 179 180 /** 181 * Extract a single UTF-32 value from a string. 
Used when iterating forwards or backwards (with 182 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 183 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 184 * UCharacter.isLegal()</a></code> 185 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 186 * character will be returned. If a complete supplementary character is not found the incomplete 187 * character will be returned 188 * 189 * @param source Array of UTF-16 chars 190 * @param offset16 UTF-16 offset to the start of the character. 191 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 192 * of that codepoint are the same as in <code>bounds32()</code>. 193 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 194 */ 195 public static int charAt(String source, int offset16) { 196 char single = source.charAt(offset16); 197 if (single < LEAD_SURROGATE_MIN_VALUE) { 198 return single; 199 } 200 return _charAt(source, offset16, single); 201 } 202 203 private static int _charAt(String source, int offset16, char single) { 204 if (single > TRAIL_SURROGATE_MAX_VALUE) { 205 return single; 206 } 207 208 // Convert the UTF-16 surrogate pair if necessary. 209 // For simplicity in usage, and because the frequency of pairs is 210 // low, look both directions. 
211 212 if (single <= LEAD_SURROGATE_MAX_VALUE) { 213 ++offset16; 214 if (source.length() != offset16) { 215 char trail = source.charAt(offset16); 216 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 217 return Character.toCodePoint(single, trail); 218 } 219 } 220 } else { 221 --offset16; 222 if (offset16 >= 0) { 223 // single is a trail surrogate so 224 char lead = source.charAt(offset16); 225 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 226 return Character.toCodePoint(lead, single); 227 } 228 } 229 } 230 return single; // return unmatched surrogate 231 } 232 233 /** 234 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 235 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 236 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 237 * UCharacter.isLegal()</a></code> 238 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 239 * character will be returned. If a complete supplementary character is not found the incomplete 240 * character will be returned 241 * 242 * @param source Array of UTF-16 chars 243 * @param offset16 UTF-16 offset to the start of the character. 244 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 245 * of that codepoint are the same as in <code>bounds32()</code>. 246 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 247 */ 248 public static int charAt(CharSequence source, int offset16) { 249 char single = source.charAt(offset16); 250 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 251 return single; 252 } 253 return _charAt(source, offset16, single); 254 } 255 256 private static int _charAt(CharSequence source, int offset16, char single) { 257 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 258 return single; 259 } 260 261 // Convert the UTF-16 surrogate pair if necessary. 
262 // For simplicity in usage, and because the frequency of pairs is 263 // low, look both directions. 264 265 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 266 ++offset16; 267 if (source.length() != offset16) { 268 char trail = source.charAt(offset16); 269 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 270 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 271 return Character.toCodePoint(single, trail); 272 } 273 } 274 } else { 275 --offset16; 276 if (offset16 >= 0) { 277 // single is a trail surrogate so 278 char lead = source.charAt(offset16); 279 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 280 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 281 return Character.toCodePoint(lead, single); 282 } 283 } 284 } 285 return single; // return unmatched surrogate 286 } 287 288 /** 289 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 290 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 291 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 292 * </a></code> 293 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 294 * character will be returned. If a complete supplementary character is not found the incomplete 295 * character will be returned 296 * 297 * @param source UTF-16 chars string buffer 298 * @param offset16 UTF-16 offset to the start of the character. 299 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 300 * of that codepoint are the same as in <code>bounds32()</code>. 301 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 
302 */ 303 public static int charAt(StringBuffer source, int offset16) { 304 if (offset16 < 0 || offset16 >= source.length()) { 305 throw new StringIndexOutOfBoundsException(offset16); 306 } 307 308 char single = source.charAt(offset16); 309 if (!isSurrogate(single)) { 310 return single; 311 } 312 313 // Convert the UTF-16 surrogate pair if necessary. 314 // For simplicity in usage, and because the frequency of pairs is 315 // low, look both directions. 316 317 if (single <= LEAD_SURROGATE_MAX_VALUE) { 318 ++offset16; 319 if (source.length() != offset16) { 320 char trail = source.charAt(offset16); 321 if (isTrailSurrogate(trail)) 322 return Character.toCodePoint(single, trail); 323 } 324 } else { 325 --offset16; 326 if (offset16 >= 0) { 327 // single is a trail surrogate so 328 char lead = source.charAt(offset16); 329 if (isLeadSurrogate(lead)) { 330 return Character.toCodePoint(lead, single); 331 } 332 } 333 } 334 return single; // return unmatched surrogate 335 } 336 337 /** 338 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 339 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 340 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 341 * </a></code> 342 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 343 * character will be returned. If a complete supplementary character is not found the incomplete 344 * character will be returned 345 * 346 * @param source Array of UTF-16 chars 347 * @param start Offset to substring in the source array for analyzing 348 * @param limit Offset to substring in the source array for analyzing 349 * @param offset16 UTF-16 offset relative to start 350 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 351 * of that codepoint are the same as in <code>bounds32()</code>. 
352 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 353 */ 354 public static int charAt(char source[], int start, int limit, int offset16) { 355 offset16 += start; 356 if (offset16 < start || offset16 >= limit) { 357 throw new ArrayIndexOutOfBoundsException(offset16); 358 } 359 360 char single = source[offset16]; 361 if (!isSurrogate(single)) { 362 return single; 363 } 364 365 // Convert the UTF-16 surrogate pair if necessary. 366 // For simplicity in usage, and because the frequency of pairs is 367 // low, look both directions. 368 if (single <= LEAD_SURROGATE_MAX_VALUE) { 369 offset16++; 370 if (offset16 >= limit) { 371 return single; 372 } 373 char trail = source[offset16]; 374 if (isTrailSurrogate(trail)) { 375 return Character.toCodePoint(single, trail); 376 } 377 } else { // isTrailSurrogate(single), so 378 if (offset16 == start) { 379 return single; 380 } 381 offset16--; 382 char lead = source[offset16]; 383 if (isLeadSurrogate(lead)) 384 return Character.toCodePoint(lead, single); 385 } 386 return single; // return unmatched surrogate 387 } 388 389 /** 390 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 391 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 392 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 393 * </a></code> 394 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 395 * character will be returned. If a complete supplementary character is not found the incomplete 396 * character will be returned 397 * 398 * @param source UTF-16 chars string buffer 399 * @param offset16 UTF-16 offset to the start of the character. 400 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 401 * of that codepoint are the same as in <code>bounds32()</code>. 
402 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 403 */ 404 public static int charAt(Replaceable source, int offset16) { 405 if (offset16 < 0 || offset16 >= source.length()) { 406 throw new StringIndexOutOfBoundsException(offset16); 407 } 408 409 char single = source.charAt(offset16); 410 if (!isSurrogate(single)) { 411 return single; 412 } 413 414 // Convert the UTF-16 surrogate pair if necessary. 415 // For simplicity in usage, and because the frequency of pairs is 416 // low, look both directions. 417 418 if (single <= LEAD_SURROGATE_MAX_VALUE) { 419 ++offset16; 420 if (source.length() != offset16) { 421 char trail = source.charAt(offset16); 422 if (isTrailSurrogate(trail)) 423 return Character.toCodePoint(single, trail); 424 } 425 } else { 426 --offset16; 427 if (offset16 >= 0) { 428 // single is a trail surrogate so 429 char lead = source.charAt(offset16); 430 if (isLeadSurrogate(lead)) { 431 return Character.toCodePoint(lead, single); 432 } 433 } 434 } 435 return single; // return unmatched surrogate 436 } 437 438 /** 439 * Determines how many chars this char32 requires. If a validity check is required, use <code> 440 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 441 * on char32 before calling. 442 * 443 * @param char32 The input codepoint. 444 * @return 2 if is in supplementary space, otherwise 1. 445 */ 446 public static int getCharCount(int char32) { 447 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 448 return 1; 449 } 450 return 2; 451 } 452 453 /** 454 * Returns the type of the boundaries around the char at offset16. Used for random access. 
455 * 456 * @param source Text to analyse 457 * @param offset16 UTF-16 offset 458 * @return 459 * <ul> 460 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1] 461 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 462 * are [offset16, offset16 + 2] 463 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 464 * bounds are [offset16 - 1, offset16 + 1] 465 * </ul> 466 * For bit-twiddlers, the return values for these are chosen so that the boundaries 467 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 468 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 469 */ 470 public static int bounds(String source, int offset16) { 471 char ch = source.charAt(offset16); 472 if (isSurrogate(ch)) { 473 if (isLeadSurrogate(ch)) { 474 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 475 return LEAD_SURROGATE_BOUNDARY; 476 } 477 } else { 478 // isTrailSurrogate(ch), so 479 --offset16; 480 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 481 return TRAIL_SURROGATE_BOUNDARY; 482 } 483 } 484 } 485 return SINGLE_CHAR_BOUNDARY; 486 } 487 488 /** 489 * Returns the type of the boundaries around the char at offset16. Used for random access. 490 * 491 * @param source String buffer to analyse 492 * @param offset16 UTF16 offset 493 * @return 494 * <ul> 495 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] 496 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 497 * are [offset16, offset16 + 2] 498 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 499 * bounds are [offset16 - 1, offset16 + 1] 500 * </ul> 501 * For bit-twiddlers, the return values for these are chosen so that the boundaries 502 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 
503 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 504 */ 505 public static int bounds(StringBuffer source, int offset16) { 506 char ch = source.charAt(offset16); 507 if (isSurrogate(ch)) { 508 if (isLeadSurrogate(ch)) { 509 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 510 return LEAD_SURROGATE_BOUNDARY; 511 } 512 } else { 513 // isTrailSurrogate(ch), so 514 --offset16; 515 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 516 return TRAIL_SURROGATE_BOUNDARY; 517 } 518 } 519 } 520 return SINGLE_CHAR_BOUNDARY; 521 } 522 523 /** 524 * Returns the type of the boundaries around the char at offset16. Used for random access. Note 525 * that the boundaries are determined with respect to the subarray, hence the char array 526 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1. 527 * 528 * @param source Char array to analyse 529 * @param start Offset to substring in the source array for analyzing 530 * @param limit Offset to substring in the source array for analyzing 531 * @param offset16 UTF16 offset relative to start 532 * @return 533 * <ul> 534 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are 535 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 536 * are [offset16, offset16 + 2] 537 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 538 * bounds are [offset16 - 1, offset16 + 1] 539 * </ul> 540 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries 541 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)]. 542 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
543 */ 544 public static int bounds(char source[], int start, int limit, int offset16) { 545 offset16 += start; 546 if (offset16 < start || offset16 >= limit) { 547 throw new ArrayIndexOutOfBoundsException(offset16); 548 } 549 char ch = source[offset16]; 550 if (isSurrogate(ch)) { 551 if (isLeadSurrogate(ch)) { 552 ++offset16; 553 if (offset16 < limit && isTrailSurrogate(source[offset16])) { 554 return LEAD_SURROGATE_BOUNDARY; 555 } 556 } else { // isTrailSurrogate(ch), so 557 --offset16; 558 if (offset16 >= start && isLeadSurrogate(source[offset16])) { 559 return TRAIL_SURROGATE_BOUNDARY; 560 } 561 } 562 } 563 return SINGLE_CHAR_BOUNDARY; 564 } 565 566 /** 567 * Determines whether the code value is a surrogate. 568 * 569 * @param char16 The input character. 570 * @return true If the input character is a surrogate. 571 */ 572 public static boolean isSurrogate(char char16) { 573 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; 574 } 575 576 /** 577 * Determines whether the character is a trail surrogate. 578 * 579 * @param char16 The input character. 580 * @return true If the input character is a trail surrogate. 581 */ 582 public static boolean isTrailSurrogate(char char16) { 583 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 584 } 585 586 /** 587 * Determines whether the character is a lead surrogate. 588 * 589 * @param char16 The input character. 590 * @return true If the input character is a lead surrogate 591 */ 592 public static boolean isLeadSurrogate(char char16) { 593 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 594 } 595 596 /** 597 * Returns the lead surrogate. If a validity check is required, use 598 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 599 * before calling. 600 * 601 * @param char32 The input character. 602 * @return lead surrogate if the getCharCount(ch) is 2; <br> 603 * and 0 otherwise (note: 0 is not a valid lead surrogate). 
604 */ 605 public static char getLeadSurrogate(int char32) { 606 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 607 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_)); 608 } 609 return 0; 610 } 611 612 /** 613 * Returns the trail surrogate. If a validity check is required, use 614 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 615 * before calling. 616 * 617 * @param char32 The input character. 618 * @return the trail surrogate if the getCharCount(ch) is 2; <br> 619 * otherwise the character itself 620 */ 621 public static char getTrailSurrogate(int char32) { 622 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 623 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_)); 624 } 625 return (char) char32; 626 } 627 628 /** 629 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string 630 * containing the UTF-32 value in UTF16 format. If a validity check is required, use 631 * {@link android.icu.lang.UCharacter#isLegal(int)} on char32 before calling. 632 * 633 * @param char32 The input character. 634 * @return string value of char32 in UTF16 format 635 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint. 636 */ 637 public static String valueOf(int char32) { 638 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 639 throw new IllegalArgumentException("Illegal codepoint"); 640 } 641 return toString(char32); 642 } 643 644 /** 645 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or 646 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate 647 * character, the whole supplementary codepoint will be returned. If a validity check is 648 * required, use {@link android.icu.lang.UCharacter#isLegal(int)} on the 649 * codepoint at offset16 before calling. The result returned will be a newly created String 650 * obtained by calling source.substring(..) 
with the appropriate indexes. 651 * 652 * @param source The input string. 653 * @param offset16 The UTF16 index to the codepoint in source 654 * @return string value of char32 in UTF16 format 655 */ 656 public static String valueOf(String source, int offset16) { 657 switch (bounds(source, offset16)) { 658 case LEAD_SURROGATE_BOUNDARY: 659 return source.substring(offset16, offset16 + 2); 660 case TRAIL_SURROGATE_BOUNDARY: 661 return source.substring(offset16 - 1, offset16 + 1); 662 default: 663 return source.substring(offset16, offset16 + 1); 664 } 665 } 666 667 /** 668 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a 669 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a 670 * surrogate character, the whole supplementary codepoint will be returned. If a validity check 671 * is required, use {@link android.icu.lang.UCharacter#isLegal(int)} on 672 * the codepoint at offset16 before calling. The result returned will be a newly created String 673 * obtained by calling source.substring(..) with the appropriate indexes. 674 * 675 * @param source The input string buffer. 676 * @param offset16 The UTF16 index to the codepoint in source 677 * @return string value of char32 in UTF16 format 678 */ 679 public static String valueOf(StringBuffer source, int offset16) { 680 switch (bounds(source, offset16)) { 681 case LEAD_SURROGATE_BOUNDARY: 682 return source.substring(offset16, offset16 + 2); 683 case TRAIL_SURROGATE_BOUNDARY: 684 return source.substring(offset16 - 1, offset16 + 1); 685 default: 686 return source.substring(offset16, offset16 + 1); 687 } 688 } 689 690 /** 691 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16 692 * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be 693 * returned, except when either the leading or trailing surrogate character lies out of the 694 * specified subarray. 
In the latter case, only the surrogate character within bounds will be 695 * returned. If a validity check is required, use 696 * {@link android.icu.lang.UCharacter#isLegal(int)} on the codepoint at 697 * offset16 before calling. The result returned will be a newly created String containing the 698 * relevant characters. 699 * 700 * @param source The input char array. 701 * @param start Start index of the subarray 702 * @param limit End index of the subarray 703 * @param offset16 The UTF16 index to the codepoint in source relative to start 704 * @return string value of char32 in UTF16 format 705 */ 706 public static String valueOf(char source[], int start, int limit, int offset16) { 707 switch (bounds(source, start, limit, offset16)) { 708 case LEAD_SURROGATE_BOUNDARY: 709 return new String(source, start + offset16, 2); 710 case TRAIL_SURROGATE_BOUNDARY: 711 return new String(source, start + offset16 - 1, 2); 712 } 713 return new String(source, start + offset16, 1); 714 } 715 716 /** 717 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 718 * the {@link UTF16 class description} for notes on roundtripping. 719 * 720 * @param source The UTF-16 string 721 * @param offset32 UTF-32 offset 722 * @return UTF-16 offset 723 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 
724 */ 725 public static int findOffsetFromCodePoint(String source, int offset32) { 726 char ch; 727 int size = source.length(), result = 0, count = offset32; 728 if (offset32 < 0 || offset32 > size) { 729 throw new StringIndexOutOfBoundsException(offset32); 730 } 731 while (result < size && count > 0) { 732 ch = source.charAt(result); 733 if (isLeadSurrogate(ch) && ((result + 1) < size) 734 && isTrailSurrogate(source.charAt(result + 1))) { 735 result++; 736 } 737 738 count--; 739 result++; 740 } 741 if (count != 0) { 742 throw new StringIndexOutOfBoundsException(offset32); 743 } 744 return result; 745 } 746 747 /** 748 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 749 * the {@link UTF16 class description} for notes on roundtripping. 750 * 751 * @param source The UTF-16 string buffer 752 * @param offset32 UTF-32 offset 753 * @return UTF-16 offset 754 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 755 */ 756 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) { 757 char ch; 758 int size = source.length(), result = 0, count = offset32; 759 if (offset32 < 0 || offset32 > size) { 760 throw new StringIndexOutOfBoundsException(offset32); 761 } 762 while (result < size && count > 0) { 763 ch = source.charAt(result); 764 if (isLeadSurrogate(ch) && ((result + 1) < size) 765 && isTrailSurrogate(source.charAt(result + 1))) { 766 result++; 767 } 768 769 count--; 770 result++; 771 } 772 if (count != 0) { 773 throw new StringIndexOutOfBoundsException(offset32); 774 } 775 return result; 776 } 777 778 /** 779 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 780 * the {@link UTF16 class description} for notes on roundtripping. 
781 * 782 * @param source The UTF-16 char array whose substring is to be analysed 783 * @param start Offset of the substring to be analysed 784 * @param limit Offset of the substring to be analysed 785 * @param offset32 UTF-32 offset relative to start 786 * @return UTF-16 offset relative to start 787 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 788 */ 789 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) { 790 char ch; 791 int result = start, count = offset32; 792 if (offset32 > limit - start) { 793 throw new ArrayIndexOutOfBoundsException(offset32); 794 } 795 while (result < limit && count > 0) { 796 ch = source[result]; 797 if (isLeadSurrogate(ch) && ((result + 1) < limit) 798 && isTrailSurrogate(source[result + 1])) { 799 result++; 800 } 801 802 count--; 803 result++; 804 } 805 if (count != 0) { 806 throw new ArrayIndexOutOfBoundsException(offset32); 807 } 808 return result - start; 809 } 810 811 /** 812 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given 813 * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for 814 * notes on roundtripping.<br> 815 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 816 * of the <strong>lead</strong> of the pair is returned. </i> 817 * <p> 818 * To find the UTF-32 length of a string, use: 819 * 820 * <pre> 821 * len32 = countCodePoint(source, source.length()); 822 * </pre> 823 * 824 * @param source Text to analyse 825 * @param offset16 UTF-16 offset < source text length. 826 * @return UTF-32 offset 827 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
828 */ 829 public static int findCodePointOffset(String source, int offset16) { 830 if (offset16 < 0 || offset16 > source.length()) { 831 throw new StringIndexOutOfBoundsException(offset16); 832 } 833 834 int result = 0; 835 char ch; 836 boolean hadLeadSurrogate = false; 837 838 for (int i = 0; i < offset16; ++i) { 839 ch = source.charAt(i); 840 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 841 hadLeadSurrogate = false; // count valid trail as zero 842 } else { 843 hadLeadSurrogate = isLeadSurrogate(ch); 844 ++result; // count others as 1 845 } 846 } 847 848 if (offset16 == source.length()) { 849 return result; 850 } 851 852 // end of source being the less significant surrogate character 853 // shift result back to the start of the supplementary character 854 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 855 result--; 856 } 857 858 return result; 859 } 860 861 /** 862 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 863 * offset. Used for random access. See the {@link UTF16 class description} for notes on 864 * roundtripping.<br> 865 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 866 * of the <strong>lead</strong> of the pair is returned. </i> 867 * <p> 868 * To find the UTF-32 length of a string, use: 869 * 870 * <pre> 871 * len32 = countCodePoint(source); 872 * </pre> 873 * 874 * @param source Text to analyse 875 * @param offset16 UTF-16 offset < source text length. 876 * @return UTF-32 offset 877 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
878 */ 879 public static int findCodePointOffset(StringBuffer source, int offset16) { 880 if (offset16 < 0 || offset16 > source.length()) { 881 throw new StringIndexOutOfBoundsException(offset16); 882 } 883 884 int result = 0; 885 char ch; 886 boolean hadLeadSurrogate = false; 887 888 for (int i = 0; i < offset16; ++i) { 889 ch = source.charAt(i); 890 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 891 hadLeadSurrogate = false; // count valid trail as zero 892 } else { 893 hadLeadSurrogate = isLeadSurrogate(ch); 894 ++result; // count others as 1 895 } 896 } 897 898 if (offset16 == source.length()) { 899 return result; 900 } 901 902 // end of source being the less significant surrogate character 903 // shift result back to the start of the supplementary character 904 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 905 result--; 906 } 907 908 return result; 909 } 910 911 /** 912 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 913 * offset. Used for random access. See the {@link UTF16 class description} for notes on 914 * roundtripping.<br> 915 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 916 * of the <strong>lead</strong> of the pair is returned. </i> 917 * <p> 918 * To find the UTF-32 length of a substring, use: 919 * 920 * <pre> 921 * len32 = countCodePoint(source, start, limit); 922 * </pre> 923 * 924 * @param source Text to analyse 925 * @param start Offset of the substring 926 * @param limit Offset of the substring 927 * @param offset16 UTF-16 relative to start 928 * @return UTF-32 offset relative to start 929 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
930 */ 931 public static int findCodePointOffset(char source[], int start, int limit, int offset16) { 932 offset16 += start; 933 if (offset16 > limit) { 934 throw new StringIndexOutOfBoundsException(offset16); 935 } 936 937 int result = 0; 938 char ch; 939 boolean hadLeadSurrogate = false; 940 941 for (int i = start; i < offset16; ++i) { 942 ch = source[i]; 943 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 944 hadLeadSurrogate = false; // count valid trail as zero 945 } else { 946 hadLeadSurrogate = isLeadSurrogate(ch); 947 ++result; // count others as 1 948 } 949 } 950 951 if (offset16 == limit) { 952 return result; 953 } 954 955 // end of source being the less significant surrogate character 956 // shift result back to the start of the supplementary character 957 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { 958 result--; 959 } 960 961 return result; 962 } 963 964 /** 965 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, 966 * use {@link android.icu.lang.UCharacter#isLegal(int)} on char32 before 967 * calling. 968 * 969 * @param target The buffer to append to 970 * @param char32 Value to append. 971 * @return the updated StringBuffer 972 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints 973 */ 974 public static StringBuffer append(StringBuffer target, int char32) { 975 // Check for irregular values 976 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 977 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 978 } 979 980 // Write the UTF-16 values 981 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 982 target.append(getLeadSurrogate(char32)); 983 target.append(getTrailSurrogate(char32)); 984 } else { 985 target.append((char) char32); 986 } 987 return target; 988 } 989 990 /** 991 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a 992 * convenience. 
993 * 994 * @param target The buffer to append to 995 * @param cp The code point to append 996 * @return the updated StringBuffer 997 * @throws IllegalArgumentException If cp is not a valid code point 998 */ 999 public static StringBuffer appendCodePoint(StringBuffer target, int cp) { 1000 return append(target, cp); 1001 } 1002 1003 /** 1004 * Adds a codepoint to offset16 position of the argument char array. 1005 * 1006 * @param target Char array to be append with the new code point 1007 * @param limit UTF16 offset which the codepoint will be appended. 1008 * @param char32 Code point to be appended 1009 * @return offset after char32 in the array. 1010 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not 1011 * lie within the range of the Unicode codepoints. 1012 */ 1013 public static int append(char[] target, int limit, int char32) { 1014 // Check for irregular values 1015 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1016 throw new IllegalArgumentException("Illegal codepoint"); 1017 } 1018 // Write the UTF-16 values 1019 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1020 target[limit++] = getLeadSurrogate(char32); 1021 target[limit++] = getTrailSurrogate(char32); 1022 } else { 1023 target[limit++] = (char) char32; 1024 } 1025 return limit; 1026 } 1027 1028 /** 1029 * Number of codepoints in a UTF16 String 1030 * 1031 * @param source UTF16 string 1032 * @return number of codepoint in string 1033 */ 1034 public static int countCodePoint(String source) { 1035 if (source == null || source.length() == 0) { 1036 return 0; 1037 } 1038 return findCodePointOffset(source, source.length()); 1039 } 1040 1041 /** 1042 * Number of codepoints in a UTF16 String buffer 1043 * 1044 * @param source UTF16 string buffer 1045 * @return number of codepoint in string 1046 */ 1047 public static int countCodePoint(StringBuffer source) { 1048 if (source == null || source.length() == 0) { 1049 return 0; 1050 } 1051 
return findCodePointOffset(source, source.length()); 1052 } 1053 1054 /** 1055 * Number of codepoints in a UTF16 char array substring 1056 * 1057 * @param source UTF16 char array 1058 * @param start Offset of the substring 1059 * @param limit Offset of the substring 1060 * @return number of codepoint in the substring 1061 * @exception IndexOutOfBoundsException If start and limit are not valid. 1062 */ 1063 public static int countCodePoint(char source[], int start, int limit) { 1064 if (source == null || source.length == 0) { 1065 return 0; 1066 } 1067 return findCodePointOffset(source, start, limit, limit - start); 1068 } 1069 1070 /** 1071 * Set a code point into a UTF16 position. Adjusts target according if we are replacing a 1072 * non-supplementary codepoint with a supplementary and vice versa. 1073 * 1074 * @param target Stringbuffer 1075 * @param offset16 UTF16 position to insert into 1076 * @param char32 Code point 1077 */ 1078 public static void setCharAt(StringBuffer target, int offset16, int char32) { 1079 int count = 1; 1080 char single = target.charAt(offset16); 1081 1082 if (isSurrogate(single)) { 1083 // pairs of the surrogate with offset16 at the lead char found 1084 if (isLeadSurrogate(single) && (target.length() > offset16 + 1) 1085 && isTrailSurrogate(target.charAt(offset16 + 1))) { 1086 count++; 1087 } else { 1088 // pairs of the surrogate with offset16 at the trail char 1089 // found 1090 if (isTrailSurrogate(single) && (offset16 > 0) 1091 && isLeadSurrogate(target.charAt(offset16 - 1))) { 1092 offset16--; 1093 count++; 1094 } 1095 } 1096 } 1097 target.replace(offset16, offset16 + count, valueOf(char32)); 1098 } 1099 1100 /** 1101 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are 1102 * replacing a non-supplementary codepoint with a supplementary and vice versa. 1103 * 1104 * @param target char array 1105 * @param limit numbers of valid chars in target, different from target.length. 
limit counts the 1106 * number of chars in target that represents a string, not the size of array target. 1107 * @param offset16 UTF16 position to insert into 1108 * @param char32 code point 1109 * @return new number of chars in target that represents a string 1110 * @exception IndexOutOfBoundsException if offset16 is out of range 1111 */ 1112 public static int setCharAt(char target[], int limit, int offset16, int char32) { 1113 if (offset16 >= limit) { 1114 throw new ArrayIndexOutOfBoundsException(offset16); 1115 } 1116 int count = 1; 1117 char single = target[offset16]; 1118 1119 if (isSurrogate(single)) { 1120 // pairs of the surrogate with offset16 at the lead char found 1121 if (isLeadSurrogate(single) && (target.length > offset16 + 1) 1122 && isTrailSurrogate(target[offset16 + 1])) { 1123 count++; 1124 } else { 1125 // pairs of the surrogate with offset16 at the trail char 1126 // found 1127 if (isTrailSurrogate(single) && (offset16 > 0) 1128 && isLeadSurrogate(target[offset16 - 1])) { 1129 offset16--; 1130 count++; 1131 } 1132 } 1133 } 1134 1135 String str = valueOf(char32); 1136 int result = limit; 1137 int strlength = str.length(); 1138 target[offset16] = str.charAt(0); 1139 if (count == strlength) { 1140 if (count == 2) { 1141 target[offset16 + 1] = str.charAt(1); 1142 } 1143 } else { 1144 // this is not exact match in space, we'll have to do some 1145 // shifting 1146 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit 1147 - (offset16 + count)); 1148 if (count < strlength) { 1149 // char32 is a supplementary character trying to squeeze into 1150 // a non-supplementary space 1151 target[offset16 + 1] = str.charAt(1); 1152 result++; 1153 if (result < target.length) { 1154 target[result] = 0; 1155 } 1156 } else { 1157 // char32 is a non-supplementary character trying to fill 1158 // into a supplementary space 1159 result--; 1160 target[result] = 0; 1161 } 1162 } 1163 return result; 1164 } 1165 1166 /** 1167 * Shifts offset16 by 
the argument number of codepoints 1168 * 1169 * @param source string 1170 * @param offset16 UTF16 position to shift 1171 * @param shift32 number of codepoints to shift 1172 * @return new shifted offset16 1173 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 1174 */ 1175 public static int moveCodePointOffset(String source, int offset16, int shift32) { 1176 int result = offset16; 1177 int size = source.length(); 1178 int count; 1179 char ch; 1180 if (offset16 < 0 || offset16 > size) { 1181 throw new StringIndexOutOfBoundsException(offset16); 1182 } 1183 if (shift32 > 0) { 1184 if (shift32 + offset16 > size) { 1185 throw new StringIndexOutOfBoundsException(offset16); 1186 } 1187 count = shift32; 1188 while (result < size && count > 0) { 1189 ch = source.charAt(result); 1190 if (isLeadSurrogate(ch) && ((result + 1) < size) 1191 && isTrailSurrogate(source.charAt(result + 1))) { 1192 result++; 1193 } 1194 count--; 1195 result++; 1196 } 1197 } else { 1198 if (offset16 + shift32 < 0) { 1199 throw new StringIndexOutOfBoundsException(offset16); 1200 } 1201 for (count = -shift32; count > 0; count--) { 1202 result--; 1203 if (result < 0) { 1204 break; 1205 } 1206 ch = source.charAt(result); 1207 if (isTrailSurrogate(ch) && result > 0 1208 && isLeadSurrogate(source.charAt(result - 1))) { 1209 result--; 1210 } 1211 } 1212 } 1213 if (count != 0) { 1214 throw new StringIndexOutOfBoundsException(shift32); 1215 } 1216 return result; 1217 } 1218 1219 /** 1220 * Shifts offset16 by the argument number of codepoints 1221 * 1222 * @param source String buffer 1223 * @param offset16 UTF16 position to shift 1224 * @param shift32 Number of codepoints to shift 1225 * @return new shifted offset16 1226 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
1227 */ 1228 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { 1229 int result = offset16; 1230 int size = source.length(); 1231 int count; 1232 char ch; 1233 if (offset16 < 0 || offset16 > size) { 1234 throw new StringIndexOutOfBoundsException(offset16); 1235 } 1236 if (shift32 > 0) { 1237 if (shift32 + offset16 > size) { 1238 throw new StringIndexOutOfBoundsException(offset16); 1239 } 1240 count = shift32; 1241 while (result < size && count > 0) { 1242 ch = source.charAt(result); 1243 if (isLeadSurrogate(ch) && ((result + 1) < size) 1244 && isTrailSurrogate(source.charAt(result + 1))) { 1245 result++; 1246 } 1247 count--; 1248 result++; 1249 } 1250 } else { 1251 if (offset16 + shift32 < 0) { 1252 throw new StringIndexOutOfBoundsException(offset16); 1253 } 1254 for (count = -shift32; count > 0; count--) { 1255 result--; 1256 if (result < 0) { 1257 break; 1258 } 1259 ch = source.charAt(result); 1260 if (isTrailSurrogate(ch) && result > 0 1261 && isLeadSurrogate(source.charAt(result - 1))) { 1262 result--; 1263 } 1264 } 1265 } 1266 if (count != 0) { 1267 throw new StringIndexOutOfBoundsException(shift32); 1268 } 1269 return result; 1270 } 1271 1272 /** 1273 * Shifts offset16 by the argument number of codepoints within a subarray. 1274 * 1275 * @param source Char array 1276 * @param start Position of the subarray to be performed on 1277 * @param limit Position of the subarray to be performed on 1278 * @param offset16 UTF16 position to shift relative to start 1279 * @param shift32 Number of codepoints to shift 1280 * @return new shifted offset16 relative to start 1281 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the 1282 * subarray bounds are out of range. 
1283 */ 1284 public static int moveCodePointOffset(char source[], int start, int limit, int offset16, 1285 int shift32) { 1286 int size = source.length; 1287 int count; 1288 char ch; 1289 int result = offset16 + start; 1290 if (start < 0 || limit < start) { 1291 throw new StringIndexOutOfBoundsException(start); 1292 } 1293 if (limit > size) { 1294 throw new StringIndexOutOfBoundsException(limit); 1295 } 1296 if (offset16 < 0 || result > limit) { 1297 throw new StringIndexOutOfBoundsException(offset16); 1298 } 1299 if (shift32 > 0) { 1300 if (shift32 + result > size) { 1301 throw new StringIndexOutOfBoundsException(result); 1302 } 1303 count = shift32; 1304 while (result < limit && count > 0) { 1305 ch = source[result]; 1306 if (isLeadSurrogate(ch) && (result + 1 < limit) 1307 && isTrailSurrogate(source[result + 1])) { 1308 result++; 1309 } 1310 count--; 1311 result++; 1312 } 1313 } else { 1314 if (result + shift32 < start) { 1315 throw new StringIndexOutOfBoundsException(result); 1316 } 1317 for (count = -shift32; count > 0; count--) { 1318 result--; 1319 if (result < start) { 1320 break; 1321 } 1322 ch = source[result]; 1323 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 1324 result--; 1325 } 1326 } 1327 } 1328 if (count != 0) { 1329 throw new StringIndexOutOfBoundsException(shift32); 1330 } 1331 result -= start; 1332 return result; 1333 } 1334 1335 /** 1336 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1337 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1338 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 1339 * otherwise. 1340 * <p> 1341 * The overall effect is exactly as if the argument were converted to a string by the method 1342 * valueOf(char) and the characters in that string were then inserted into target at the 1343 * position indicated by offset16. 
1344 * </p> 1345 * <p> 1346 * The offset argument must be greater than or equal to 0, and less than or equal to the length 1347 * of source. 1348 * 1349 * @param target String buffer to insert to 1350 * @param offset16 Offset which char32 will be inserted in 1351 * @param char32 Codepoint to be inserted 1352 * @return a reference to target 1353 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1354 */ 1355 public static StringBuffer insert(StringBuffer target, int offset16, int char32) { 1356 String str = valueOf(char32); 1357 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1358 offset16++; 1359 } 1360 target.insert(offset16, str); 1361 return target; 1362 } 1363 1364 /** 1365 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1366 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1367 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. 1368 * <p> 1369 * The overall effect is exactly as if the argument were converted to a string by the method 1370 * valueOf(char) and the characters in that string were then inserted into target at the 1371 * position indicated by offset16. 1372 * </p> 1373 * <p> 1374 * The offset argument must be greater than or equal to 0, and less than or equal to the limit. 1375 * 1376 * @param target Char array to insert to 1377 * @param limit End index of the char array, limit <= target.length 1378 * @param offset16 Offset which char32 will be inserted in 1379 * @param char32 Codepoint to be inserted 1380 * @return new limit size 1381 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1382 */ 1383 public static int insert(char target[], int limit, int offset16, int char32) { 1384 String str = valueOf(char32); 1385 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1386 offset16++; 1387 } 1388 int size = str.length(); 1389 if (limit + size > target.length) { 1390 throw new ArrayIndexOutOfBoundsException(offset16 + size); 1391 } 1392 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16); 1393 target[offset16] = str.charAt(0); 1394 if (size == 2) { 1395 target[offset16 + 1] = str.charAt(1); 1396 } 1397 return limit + size; 1398 } 1399 1400 /** 1401 * Removes the codepoint at the specified position in this target (shortening target by 1 1402 * character if the codepoint is a non-supplementary, 2 otherwise). 1403 * 1404 * @param target String buffer to remove codepoint from 1405 * @param offset16 Offset which the codepoint will be removed 1406 * @return a reference to target 1407 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1408 */ 1409 public static StringBuffer delete(StringBuffer target, int offset16) { 1410 int count = 1; 1411 switch (bounds(target, offset16)) { 1412 case LEAD_SURROGATE_BOUNDARY: 1413 count++; 1414 break; 1415 case TRAIL_SURROGATE_BOUNDARY: 1416 count++; 1417 offset16--; 1418 break; 1419 } 1420 target.delete(offset16, offset16 + count); 1421 return target; 1422 } 1423 1424 /** 1425 * Removes the codepoint at the specified position in this target (shortening target by 1 1426 * character if the codepoint is a non-supplementary, 2 otherwise). 1427 * 1428 * @param target String buffer to remove codepoint from 1429 * @param limit End index of the char array, limit <= target.length 1430 * @param offset16 Offset which the codepoint will be removed 1431 * @return a new limit size 1432 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1433 */ 1434 public static int delete(char target[], int limit, int offset16) { 1435 int count = 1; 1436 switch (bounds(target, 0, limit, offset16)) { 1437 case LEAD_SURROGATE_BOUNDARY: 1438 count++; 1439 break; 1440 case TRAIL_SURROGATE_BOUNDARY: 1441 count++; 1442 offset16--; 1443 break; 1444 } 1445 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count)); 1446 target[limit - count] = 0; 1447 return limit - count; 1448 } 1449 1450 /** 1451 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1452 * the argument codepoint. I.e., the smallest index <code>i</code> such that 1453 * <code>UTF16.charAt(source, i) == 1454 * char32</code> is true. 1455 * <p> 1456 * If no such character occurs in this string, then -1 is returned. 1457 * </p> 1458 * <p> 1459 * Examples:<br> 1460 * UTF16.indexOf("abc", 'a') returns 0<br> 1461 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1462 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1463 * </p> 1464 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1465 * characters to its fullest. 1466 * 1467 * @param source UTF16 format Unicode string that will be searched 1468 * @param char32 Codepoint to search for 1469 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1470 * -1 if the codepoint does not occur. 
1471 */ 1472 public static int indexOf(String source, int char32) { 1473 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1474 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1475 } 1476 // non-surrogate bmp 1477 if (char32 < LEAD_SURROGATE_MIN_VALUE 1478 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1479 return source.indexOf((char) char32); 1480 } 1481 // surrogate 1482 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1483 int result = source.indexOf((char) char32); 1484 if (result >= 0) { 1485 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1486 && isTrailSurrogate(source.charAt(result + 1))) { 1487 return indexOf(source, char32, result + 1); 1488 } 1489 // trail surrogate 1490 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1491 return indexOf(source, char32, result + 1); 1492 } 1493 } 1494 return result; 1495 } 1496 // supplementary 1497 String char32str = toString(char32); 1498 return source.indexOf(char32str); 1499 } 1500 1501 /** 1502 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1503 * the argument string str. This method is implemented based on codepoints, hence a "lead 1504 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1505 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1506 * character before str found at in source will not have a valid match. Vice versa for lead 1507 * surrogates that ends str. See example below. 1508 * <p> 1509 * If no such string str occurs in this source, then -1 is returned. 
1510 * </p> 1511 * <p> 1512 * Examples:<br> 1513 * UTF16.indexOf("abc", "ab") returns 0<br> 1514 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1515 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1516 * </p> 1517 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1518 * characters to its fullest. 1519 * 1520 * @param source UTF16 format Unicode string that will be searched 1521 * @param str UTF16 format Unicode string to search for 1522 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1523 * -1 if the codepoint does not occur. 1524 */ 1525 public static int indexOf(String source, String str) { 1526 int strLength = str.length(); 1527 // non-surrogate ends 1528 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1529 return source.indexOf(str); 1530 } 1531 1532 int result = source.indexOf(str); 1533 int resultEnd = result + strLength; 1534 if (result >= 0) { 1535 // check last character 1536 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1537 && isTrailSurrogate(source.charAt(resultEnd + 1))) { 1538 return indexOf(source, str, resultEnd + 1); 1539 } 1540 // check first character which is a trail surrogate 1541 if (isTrailSurrogate(str.charAt(0)) && result > 0 1542 && isLeadSurrogate(source.charAt(result - 1))) { 1543 return indexOf(source, str, resultEnd + 1); 1544 } 1545 } 1546 return result; 1547 } 1548 1549 /** 1550 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1551 * the argument codepoint. I.e., the smallest index i such that: <br> 1552 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true. 1553 * <p> 1554 * If no such character occurs in this string, then -1 is returned. 
1555 * </p> 1556 * <p> 1557 * Examples:<br> 1558 * UTF16.indexOf("abc", 'a', 1) returns -1<br> 1559 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br> 1560 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br> 1561 * </p> 1562 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1563 * characters to its fullest. 1564 * 1565 * @param source UTF16 format Unicode string that will be searched 1566 * @param char32 Codepoint to search for 1567 * @param fromIndex The index to start the search from. 1568 * @return the index of the first occurrence of the codepoint in the argument Unicode string at 1569 * or after fromIndex, or -1 if the codepoint does not occur. 1570 */ 1571 public static int indexOf(String source, int char32, int fromIndex) { 1572 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1573 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1574 } 1575 // non-surrogate bmp 1576 if (char32 < LEAD_SURROGATE_MIN_VALUE 1577 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1578 return source.indexOf((char) char32, fromIndex); 1579 } 1580 // surrogate 1581 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1582 int result = source.indexOf((char) char32, fromIndex); 1583 if (result >= 0) { 1584 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1585 && isTrailSurrogate(source.charAt(result + 1))) { 1586 return indexOf(source, char32, result + 1); 1587 } 1588 // trail surrogate 1589 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1590 return indexOf(source, char32, result + 1); 1591 } 1592 } 1593 return result; 1594 } 1595 // supplementary 1596 String char32str = toString(char32); 1597 return source.indexOf(char32str, fromIndex); 1598 } 1599 1600 /** 1601 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1602 * the argument string str. 
* This method is implemented based on codepoints, hence a "lead
     * surrogate character + trail surrogate character" is treated as one entity. Hence if the str
     * starts with trail surrogate character at index 0, an occurrence in source that is directly
     * preceded by a lead surrogate character is not a valid match. Vice versa for a lead
     * surrogate that ends str. See example below.
     * <p>
     * If no such string str occurs in this source, then -1 is returned.
     * </p>
     * <p>
     * Examples:<br>
     * UTF16.indexOf("abc", "ab", 0) returns 0<br>
     * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
     * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
     * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
     * </p>
     * Note this method is provided as support to jdk 1.3, which does not support supplementary
     * characters to its fullest.
     *
     * @param source UTF16 format Unicode string that will be searched
     * @param str UTF16 format Unicode string to search for
     * @param fromIndex The index to start the search from.
     * @return the index of the first occurrence of str in the argument Unicode string at or
     *         after fromIndex, or -1 if str does not occur.
1625 */ 1626 public static int indexOf(String source, String str, int fromIndex) { 1627 int strLength = str.length(); 1628 // non-surrogate ends 1629 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1630 return source.indexOf(str, fromIndex); 1631 } 1632 1633 int result = source.indexOf(str, fromIndex); 1634 int resultEnd = result + strLength; 1635 if (result >= 0) { 1636 // check last character 1637 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1638 && isTrailSurrogate(source.charAt(resultEnd))) { 1639 return indexOf(source, str, resultEnd + 1); 1640 } 1641 // check first character which is a trail surrogate 1642 if (isTrailSurrogate(str.charAt(0)) && result > 0 1643 && isLeadSurrogate(source.charAt(result - 1))) { 1644 return indexOf(source, str, resultEnd + 1); 1645 } 1646 } 1647 return result; 1648 } 1649 1650 /** 1651 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1652 * the argument codepoint. I.e., the index returned is the largest value i such that: 1653 * UTF16.charAt(source, i) == char32 is true. 1654 * <p> 1655 * Examples:<br> 1656 * UTF16.lastIndexOf("abc", 'a') returns 0<br> 1657 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1658 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1659 * </p> 1660 * <p> 1661 * source is searched backwards starting at the last character. 1662 * </p> 1663 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1664 * characters to its fullest. 1665 * 1666 * @param source UTF16 format Unicode string that will be searched 1667 * @param char32 Codepoint to search for 1668 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1669 * does not occur. 
1670 */ 1671 public static int lastIndexOf(String source, int char32) { 1672 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1673 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1674 } 1675 // non-surrogate bmp 1676 if (char32 < LEAD_SURROGATE_MIN_VALUE 1677 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1678 return source.lastIndexOf((char) char32); 1679 } 1680 // surrogate 1681 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1682 int result = source.lastIndexOf((char) char32); 1683 if (result >= 0) { 1684 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1685 && isTrailSurrogate(source.charAt(result + 1))) { 1686 return lastIndexOf(source, char32, result - 1); 1687 } 1688 // trail surrogate 1689 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1690 return lastIndexOf(source, char32, result - 1); 1691 } 1692 } 1693 return result; 1694 } 1695 // supplementary 1696 String char32str = toString(char32); 1697 return source.lastIndexOf(char32str); 1698 } 1699 1700 /** 1701 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1702 * the argument string str. This method is implemented based on codepoints, hence a "lead 1703 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1704 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1705 * character before str found at in source will not have a valid match. Vice versa for lead 1706 * surrogates that ends str. See example below. 1707 * <p> 1708 * Examples:<br> 1709 * UTF16.lastIndexOf("abc", "a") returns 0<br> 1710 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1711 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1712 * </p> 1713 * <p> 1714 * source is searched backwards starting at the last character. 
1715 * </p> 1716 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1717 * characters to its fullest. 1718 * 1719 * @param source UTF16 format Unicode string that will be searched 1720 * @param str UTF16 format Unicode string to search for 1721 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1722 * does not occur. 1723 */ 1724 public static int lastIndexOf(String source, String str) { 1725 int strLength = str.length(); 1726 // non-surrogate ends 1727 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1728 return source.lastIndexOf(str); 1729 } 1730 1731 int result = source.lastIndexOf(str); 1732 if (result >= 0) { 1733 // check last character 1734 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1735 && isTrailSurrogate(source.charAt(result + strLength + 1))) { 1736 return lastIndexOf(source, str, result - 1); 1737 } 1738 // check first character which is a trail surrogate 1739 if (isTrailSurrogate(str.charAt(0)) && result > 0 1740 && isLeadSurrogate(source.charAt(result - 1))) { 1741 return lastIndexOf(source, str, result - 1); 1742 } 1743 } 1744 return result; 1745 } 1746 1747 /** 1748 * <p> 1749 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1750 * the argument codepoint, where the result is less than or equals to fromIndex. 1751 * </p> 1752 * <p> 1753 * This method is implemented based on codepoints, hence a single surrogate character will not 1754 * match a supplementary character. 1755 * </p> 1756 * <p> 1757 * source is searched backwards starting at the last character starting at the specified index. 
* </p>
     * <p>
     * Examples:<br>
     * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
     * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800, 4) returns -1<br>
     * </p>
     * Note this method is provided as support to jdk 1.3, which does not support supplementary
     * characters to its fullest.
     *
     * @param source UTF16 format Unicode string that will be searched
     * @param char32 Codepoint to search for
     * @param fromIndex the index to start the search from. There is no restriction on the value of
     *            fromIndex. If it is greater than or equal to the length of this string, it has the
     *            same effect as if it were equal to one less than the length of this string: this
     *            entire string may be searched. If it is negative, it has the same effect as if it
     *            were -1: -1 is returned.
     * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
     *         does not occur.
1779 */ 1780 public static int lastIndexOf(String source, int char32, int fromIndex) { 1781 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1782 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1783 } 1784 // non-surrogate bmp 1785 if (char32 < LEAD_SURROGATE_MIN_VALUE 1786 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1787 return source.lastIndexOf((char) char32, fromIndex); 1788 } 1789 // surrogate 1790 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1791 int result = source.lastIndexOf((char) char32, fromIndex); 1792 if (result >= 0) { 1793 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1794 && isTrailSurrogate(source.charAt(result + 1))) { 1795 return lastIndexOf(source, char32, result - 1); 1796 } 1797 // trail surrogate 1798 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1799 return lastIndexOf(source, char32, result - 1); 1800 } 1801 } 1802 return result; 1803 } 1804 // supplementary 1805 String char32str = toString(char32); 1806 return source.lastIndexOf(char32str, fromIndex); 1807 } 1808 1809 /** 1810 * <p> 1811 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1812 * the argument string str, where the result is less than or equals to fromIndex. 1813 * </p> 1814 * <p> 1815 * This method is implemented based on codepoints, hence a "lead surrogate character + trail 1816 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate 1817 * character at index 0, a source with a leading a surrogate character before str found at in 1818 * source will not have a valid match. Vice versa for lead surrogates that ends str. 1819 * </p> 1820 * See example below. 
* <p>
     * Examples:<br>
     * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
     * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
     * </p>
     * <p>
     * source is searched backwards starting at fromIndex.
     * </p>
     * Note this method is provided as support to jdk 1.3, which does not support supplementary
     * characters to its fullest.
     *
     * @param source UTF16 format Unicode string that will be searched
     * @param str UTF16 format Unicode string to search for
     * @param fromIndex the index to start the search from. There is no restriction on the value of
     *            fromIndex. If it is greater than or equal to the length of this string, it has the
     *            same effect as if it were equal to one less than the length of this string: this
     *            entire string may be searched. If it is negative, it has the same effect as if it
     *            were -1: -1 is returned.
     * @return the index of the last occurrence of str in source, or -1 if str does not occur.
1844 */ 1845 public static int lastIndexOf(String source, String str, int fromIndex) { 1846 int strLength = str.length(); 1847 // non-surrogate ends 1848 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1849 return source.lastIndexOf(str, fromIndex); 1850 } 1851 1852 int result = source.lastIndexOf(str, fromIndex); 1853 if (result >= 0) { 1854 // check last character 1855 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1856 && isTrailSurrogate(source.charAt(result + strLength))) { 1857 return lastIndexOf(source, str, result - 1); 1858 } 1859 // check first character which is a trail surrogate 1860 if (isTrailSurrogate(str.charAt(0)) && result > 0 1861 && isLeadSurrogate(source.charAt(result - 1))) { 1862 return lastIndexOf(source, str, result - 1); 1863 } 1864 } 1865 return result; 1866 } 1867 1868 /** 1869 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of 1870 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 1871 * format Unicode string source, then source will be returned. Otherwise, a new String object is 1872 * created that represents a codepoint sequence identical to the codepoint sequence represented 1873 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of 1874 * newChar32. 
1875 * <p> 1876 * Examples: <br> 1877 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br> 1878 * returns "mosquito in your collar"<br> 1879 * UTF16.replace("JonL", 'q', 'x');<br> 1880 * returns "JonL" (no change)<br> 1881 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br> 1882 * returns "Supplementary character !"<br> 1883 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br> 1884 * returns "Supplementary character \ud800\udc00"<br> 1885 * </p> 1886 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1887 * characters to its fullest. 1888 * 1889 * @param source UTF16 format Unicode string which the codepoint replacements will be based on. 1890 * @param oldChar32 Non-zero old codepoint to be replaced. 1891 * @param newChar32 The new codepoint to replace oldChar32 1892 * @return new String derived from source by replacing every occurrence of oldChar32 with 1893 * newChar32, unless when no oldChar32 is found in source then source will be returned. 
1894 */ 1895 public static String replace(String source, int oldChar32, int newChar32) { 1896 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) { 1897 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint"); 1898 } 1899 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) { 1900 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint"); 1901 } 1902 1903 int index = indexOf(source, oldChar32); 1904 if (index == -1) { 1905 return source; 1906 } 1907 String newChar32Str = toString(newChar32); 1908 int oldChar32Size = 1; 1909 int newChar32Size = newChar32Str.length(); 1910 StringBuffer result = new StringBuffer(source); 1911 int resultIndex = index; 1912 1913 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) { 1914 oldChar32Size = 2; 1915 } 1916 1917 while (index != -1) { 1918 int endResultIndex = resultIndex + oldChar32Size; 1919 result.replace(resultIndex, endResultIndex, newChar32Str); 1920 int lastEndIndex = index + oldChar32Size; 1921 index = indexOf(source, oldChar32, lastEndIndex); 1922 resultIndex += newChar32Size + index - lastEndIndex; 1923 } 1924 return result.toString(); 1925 } 1926 1927 /** 1928 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr 1929 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string 1930 * source, then source will be returned. Otherwise, a new String object is created that 1931 * represents a codepoint sequence identical to the codepoint sequence represented by source, 1932 * except that every occurrence of oldStr is replaced by an occurrence of newStr. 
1933 * <p> 1934 * Examples: <br> 1935 * UTF16.replace("mesquite in your cellar", "e", "o");<br> 1936 * returns "mosquito in your collar"<br> 1937 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br> 1938 * returns "cat in your cellar"<br> 1939 * UTF16.replace("JonL", "q", "x");<br> 1940 * returns "JonL" (no change)<br> 1941 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br> 1942 * returns "Supplementary character !"<br> 1943 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br> 1944 * returns "Supplementary character \ud800\udc00"<br> 1945 * </p> 1946 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1947 * characters to its fullest. 1948 * 1949 * @param source UTF16 format Unicode string which the replacements will be based on. 1950 * @param oldStr Non-zero-length string to be replaced. 1951 * @param newStr The new string to replace oldStr 1952 * @return new String derived from source by replacing every occurrence of oldStr with newStr. 1953 * When no oldStr is found in source, then source will be returned. 1954 */ 1955 public static String replace(String source, String oldStr, String newStr) { 1956 int index = indexOf(source, oldStr); 1957 if (index == -1) { 1958 return source; 1959 } 1960 int oldStrSize = oldStr.length(); 1961 int newStrSize = newStr.length(); 1962 StringBuffer result = new StringBuffer(source); 1963 int resultIndex = index; 1964 1965 while (index != -1) { 1966 int endResultIndex = resultIndex + oldStrSize; 1967 result.replace(resultIndex, endResultIndex, newStr); 1968 int lastEndIndex = index + oldStrSize; 1969 index = indexOf(source, oldStr, lastEndIndex); 1970 resultIndex += newStrSize + index - lastEndIndex; 1971 } 1972 return result.toString(); 1973 } 1974 1975 /** 1976 * Reverses a UTF16 format Unicode string and replaces source's content with it. 
This method 1977 * will reverse surrogate characters correctly, instead of blindly reversing every character. 1978 * <p> 1979 * Examples:<br> 1980 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br> 1981 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS". 1982 * 1983 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed 1984 * @return a modified source with reversed UTF16 format Unicode string. 1985 */ 1986 public static StringBuffer reverse(StringBuffer source) { 1987 int length = source.length(); 1988 StringBuffer result = new StringBuffer(length); 1989 for (int i = length; i-- > 0;) { 1990 char ch = source.charAt(i); 1991 if (isTrailSurrogate(ch) && i > 0) { 1992 char ch2 = source.charAt(i - 1); 1993 if (isLeadSurrogate(ch2)) { 1994 result.append(ch2); 1995 result.append(ch); 1996 --i; 1997 continue; 1998 } 1999 } 2000 result.append(ch); 2001 } 2002 return result; 2003 } 2004 2005 /** 2006 * Check if the string contains more Unicode code points than a certain number. This is more 2007 * efficient than counting all code points in the entire string and comparing that number with a 2008 * threshold. This function may not need to scan the string at all if the length is within a 2009 * certain range, and never needs to count more than 'number + 1' code points. Logically 2010 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two 2011 * code units. 2012 * 2013 * @param source The input string. 2014 * @param number The number of code points in the string is compared against the 'number' 2015 * parameter. 2016 * @return boolean value for whether the string contains more Unicode code points than 'number'. 
2017 */ 2018 public static boolean hasMoreCodePointsThan(String source, int number) { 2019 if (number < 0) { 2020 return true; 2021 } 2022 if (source == null) { 2023 return false; 2024 } 2025 int length = source.length(); 2026 2027 // length >= 0 known 2028 // source contains at least (length + 1) / 2 code points: <= 2 2029 // chars per cp 2030 if (((length + 1) >> 1) > number) { 2031 return true; 2032 } 2033 2034 // check if source does not even contain enough chars 2035 int maxsupplementary = length - number; 2036 if (maxsupplementary <= 0) { 2037 return false; 2038 } 2039 2040 // there are maxsupplementary = length - number more chars than 2041 // asked-for code points 2042 2043 // count code points until they exceed and also check that there are 2044 // no more than maxsupplementary supplementary code points (char pairs) 2045 int start = 0; 2046 while (true) { 2047 if (length == 0) { 2048 return false; 2049 } 2050 if (number == 0) { 2051 return true; 2052 } 2053 if (isLeadSurrogate(source.charAt(start++)) && start != length 2054 && isTrailSurrogate(source.charAt(start))) { 2055 start++; 2056 if (--maxsupplementary <= 0) { 2057 // too many pairs - too few code points 2058 return false; 2059 } 2060 } 2061 --number; 2062 } 2063 } 2064 2065 /** 2066 * Check if the sub-range of char array, from argument start to limit, contains more Unicode 2067 * code points than a certain number. This is more efficient than counting all code points in 2068 * the entire char array range and comparing that number with a threshold. This function may not 2069 * need to scan the char array at all if start and limit is within a certain range, and never 2070 * needs to count more than 'number + 1' code points. Logically equivalent to 2071 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one 2072 * or two code units. 
2073 * 2074 * @param source Array of UTF-16 chars 2075 * @param start Offset to substring in the source array for analyzing 2076 * @param limit Offset to substring in the source array for analyzing 2077 * @param number The number of code points in the string is compared against the 'number' 2078 * parameter. 2079 * @return boolean value for whether the string contains more Unicode code points than 'number'. 2080 * @exception IndexOutOfBoundsException Thrown when limit < start 2081 */ 2082 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) { 2083 int length = limit - start; 2084 if (length < 0 || start < 0 || limit < 0) { 2085 throw new IndexOutOfBoundsException( 2086 "Start and limit indexes should be non-negative and start <= limit"); 2087 } 2088 if (number < 0) { 2089 return true; 2090 } 2091 if (source == null) { 2092 return false; 2093 } 2094 2095 // length >= 0 known 2096 // source contains at least (length + 1) / 2 code points: <= 2 2097 // chars per cp 2098 if (((length + 1) >> 1) > number) { 2099 return true; 2100 } 2101 2102 // check if source does not even contain enough chars 2103 int maxsupplementary = length - number; 2104 if (maxsupplementary <= 0) { 2105 return false; 2106 } 2107 2108 // there are maxsupplementary = length - number more chars than 2109 // asked-for code points 2110 2111 // count code points until they exceed and also check that there are 2112 // no more than maxsupplementary supplementary code points (char pairs) 2113 while (true) { 2114 if (length == 0) { 2115 return false; 2116 } 2117 if (number == 0) { 2118 return true; 2119 } 2120 if (isLeadSurrogate(source[start++]) && start != limit 2121 && isTrailSurrogate(source[start])) { 2122 start++; 2123 if (--maxsupplementary <= 0) { 2124 // too many pairs - too few code points 2125 return false; 2126 } 2127 } 2128 --number; 2129 } 2130 } 2131 2132 /** 2133 * Check if the string buffer contains more Unicode code points than a certain number. 
This is 2134 * more efficient than counting all code points in the entire string buffer and comparing that 2135 * number with a threshold. This function may not need to scan the string buffer at all if the 2136 * length is within a certain range, and never needs to count more than 'number + 1' code 2137 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may 2138 * occupy either one or two code units. 2139 * 2140 * @param source The input string buffer. 2141 * @param number The number of code points in the string buffer is compared against the 'number' 2142 * parameter. 2143 * @return boolean value for whether the string buffer contains more Unicode code points than 2144 * 'number'. 2145 */ 2146 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) { 2147 if (number < 0) { 2148 return true; 2149 } 2150 if (source == null) { 2151 return false; 2152 } 2153 int length = source.length(); 2154 2155 // length >= 0 known 2156 // source contains at least (length + 1) / 2 code points: <= 2 2157 // chars per cp 2158 if (((length + 1) >> 1) > number) { 2159 return true; 2160 } 2161 2162 // check if source does not even contain enough chars 2163 int maxsupplementary = length - number; 2164 if (maxsupplementary <= 0) { 2165 return false; 2166 } 2167 2168 // there are maxsupplementary = length - number more chars than 2169 // asked-for code points 2170 2171 // count code points until they exceed and also check that there are 2172 // no more than maxsupplementary supplementary code points (char pairs) 2173 int start = 0; 2174 while (true) { 2175 if (length == 0) { 2176 return false; 2177 } 2178 if (number == 0) { 2179 return true; 2180 } 2181 if (isLeadSurrogate(source.charAt(start++)) && start != length 2182 && isTrailSurrogate(source.charAt(start))) { 2183 start++; 2184 if (--maxsupplementary <= 0) { 2185 // too many pairs - too few code points 2186 return false; 2187 } 2188 } 2189 --number; 2190 } 2191 } 2192 2193 /** 
2194 * Cover JDK 1.5 API. Create a String from an array of codePoints. 2195 * 2196 * @param codePoints The code array 2197 * @param offset The start of the text in the code point array 2198 * @param count The number of code points 2199 * @return a String representing the code points between offset and count 2200 * @throws IllegalArgumentException If an invalid code point is encountered 2201 * @throws IndexOutOfBoundsException If the offset or count are out of bounds. 2202 */ 2203 public static String newString(int[] codePoints, int offset, int count) { 2204 if (count < 0) { 2205 throw new IllegalArgumentException(); 2206 } 2207 char[] chars = new char[count]; 2208 int w = 0; 2209 for (int r = offset, e = offset + count; r < e; ++r) { 2210 int cp = codePoints[r]; 2211 if (cp < 0 || cp > 0x10ffff) { 2212 throw new IllegalArgumentException(); 2213 } 2214 while (true) { 2215 try { 2216 if (cp < 0x010000) { 2217 chars[w] = (char) cp; 2218 w++; 2219 } else { 2220 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); 2221 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); 2222 w += 2; 2223 } 2224 break; 2225 } catch (IndexOutOfBoundsException ex) { 2226 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2) 2227 / (r - offset + 1))); 2228 char[] temp = new char[newlen]; 2229 System.arraycopy(chars, 0, temp, 0, w); 2230 chars = temp; 2231 } 2232 } 2233 } 2234 return new String(chars, 0, w); 2235 } 2236 2237 /** 2238 * <p> 2239 * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various 2240 * modes 2241 * </p> 2242 * <ul> 2243 * <li> Code point comparison or code unit comparison 2244 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison 2245 * with special handling for character 'i'. 
2246 * </ul> 2247 * <p> 2248 * The code unit or code point comparison differ only when comparing supplementary code points 2249 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e., 2250 * \ue000..\uffff). In code unit comparison, high BMP code points sort after 2251 * supplementary code points because they are stored as pairs of surrogates which are at 2252 * \ud800..\udfff. 2253 * </p> 2254 * 2255 * @see #FOLD_CASE_DEFAULT 2256 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2257 */ 2258 public static final class StringComparator implements java.util.Comparator<String> { 2259 // public constructor ------------------------------------------------ 2260 2261 /** 2262 * Default constructor that does code unit comparison and case sensitive comparison. 2263 */ 2264 public StringComparator() { 2265 this(false, false, FOLD_CASE_DEFAULT); 2266 } 2267 2268 /** 2269 * Constructor that does comparison based on the argument options. 2270 * 2271 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit 2272 * comparison. 2273 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison 2274 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2275 * when ignorecase is set to true. If ignorecase is false, this option is 2276 * ignored. 
2277 * @see #FOLD_CASE_DEFAULT 2278 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2279 * @throws IllegalArgumentException If foldcaseoption is out of range 2280 */ 2281 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) { 2282 setCodePointCompare(codepointcompare); 2283 m_ignoreCase_ = ignorecase; 2284 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2285 throw new IllegalArgumentException("Invalid fold case option"); 2286 } 2287 m_foldCase_ = foldcaseoption; 2288 } 2289 2290 // public data member ------------------------------------------------ 2291 2292 /** 2293 * Option value for case folding comparison: 2294 * 2295 * <p>Comparison is case insensitive, strings are folded using default mappings defined in 2296 * Unicode data file CaseFolding.txt, before comparison. 2297 */ 2298 public static final int FOLD_CASE_DEFAULT = 0; 2299 2300 /** 2301 * Option value for case folding: 2302 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 2303 * and dotless i appropriately for Turkic languages (tr, az). 2304 * 2305 * <p>Comparison is case insensitive, strings are folded using modified mappings defined in 2306 * Unicode data file CaseFolding.txt, before comparison. 2307 * 2308 * @see android.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 2309 */ 2310 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1; 2311 2312 // public methods ---------------------------------------------------- 2313 2314 // public setters ---------------------------------------------------- 2315 2316 /** 2317 * Sets the comparison mode to code point compare if flag is true. 
Otherwise comparison mode 2318 * is set to code unit compare 2319 * 2320 * @param flag True for code point compare, false for code unit compare 2321 */ 2322 public void setCodePointCompare(boolean flag) { 2323 if (flag) { 2324 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER; 2325 } else { 2326 m_codePointCompare_ = 0; 2327 } 2328 } 2329 2330 /** 2331 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise 2332 * case sensitive comparison mode if set to false. 2333 * 2334 * @param ignorecase True for case-insitive comparison, false for case sensitive comparison 2335 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2336 * when ignorecase is set to true. If ignorecase is false, this option is 2337 * ignored. 2338 * @see #FOLD_CASE_DEFAULT 2339 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2340 */ 2341 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) { 2342 m_ignoreCase_ = ignorecase; 2343 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2344 throw new IllegalArgumentException("Invalid fold case option"); 2345 } 2346 m_foldCase_ = foldcaseoption; 2347 } 2348 2349 // public getters ---------------------------------------------------- 2350 2351 /** 2352 * Checks if the comparison mode is code point compare. 2353 * 2354 * @return true for code point compare, false for code unit compare 2355 */ 2356 public boolean getCodePointCompare() { 2357 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2358 } 2359 2360 /** 2361 * Checks if Comparator is in the case insensitive mode. 2362 * 2363 * @return true if Comparator performs case insensitive comparison, false otherwise 2364 */ 2365 public boolean getIgnoreCase() { 2366 return m_ignoreCase_; 2367 } 2368 2369 /** 2370 * Gets the fold case options set in Comparator to be used with case insensitive comparison. 
2371 * 2372 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I 2373 * @see #FOLD_CASE_DEFAULT 2374 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2375 */ 2376 public int getIgnoreCaseOption() { 2377 return m_foldCase_; 2378 } 2379 2380 // public other methods ---------------------------------------------- 2381 2382 /** 2383 * Compare two strings depending on the options selected during construction. 2384 * 2385 * @param a first source string. 2386 * @param b second source string. 2387 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b, 2388 * a positive value is returned. 2389 * @exception ClassCastException thrown when either a or b is not a String object 2390 */ 2391 public int compare(String a, String b) { 2392 if (a == b) { 2393 return 0; 2394 } 2395 if (a == null) { 2396 return -1; 2397 } 2398 if (b == null) { 2399 return 1; 2400 } 2401 2402 if (m_ignoreCase_) { 2403 return compareCaseInsensitive(a, b); 2404 } 2405 return compareCaseSensitive(a, b); 2406 } 2407 2408 // private data member ---------------------------------------------- 2409 2410 /** 2411 * Code unit comparison flag. True if code unit comparison is required. False if code point 2412 * comparison is required. 2413 */ 2414 private int m_codePointCompare_; 2415 2416 /** 2417 * Fold case comparison option. 2418 */ 2419 private int m_foldCase_; 2420 2421 /** 2422 * Flag indicator if ignore case is to be used during comparison 2423 */ 2424 private boolean m_ignoreCase_; 2425 2426 /** 2427 * Code point order offset for surrogate characters 2428 */ 2429 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800; 2430 2431 // private method --------------------------------------------------- 2432 2433 /** 2434 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life 2435 * easier. 
2436 * 2437 * @param s1 2438 * first string to compare 2439 * @param s2 2440 * second string to compare 2441 * @return -1 is s1 < s2, 0 if equals, 2442 */ 2443 private int compareCaseInsensitive(String s1, String s2) { 2444 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_ 2445 | Normalizer.COMPARE_IGNORE_CASE); 2446 } 2447 2448 /** 2449 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life 2450 * easier. 2451 * 2452 * @param s1 2453 * first string to compare 2454 * @param s2 2455 * second string to compare 2456 * @return -1 is s1 < s2, 0 if equals, 2457 */ 2458 private int compareCaseSensitive(String s1, String s2) { 2459 // compare identical prefixes - they do not need to be fixed up 2460 // limit1 = start1 + min(lenght1, length2) 2461 int length1 = s1.length(); 2462 int length2 = s2.length(); 2463 int minlength = length1; 2464 int result = 0; 2465 if (length1 < length2) { 2466 result = -1; 2467 } else if (length1 > length2) { 2468 result = 1; 2469 minlength = length2; 2470 } 2471 2472 char c1 = 0; 2473 char c2 = 0; 2474 int index = 0; 2475 for (; index < minlength; index++) { 2476 c1 = s1.charAt(index); 2477 c2 = s2.charAt(index); 2478 // check pseudo-limit 2479 if (c1 != c2) { 2480 break; 2481 } 2482 } 2483 2484 if (index == minlength) { 2485 return result; 2486 } 2487 2488 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2489 // if both values are in or above the surrogate range, fix them up 2490 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE 2491 && codepointcompare) { 2492 // subtract 0x2800 from BMP code points to make them smaller 2493 // than supplementary ones 2494 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1))) 2495 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) { 2496 // part of a surrogate pair, leave >=d800 2497 } else { 2498 // BMP code point - 
may be surrogate code point - make 2499 // < d800 2500 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2501 } 2502 2503 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1))) 2504 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) { 2505 // part of a surrogate pair, leave >=d800 2506 } else { 2507 // BMP code point - may be surrogate code point - make <d800 2508 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2509 } 2510 } 2511 2512 // now c1 and c2 are in UTF-32-compatible order 2513 return c1 - c2; 2514 } 2515 } 2516 2517 /** 2518 * Utility for getting a code point from a CharSequence that contains exactly one code point. 2519 * @return the code point IF the string is non-null and consists of a single code point. 2520 * otherwise returns -1. 2521 * @param s to test 2522 */ 2523 public static int getSingleCodePoint(CharSequence s) { 2524 if (s == null || s.length() == 0) { 2525 return -1; 2526 } else if (s.length() == 1) { 2527 return s.charAt(0); 2528 } else if (s.length() > 2) { 2529 return -1; 2530 } 2531 2532 // at this point, len = 2 2533 int cp = Character.codePointAt(s, 0); 2534 if (cp > 0xFFFF) { // is surrogate pair 2535 return cp; 2536 } 2537 return -1; 2538 } 2539 2540 /** 2541 * Utility for comparing a code point to a string without having to create a new string. Returns the same results 2542 * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if 2543 * <pre> 2544 * sc = new StringComparator(true,false,0); 2545 * fast = UTF16.compareCodePoint(codePoint, charSequence) 2546 * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString()) 2547 * </pre> 2548 * then 2549 * <pre> 2550 * Integer.signum(fast) == Integer.signum(slower) 2551 * </pre> 2552 * @param codePoint to test 2553 * @param s to test 2554 * @return equivalent of code point comparator comparing two strings. 
2555 */ 2556 public static int compareCodePoint(int codePoint, CharSequence s) { 2557 if (s == null) { 2558 return 1; 2559 } 2560 final int strLen = s.length(); 2561 if (strLen == 0) { 2562 return 1; 2563 } 2564 int second = Character.codePointAt(s, 0); 2565 int diff = codePoint - second; 2566 if (diff != 0) { 2567 return diff; 2568 } 2569 return strLen == Character.charCount(codePoint) ? 0 : -1; 2570 } 2571 2572 // private data members ------------------------------------------------- 2573 2574 /** 2575 * Shift value for lead surrogate to form a supplementary character. 2576 */ 2577 private static final int LEAD_SURROGATE_SHIFT_ = 10; 2578 2579 /** 2580 * Mask to retrieve the significant value from a trail surrogate. 2581 */ 2582 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 2583 2584 /** 2585 * Value that all lead surrogate starts with 2586 */ 2587 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE 2588 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); 2589 2590 // private methods ------------------------------------------------------ 2591 2592 /** 2593 * <p> 2594 * Converts argument code point and returns a String object representing the code point's value 2595 * in UTF16 format. 2596 * </p> 2597 * <p> 2598 * This method does not check for the validity of the codepoint, the results are not guaranteed 2599 * if a invalid codepoint is passed as argument. 2600 * </p> 2601 * <p> 2602 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. 2603 * </p> 2604 * 2605 * @param ch 2606 * code point 2607 * @return string representation of the code point 2608 */ 2609 private static String toString(int ch) { 2610 if (ch < SUPPLEMENTARY_MIN_VALUE) { 2611 return String.valueOf((char) ch); 2612 } 2613 2614 StringBuilder result = new StringBuilder(); 2615 result.append(getLeadSurrogate(ch)); 2616 result.append(getTrailSurrogate(ch)); 2617 return result.toString(); 2618 } 2619} 2620// eof 2621