1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 ******************************************************************************* 6 * Copyright (C) 1996-2015, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10package android.icu.impl; 11 12import java.io.IOException; 13import java.util.ArrayList; 14import java.util.Locale; 15import java.util.regex.Pattern; 16 17import android.icu.lang.UCharacter; 18import android.icu.text.Replaceable; 19import android.icu.text.UTF16; 20import android.icu.text.UnicodeMatcher; 21 22/** 23 * @hide Only a subset of ICU is exposed in Android 24 */ 25public final class Utility { 26 27 private static final char APOSTROPHE = '\''; 28 private static final char BACKSLASH = '\\'; 29 private static final int MAGIC_UNSIGNED = 0x80000000; 30 31 /** 32 * Convenience utility to compare two Object[]s. 
33 * Ought to be in System 34 */ 35 public final static boolean arrayEquals(Object[] source, Object target) { 36 if (source == null) return (target == null); 37 if (!(target instanceof Object[])) return false; 38 Object[] targ = (Object[]) target; 39 return (source.length == targ.length 40 && arrayRegionMatches(source, 0, targ, 0, source.length)); 41 } 42 43 /** 44 * Convenience utility to compare two int[]s 45 * Ought to be in System 46 */ 47 public final static boolean arrayEquals(int[] source, Object target) { 48 if (source == null) return (target == null); 49 if (!(target instanceof int[])) return false; 50 int[] targ = (int[]) target; 51 return (source.length == targ.length 52 && arrayRegionMatches(source, 0, targ, 0, source.length)); 53 } 54 55 /** 56 * Convenience utility to compare two double[]s 57 * Ought to be in System 58 */ 59 public final static boolean arrayEquals(double[] source, Object target) { 60 if (source == null) return (target == null); 61 if (!(target instanceof double[])) return false; 62 double[] targ = (double[]) target; 63 return (source.length == targ.length 64 && arrayRegionMatches(source, 0, targ, 0, source.length)); 65 } 66 public final static boolean arrayEquals(byte[] source, Object target) { 67 if (source == null) return (target == null); 68 if (!(target instanceof byte[])) return false; 69 byte[] targ = (byte[]) target; 70 return (source.length == targ.length 71 && arrayRegionMatches(source, 0, targ, 0, source.length)); 72 } 73 74 /** 75 * Convenience utility to compare two Object[]s 76 * Ought to be in System 77 */ 78 public final static boolean arrayEquals(Object source, Object target) { 79 if (source == null) return (target == null); 80 // for some reason, the correct arrayEquals is not being called 81 // so do it by hand for now. 
82 if (source instanceof Object[]) 83 return(arrayEquals((Object[]) source,target)); 84 if (source instanceof int[]) 85 return(arrayEquals((int[]) source,target)); 86 if (source instanceof double[]) 87 return(arrayEquals((double[]) source, target)); 88 if (source instanceof byte[]) 89 return(arrayEquals((byte[]) source,target)); 90 return source.equals(target); 91 } 92 93 /** 94 * Convenience utility to compare two Object[]s 95 * Ought to be in System. 96 * @param len the length to compare. 97 * The start indices and start+len must be valid. 98 */ 99 public final static boolean arrayRegionMatches(Object[] source, int sourceStart, 100 Object[] target, int targetStart, 101 int len) 102 { 103 int sourceEnd = sourceStart + len; 104 int delta = targetStart - sourceStart; 105 for (int i = sourceStart; i < sourceEnd; i++) { 106 if (!arrayEquals(source[i],target[i + delta])) 107 return false; 108 } 109 return true; 110 } 111 112 /** 113 * Convenience utility to compare two Object[]s 114 * Ought to be in System. 115 * @param len the length to compare. 116 * The start indices and start+len must be valid. 117 */ 118 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 119 char[] target, int targetStart, 120 int len) 121 { 122 int sourceEnd = sourceStart + len; 123 int delta = targetStart - sourceStart; 124 for (int i = sourceStart; i < sourceEnd; i++) { 125 if (source[i]!=target[i + delta]) 126 return false; 127 } 128 return true; 129 } 130 131 /** 132 * Convenience utility to compare two int[]s. 133 * @param len the length to compare. 134 * The start indices and start+len must be valid. 
135 * Ought to be in System 136 */ 137 public final static boolean arrayRegionMatches(int[] source, int sourceStart, 138 int[] target, int targetStart, 139 int len) 140 { 141 int sourceEnd = sourceStart + len; 142 int delta = targetStart - sourceStart; 143 for (int i = sourceStart; i < sourceEnd; i++) { 144 if (source[i] != target[i + delta]) 145 return false; 146 } 147 return true; 148 } 149 150 /** 151 * Convenience utility to compare two arrays of doubles. 152 * @param len the length to compare. 153 * The start indices and start+len must be valid. 154 * Ought to be in System 155 */ 156 public final static boolean arrayRegionMatches(double[] source, int sourceStart, 157 double[] target, int targetStart, 158 int len) 159 { 160 int sourceEnd = sourceStart + len; 161 int delta = targetStart - sourceStart; 162 for (int i = sourceStart; i < sourceEnd; i++) { 163 if (source[i] != target[i + delta]) 164 return false; 165 } 166 return true; 167 } 168 public final static boolean arrayRegionMatches(byte[] source, int sourceStart, 169 byte[] target, int targetStart, int len){ 170 int sourceEnd = sourceStart + len; 171 int delta = targetStart - sourceStart; 172 for (int i = sourceStart; i < sourceEnd; i++) { 173 if (source[i] != target[i + delta]) 174 return false; 175 } 176 return true; 177 } 178 179 /** 180 * Trivial reference equality. 181 * This method should help document that we really want == not equals(), 182 * and to have a single place to suppress warnings from static analysis tools. 183 */ 184 public static final boolean sameObjects(Object a, Object b) { 185 return a == b; 186 } 187 188 /** 189 * Convenience utility. Does null checks on objects, then calls equals. 190 */ 191 public final static boolean objectEquals(Object a, Object b) { 192 return a == null ? 193 b == null ? true : false : 194 b == null ? false : a.equals(b); 195 } 196 197 /** 198 * Convenience utility. Does null checks on objects, then calls compare. 
199 */ 200 public static <T extends Comparable<T>> int checkCompare(T a, T b) { 201 return a == null ? 202 b == null ? 0 : -1 : 203 b == null ? 1 : a.compareTo(b); 204 } 205 206 /** 207 * Convenience utility. Does null checks on object, then calls hashCode. 208 */ 209 public static int checkHash(Object a) { 210 return a == null ? 0 : a.hashCode(); 211 } 212 213 /** 214 * The ESCAPE character is used during run-length encoding. It signals 215 * a run of identical chars. 216 */ 217 private static final char ESCAPE = '\uA5A5'; 218 219 /** 220 * The ESCAPE_BYTE character is used during run-length encoding. It signals 221 * a run of identical bytes. 222 */ 223 static final byte ESCAPE_BYTE = (byte)0xA5; 224 225 /** 226 * Construct a string representing an int array. Use run-length encoding. 227 * A character represents itself, unless it is the ESCAPE character. Then 228 * the following notations are possible: 229 * ESCAPE ESCAPE ESCAPE literal 230 * ESCAPE n c n instances of character c 231 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 232 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 233 * If we encounter a run where n == ESCAPE, we represent this as: 234 * c ESCAPE n-1 c 235 * The ESCAPE value is chosen so as not to collide with commonly 236 * seen values. 237 */ 238 static public final String arrayToRLEString(int[] a) { 239 StringBuilder buffer = new StringBuilder(); 240 241 appendInt(buffer, a.length); 242 int runValue = a[0]; 243 int runLength = 1; 244 for (int i=1; i<a.length; ++i) { 245 int s = a[i]; 246 if (s == runValue && runLength < 0xFFFF) { 247 ++runLength; 248 } else { 249 encodeRun(buffer, runValue, runLength); 250 runValue = s; 251 runLength = 1; 252 } 253 } 254 encodeRun(buffer, runValue, runLength); 255 return buffer.toString(); 256 } 257 258 /** 259 * Construct a string representing a short array. Use run-length encoding. 260 * A character represents itself, unless it is the ESCAPE character. 
Then 261 * the following notations are possible: 262 * ESCAPE ESCAPE ESCAPE literal 263 * ESCAPE n c n instances of character c 264 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 265 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 266 * If we encounter a run where n == ESCAPE, we represent this as: 267 * c ESCAPE n-1 c 268 * The ESCAPE value is chosen so as not to collide with commonly 269 * seen values. 270 */ 271 static public final String arrayToRLEString(short[] a) { 272 StringBuilder buffer = new StringBuilder(); 273 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]); 274 buffer.append((char) (a.length >> 16)); 275 buffer.append((char) a.length); 276 short runValue = a[0]; 277 int runLength = 1; 278 for (int i=1; i<a.length; ++i) { 279 short s = a[i]; 280 if (s == runValue && runLength < 0xFFFF) ++runLength; 281 else { 282 encodeRun(buffer, runValue, runLength); 283 runValue = s; 284 runLength = 1; 285 } 286 } 287 encodeRun(buffer, runValue, runLength); 288 return buffer.toString(); 289 } 290 291 /** 292 * Construct a string representing a char array. Use run-length encoding. 293 * A character represents itself, unless it is the ESCAPE character. Then 294 * the following notations are possible: 295 * ESCAPE ESCAPE ESCAPE literal 296 * ESCAPE n c n instances of character c 297 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 298 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 299 * If we encounter a run where n == ESCAPE, we represent this as: 300 * c ESCAPE n-1 c 301 * The ESCAPE value is chosen so as not to collide with commonly 302 * seen values. 
303 */ 304 static public final String arrayToRLEString(char[] a) { 305 StringBuilder buffer = new StringBuilder(); 306 buffer.append((char) (a.length >> 16)); 307 buffer.append((char) a.length); 308 char runValue = a[0]; 309 int runLength = 1; 310 for (int i=1; i<a.length; ++i) { 311 char s = a[i]; 312 if (s == runValue && runLength < 0xFFFF) ++runLength; 313 else { 314 encodeRun(buffer, (short)runValue, runLength); 315 runValue = s; 316 runLength = 1; 317 } 318 } 319 encodeRun(buffer, (short)runValue, runLength); 320 return buffer.toString(); 321 } 322 323 /** 324 * Construct a string representing a byte array. Use run-length encoding. 325 * Two bytes are packed into a single char, with a single extra zero byte at 326 * the end if needed. A byte represents itself, unless it is the 327 * ESCAPE_BYTE. Then the following notations are possible: 328 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal 329 * ESCAPE_BYTE n b n instances of byte b 330 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or 331 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. 332 * If we encounter a run where n == ESCAPE_BYTE, we represent this as: 333 * b ESCAPE_BYTE n-1 b 334 * The ESCAPE_BYTE value is chosen so as not to collide with commonly 335 * seen values. 336 */ 337 static public final String arrayToRLEString(byte[] a) { 338 StringBuilder buffer = new StringBuilder(); 339 buffer.append((char) (a.length >> 16)); 340 buffer.append((char) a.length); 341 byte runValue = a[0]; 342 int runLength = 1; 343 byte[] state = new byte[2]; 344 for (int i=1; i<a.length; ++i) { 345 byte b = a[i]; 346 if (b == runValue && runLength < 0xFF) ++runLength; 347 else { 348 encodeRun(buffer, runValue, runLength, state); 349 runValue = b; 350 runLength = 1; 351 } 352 } 353 encodeRun(buffer, runValue, runLength, state); 354 355 // We must save the final byte, if there is one, by padding 356 // an extra zero. 
357 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state); 358 359 return buffer.toString(); 360 } 361 362 /** 363 * Encode a run, possibly a degenerate run (of < 4 values). 364 * @param length The length of the run; must be > 0 && <= 0xFFFF. 365 */ 366 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) { 367 if (length < 4) { 368 for (int j=0; j<length; ++j) { 369 if (value == ESCAPE) { 370 appendInt(buffer, value); 371 } 372 appendInt(buffer, value); 373 } 374 } 375 else { 376 if (length == ESCAPE) { 377 if (value == ESCAPE) { 378 appendInt(buffer, ESCAPE); 379 } 380 appendInt(buffer, value); 381 --length; 382 } 383 appendInt(buffer, ESCAPE); 384 appendInt(buffer, length); 385 appendInt(buffer, value); // Don't need to escape this value 386 } 387 } 388 389 private static final <T extends Appendable> void appendInt(T buffer, int value) { 390 try { 391 buffer.append((char)(value >>> 16)); 392 buffer.append((char)(value & 0xFFFF)); 393 } catch (IOException e) { 394 throw new IllegalIcuArgumentException(e); 395 } 396 } 397 398 /** 399 * Encode a run, possibly a degenerate run (of < 4 values). 400 * @param length The length of the run; must be > 0 && <= 0xFFFF. 401 */ 402 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) { 403 try { 404 char valueChar = (char) value; 405 if (length < 4) { 406 for (int j=0; j<length; ++j) { 407 if (valueChar == ESCAPE) { 408 buffer.append(ESCAPE); 409 } 410 buffer.append(valueChar); 411 } 412 } 413 else { 414 if (length == ESCAPE) { 415 if (valueChar == ESCAPE) { 416 buffer.append(ESCAPE); 417 } 418 buffer.append(valueChar); 419 --length; 420 } 421 buffer.append(ESCAPE); 422 buffer.append((char) length); 423 buffer.append(valueChar); // Don't need to escape this value 424 } 425 } catch (IOException e) { 426 throw new IllegalIcuArgumentException(e); 427 } 428 } 429 430 /** 431 * Encode a run, possibly a degenerate run (of < 4 values). 
432 * @param length The length of the run; must be > 0 && <= 0xFF. 433 */ 434 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length, 435 byte[] state) { 436 if (length < 4) { 437 for (int j=0; j<length; ++j) { 438 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 439 appendEncodedByte(buffer, value, state); 440 } 441 } 442 else { 443 if ((byte)length == ESCAPE_BYTE) { 444 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 445 appendEncodedByte(buffer, value, state); 446 --length; 447 } 448 appendEncodedByte(buffer, ESCAPE_BYTE, state); 449 appendEncodedByte(buffer, (byte)length, state); 450 appendEncodedByte(buffer, value, state); // Don't need to escape this value 451 } 452 } 453 454 /** 455 * Append a byte to the given Appendable, packing two bytes into each 456 * character. The state parameter maintains intermediary data between 457 * calls. 458 * @param state A two-element array, with state[0] == 0 if this is the 459 * first byte of a pair, or state[0] != 0 if this is the second byte 460 * of a pair, in which case state[1] is the first byte. 461 */ 462 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value, 463 byte[] state) { 464 try { 465 if (state[0] != 0) { 466 char c = (char) ((state[1] << 8) | ((value) & 0xFF)); 467 buffer.append(c); 468 state[0] = 0; 469 } 470 else { 471 state[0] = 1; 472 state[1] = value; 473 } 474 } catch (IOException e) { 475 throw new IllegalIcuArgumentException(e); 476 } 477 } 478 479 /** 480 * Construct an array of ints from a run-length encoded string. 
481 */ 482 static public final int[] RLEStringToIntArray(String s) { 483 int length = getInt(s, 0); 484 int[] array = new int[length]; 485 int ai = 0, i = 1; 486 487 int maxI = s.length() / 2; 488 while (ai < length && i < maxI) { 489 int c = getInt(s, i++); 490 491 if (c == ESCAPE) { 492 c = getInt(s, i++); 493 if (c == ESCAPE) { 494 array[ai++] = c; 495 } else { 496 int runLength = c; 497 int runValue = getInt(s, i++); 498 for (int j=0; j<runLength; ++j) { 499 array[ai++] = runValue; 500 } 501 } 502 } 503 else { 504 array[ai++] = c; 505 } 506 } 507 508 if (ai != length || i != maxI) { 509 throw new IllegalStateException("Bad run-length encoded int array"); 510 } 511 512 return array; 513 } 514 static final int getInt(String s, int i) { 515 return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1); 516 } 517 518 /** 519 * Construct an array of shorts from a run-length encoded string. 520 */ 521 static public final short[] RLEStringToShortArray(String s) { 522 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 523 short[] array = new short[length]; 524 int ai = 0; 525 for (int i=2; i<s.length(); ++i) { 526 char c = s.charAt(i); 527 if (c == ESCAPE) { 528 c = s.charAt(++i); 529 if (c == ESCAPE) { 530 array[ai++] = (short) c; 531 } else { 532 int runLength = c; 533 short runValue = (short) s.charAt(++i); 534 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 535 } 536 } 537 else { 538 array[ai++] = (short) c; 539 } 540 } 541 542 if (ai != length) 543 throw new IllegalStateException("Bad run-length encoded short array"); 544 545 return array; 546 } 547 548 /** 549 * Construct an array of shorts from a run-length encoded string. 
550 */ 551 static public final char[] RLEStringToCharArray(String s) { 552 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 553 char[] array = new char[length]; 554 int ai = 0; 555 for (int i=2; i<s.length(); ++i) { 556 char c = s.charAt(i); 557 if (c == ESCAPE) { 558 c = s.charAt(++i); 559 if (c == ESCAPE) { 560 array[ai++] = c; 561 } else { 562 int runLength = c; 563 char runValue = s.charAt(++i); 564 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 565 } 566 } 567 else { 568 array[ai++] = c; 569 } 570 } 571 572 if (ai != length) 573 throw new IllegalStateException("Bad run-length encoded short array"); 574 575 return array; 576 } 577 578 /** 579 * Construct an array of bytes from a run-length encoded string. 580 */ 581 static public final byte[] RLEStringToByteArray(String s) { 582 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 583 byte[] array = new byte[length]; 584 boolean nextChar = true; 585 char c = 0; 586 int node = 0; 587 int runLength = 0; 588 int i = 2; 589 for (int ai=0; ai<length; ) { 590 // This part of the loop places the next byte into the local 591 // variable 'b' each time through the loop. It keeps the 592 // current character in 'c' and uses the boolean 'nextChar' 593 // to see if we've taken both bytes out of 'c' yet. 594 byte b; 595 if (nextChar) { 596 c = s.charAt(i++); 597 b = (byte) (c >> 8); 598 nextChar = false; 599 } 600 else { 601 b = (byte) (c & 0xFF); 602 nextChar = true; 603 } 604 605 // This part of the loop is a tiny state machine which handles 606 // the parsing of the run-length encoding. This would be simpler 607 // if we could look ahead, but we can't, so we use 'node' to 608 // move between three nodes in the state machine. 609 switch (node) { 610 case 0: 611 // Normal idle node 612 if (b == ESCAPE_BYTE) { 613 node = 1; 614 } 615 else { 616 array[ai++] = b; 617 } 618 break; 619 case 1: 620 // We have seen one ESCAPE_BYTE; we expect either a second 621 // one, or a run length and value. 
622 if (b == ESCAPE_BYTE) { 623 array[ai++] = ESCAPE_BYTE; 624 node = 0; 625 } 626 else { 627 runLength = b; 628 // Interpret signed byte as unsigned 629 if (runLength < 0) runLength += 0x100; 630 node = 2; 631 } 632 break; 633 case 2: 634 // We have seen an ESCAPE_BYTE and length byte. We interpret 635 // the next byte as the value to be repeated. 636 for (int j=0; j<runLength; ++j) array[ai++] = b; 637 node = 0; 638 break; 639 } 640 } 641 642 if (node != 0) 643 throw new IllegalStateException("Bad run-length encoded byte array"); 644 645 if (i != s.length()) 646 throw new IllegalStateException("Excess data in RLE byte array string"); 647 648 return array; 649 } 650 651 static public String LINE_SEPARATOR = System.getProperty("line.separator"); 652 653 /** 654 * Format a String for representation in a source file. This includes 655 * breaking it into lines and escaping characters using octal notation 656 * when necessary (control characters and double quotes). 657 */ 658 static public final String formatForSource(String s) { 659 StringBuilder buffer = new StringBuilder(); 660 for (int i=0; i<s.length();) { 661 if (i > 0) buffer.append('+').append(LINE_SEPARATOR); 662 buffer.append(" \""); 663 int count = 11; 664 while (i<s.length() && count<80) { 665 char c = s.charAt(i++); 666 if (c < '\u0020' || c == '"' || c == '\\') { 667 if (c == '\n') { 668 buffer.append("\\n"); 669 count += 2; 670 } else if (c == '\t') { 671 buffer.append("\\t"); 672 count += 2; 673 } else if (c == '\r') { 674 buffer.append("\\r"); 675 count += 2; 676 } else { 677 // Represent control characters, backslash and double quote 678 // using octal notation; otherwise the string we form 679 // won't compile, since Unicode escape sequences are 680 // processed before tokenization. 
681 buffer.append('\\'); 682 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 683 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 684 buffer.append(HEX_DIGIT[(c & 0007)]); 685 count += 4; 686 } 687 } 688 else if (c <= '\u007E') { 689 buffer.append(c); 690 count += 1; 691 } 692 else { 693 buffer.append("\\u"); 694 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 695 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 696 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 697 buffer.append(HEX_DIGIT[(c & 0x000F)]); 698 count += 6; 699 } 700 } 701 buffer.append('"'); 702 } 703 return buffer.toString(); 704 } 705 706 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', 707 '8','9','A','B','C','D','E','F'}; 708 709 /** 710 * Format a String for representation in a source file. Like 711 * formatForSource but does not do line breaking. 712 */ 713 static public final String format1ForSource(String s) { 714 StringBuilder buffer = new StringBuilder(); 715 buffer.append("\""); 716 for (int i=0; i<s.length();) { 717 char c = s.charAt(i++); 718 if (c < '\u0020' || c == '"' || c == '\\') { 719 if (c == '\n') { 720 buffer.append("\\n"); 721 } else if (c == '\t') { 722 buffer.append("\\t"); 723 } else if (c == '\r') { 724 buffer.append("\\r"); 725 } else { 726 // Represent control characters, backslash and double quote 727 // using octal notation; otherwise the string we form 728 // won't compile, since Unicode escape sequences are 729 // processed before tokenization. 
730 buffer.append('\\'); 731 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 732 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 733 buffer.append(HEX_DIGIT[(c & 0007)]); 734 } 735 } 736 else if (c <= '\u007E') { 737 buffer.append(c); 738 } 739 else { 740 buffer.append("\\u"); 741 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 742 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 743 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 744 buffer.append(HEX_DIGIT[(c & 0x000F)]); 745 } 746 } 747 buffer.append('"'); 748 return buffer.toString(); 749 } 750 751 /** 752 * Convert characters outside the range U+0020 to U+007F to 753 * Unicode escapes, and convert backslash to a double backslash. 754 */ 755 public static final String escape(String s) { 756 StringBuilder buf = new StringBuilder(); 757 for (int i=0; i<s.length(); ) { 758 int c = Character.codePointAt(s, i); 759 i += UTF16.getCharCount(c); 760 if (c >= ' ' && c <= 0x007F) { 761 if (c == '\\') { 762 buf.append("\\\\"); // That is, "\\" 763 } else { 764 buf.append((char)c); 765 } 766 } else { 767 boolean four = c <= 0xFFFF; 768 buf.append(four ? "\\u" : "\\U"); 769 buf.append(hex(c, four ? 4 : 8)); 770 } 771 } 772 return buf.toString(); 773 } 774 775 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 776 static private final char[] UNESCAPE_MAP = { 777 /*" 0x22, 0x22 */ 778 /*' 0x27, 0x27 */ 779 /*? 0x3F, 0x3F */ 780 /*\ 0x5C, 0x5C */ 781 /*a*/ 0x61, 0x07, 782 /*b*/ 0x62, 0x08, 783 /*e*/ 0x65, 0x1b, 784 /*f*/ 0x66, 0x0c, 785 /*n*/ 0x6E, 0x0a, 786 /*r*/ 0x72, 0x0d, 787 /*t*/ 0x74, 0x09, 788 /*v*/ 0x76, 0x0b 789 }; 790 791 /** 792 * Convert an escape to a 32-bit code point value. We attempt 793 * to parallel the icu4c unescapeAt() function. 794 * @param offset16 an array containing offset to the character 795 * <em>after</em> the backslash. Upon return offset16[0] will 796 * be updated to point after the escape sequence. 797 * @return character value from 0 to 10FFFF, or -1 on error. 
798 */ 799 public static int unescapeAt(String s, int[] offset16) { 800 int c; 801 int result = 0; 802 int n = 0; 803 int minDig = 0; 804 int maxDig = 0; 805 int bitsPerDigit = 4; 806 int dig; 807 int i; 808 boolean braces = false; 809 810 /* Check that offset is in range */ 811 int offset = offset16[0]; 812 int length = s.length(); 813 if (offset < 0 || offset >= length) { 814 return -1; 815 } 816 817 /* Fetch first UChar after '\\' */ 818 c = Character.codePointAt(s, offset); 819 offset += UTF16.getCharCount(c); 820 821 /* Convert hexadecimal and octal escapes */ 822 switch (c) { 823 case 'u': 824 minDig = maxDig = 4; 825 break; 826 case 'U': 827 minDig = maxDig = 8; 828 break; 829 case 'x': 830 minDig = 1; 831 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { 832 ++offset; 833 braces = true; 834 maxDig = 8; 835 } else { 836 maxDig = 2; 837 } 838 break; 839 default: 840 dig = UCharacter.digit(c, 8); 841 if (dig >= 0) { 842 minDig = 1; 843 maxDig = 3; 844 n = 1; /* Already have first octal digit */ 845 bitsPerDigit = 3; 846 result = dig; 847 } 848 break; 849 } 850 if (minDig != 0) { 851 while (offset < length && n < maxDig) { 852 c = UTF16.charAt(s, offset); 853 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 854 if (dig < 0) { 855 break; 856 } 857 result = (result << bitsPerDigit) | dig; 858 offset += UTF16.getCharCount(c); 859 ++n; 860 } 861 if (n < minDig) { 862 return -1; 863 } 864 if (braces) { 865 if (c != 0x7D /*}*/) { 866 return -1; 867 } 868 ++offset; 869 } 870 if (result < 0 || result >= 0x110000) { 871 return -1; 872 } 873 // If an escape sequence specifies a lead surrogate, see 874 // if there is a trail surrogate after it, either as an 875 // escape or as a literal. If so, join them up into a 876 // supplementary. 
877 if (offset < length && 878 UTF16.isLeadSurrogate((char) result)) { 879 int ahead = offset+1; 880 c = s.charAt(offset); // [sic] get 16-bit code unit 881 if (c == '\\' && ahead < length) { 882 int o[] = new int[] { ahead }; 883 c = unescapeAt(s, o); 884 ahead = o[0]; 885 } 886 if (UTF16.isTrailSurrogate((char) c)) { 887 offset = ahead; 888 result = Character.toCodePoint((char) result, (char) c); 889 } 890 } 891 offset16[0] = offset; 892 return result; 893 } 894 895 /* Convert C-style escapes in table */ 896 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 897 if (c == UNESCAPE_MAP[i]) { 898 offset16[0] = offset; 899 return UNESCAPE_MAP[i+1]; 900 } else if (c < UNESCAPE_MAP[i]) { 901 break; 902 } 903 } 904 905 /* Map \cX to control-X: X & 0x1F */ 906 if (c == 'c' && offset < length) { 907 c = UTF16.charAt(s, offset); 908 offset16[0] = offset + UTF16.getCharCount(c); 909 return 0x1F & c; 910 } 911 912 /* If no special forms are recognized, then consider 913 * the backslash to generically escape the next character. */ 914 offset16[0] = offset; 915 return c; 916 } 917 918 /** 919 * Convert all escapes in a given string using unescapeAt(). 920 * @exception IllegalArgumentException if an invalid escape is 921 * seen. 922 */ 923 public static String unescape(String s) { 924 StringBuilder buf = new StringBuilder(); 925 int[] pos = new int[1]; 926 for (int i=0; i<s.length(); ) { 927 char c = s.charAt(i++); 928 if (c == '\\') { 929 pos[0] = i; 930 int e = unescapeAt(s, pos); 931 if (e < 0) { 932 throw new IllegalArgumentException("Invalid escape sequence " + 933 s.substring(i-1, Math.min(i+8, s.length()))); 934 } 935 buf.appendCodePoint(e); 936 i = pos[0]; 937 } else { 938 buf.append(c); 939 } 940 } 941 return buf.toString(); 942 } 943 944 /** 945 * Convert all escapes in a given string using unescapeAt(). 946 * Leave invalid escape sequences unchanged. 
947 */ 948 public static String unescapeLeniently(String s) { 949 StringBuilder buf = new StringBuilder(); 950 int[] pos = new int[1]; 951 for (int i=0; i<s.length(); ) { 952 char c = s.charAt(i++); 953 if (c == '\\') { 954 pos[0] = i; 955 int e = unescapeAt(s, pos); 956 if (e < 0) { 957 buf.append(c); 958 } else { 959 buf.appendCodePoint(e); 960 i = pos[0]; 961 } 962 } else { 963 buf.append(c); 964 } 965 } 966 return buf.toString(); 967 } 968 969 /** 970 * Convert a char to 4 hex uppercase digits. E.g., hex('a') => 971 * "0041". 972 */ 973 public static String hex(long ch) { 974 return hex(ch, 4); 975 } 976 977 /** 978 * Supplies a zero-padded hex representation of an integer (without 0x) 979 */ 980 static public String hex(long i, int places) { 981 if (i == Long.MIN_VALUE) return "-8000000000000000"; 982 boolean negative = i < 0; 983 if (negative) { 984 i = -i; 985 } 986 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); 987 if (result.length() < places) { 988 result = "0000000000000000".substring(result.length(),places) + result; 989 } 990 if (negative) { 991 return '-' + result; 992 } 993 return result; 994 } 995 996 /** 997 * Convert a string to comma-separated groups of 4 hex uppercase 998 * digits. E.g., hex('ab') => "0041,0042". 999 */ 1000 public static String hex(CharSequence s) { 1001 return hex(s, 4, ",", true, new StringBuilder()).toString(); 1002 } 1003 1004 /** 1005 * Convert a string to separated groups of hex uppercase 1006 * digits. E.g., hex('ab'...) => "0041,0042". Append the output 1007 * to the given Appendable. 
1008 */ 1009 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) { 1010 try { 1011 if (useCodePoints) { 1012 int cp; 1013 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1014 cp = Character.codePointAt(s, i); 1015 if (i != 0) { 1016 result.append(separator); 1017 } 1018 result.append(hex(cp,width)); 1019 } 1020 } else { 1021 for (int i = 0; i < s.length(); ++i) { 1022 if (i != 0) { 1023 result.append(separator); 1024 } 1025 result.append(hex(s.charAt(i),width)); 1026 } 1027 } 1028 return result; 1029 } catch (IOException e) { 1030 throw new IllegalIcuArgumentException(e); 1031 } 1032 } 1033 1034 public static String hex(byte[] o, int start, int end, String separator) { 1035 StringBuilder result = new StringBuilder(); 1036 //int ch; 1037 for (int i = start; i < end; ++i) { 1038 if (i != 0) result.append(separator); 1039 result.append(hex(o[i])); 1040 } 1041 return result.toString(); 1042 } 1043 1044 /** 1045 * Convert a string to comma-separated groups of 4 hex uppercase 1046 * digits. E.g., hex('ab') => "0041,0042". 1047 */ 1048 public static <S extends CharSequence> String hex(S s, int width, S separator) { 1049 return hex(s, width, separator, true, new StringBuilder()).toString(); 1050 } 1051 1052 /** 1053 * Split a string into pieces based on the given divider character 1054 * @param s the string to split 1055 * @param divider the character on which to split. Occurrences of 1056 * this character are not included in the output 1057 * @param output an array to receive the substrings between 1058 * instances of divider. It must be large enough on entry to 1059 * accomodate all output. Adjacent instances of the divider 1060 * character will place empty strings into output. Before 1061 * returning, output is padded out with empty strings. 
1062 */ 1063 public static void split(String s, char divider, String[] output) { 1064 int last = 0; 1065 int current = 0; 1066 int i; 1067 for (i = 0; i < s.length(); ++i) { 1068 if (s.charAt(i) == divider) { 1069 output[current++] = s.substring(last,i); 1070 last = i+1; 1071 } 1072 } 1073 output[current++] = s.substring(last,i); 1074 while (current < output.length) { 1075 output[current++] = ""; 1076 } 1077 } 1078 1079 /** 1080 * Split a string into pieces based on the given divider character 1081 * @param s the string to split 1082 * @param divider the character on which to split. Occurrences of 1083 * this character are not included in the output 1084 * @return output an array to receive the substrings between 1085 * instances of divider. Adjacent instances of the divider 1086 * character will place empty strings into output. 1087 */ 1088 public static String[] split(String s, char divider) { 1089 int last = 0; 1090 int i; 1091 ArrayList<String> output = new ArrayList<String>(); 1092 for (i = 0; i < s.length(); ++i) { 1093 if (s.charAt(i) == divider) { 1094 output.add(s.substring(last,i)); 1095 last = i+1; 1096 } 1097 } 1098 output.add( s.substring(last,i)); 1099 return output.toArray(new String[output.size()]); 1100 } 1101 1102 /** 1103 * Look up a given string in a string array. Returns the index at 1104 * which the first occurrence of the string was found in the 1105 * array, or -1 if it was not found. 1106 * @param source the string to search for 1107 * @param target the array of zero or more strings in which to 1108 * look for source 1109 * @return the index of target at which source first occurs, or -1 1110 * if not found 1111 */ 1112 public static int lookup(String source, String[] target) { 1113 for (int i = 0; i < target.length; ++i) { 1114 if (source.equals(target[i])) return i; 1115 } 1116 return -1; 1117 } 1118 1119 /** 1120 * Parse a single non-whitespace character 'ch', optionally 1121 * preceded by whitespace. 
1122 * @param id the string to be parsed 1123 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1124 * offset of the first character to be parsed. On output, pos[0] 1125 * is the index after the last parsed character. If the parse 1126 * fails, pos[0] will be unchanged. 1127 * @param ch the non-whitespace character to be parsed. 1128 * @return true if 'ch' is seen preceded by zero or more 1129 * whitespace characters. 1130 */ 1131 public static boolean parseChar(String id, int[] pos, char ch) { 1132 int start = pos[0]; 1133 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]); 1134 if (pos[0] == id.length() || 1135 id.charAt(pos[0]) != ch) { 1136 pos[0] = start; 1137 return false; 1138 } 1139 ++pos[0]; 1140 return true; 1141 } 1142 1143 /** 1144 * Parse a pattern string starting at offset pos. Keywords are 1145 * matched case-insensitively. Spaces may be skipped and may be 1146 * optional or required. Integer values may be parsed, and if 1147 * they are, they will be returned in the given array. If 1148 * successful, the offset of the next non-space character is 1149 * returned. On failure, -1 is returned. 1150 * @param pattern must only contain lowercase characters, which 1151 * will match their uppercase equivalents as well. A space 1152 * character matches one or more required spaces. A '~' character 1153 * matches zero or more optional spaces. A '#' character matches 1154 * an integer and stores it in parsedInts, which the caller must 1155 * ensure has enough capacity. 1156 * @param parsedInts array to receive parsed integers. Caller 1157 * must ensure that parsedInts.length is >= the number of '#' 1158 * signs in 'pattern'. 
1159 * @return the position after the last character parsed, or -1 if 1160 * the parse failed 1161 */ 1162 @SuppressWarnings("fallthrough") 1163 public static int parsePattern(String rule, int pos, int limit, 1164 String pattern, int[] parsedInts) { 1165 // TODO Update this to handle surrogates 1166 int[] p = new int[1]; 1167 int intCount = 0; // number of integers parsed 1168 for (int i=0; i<pattern.length(); ++i) { 1169 char cpat = pattern.charAt(i); 1170 char c; 1171 switch (cpat) { 1172 case ' ': 1173 if (pos >= limit) { 1174 return -1; 1175 } 1176 c = rule.charAt(pos++); 1177 if (!PatternProps.isWhiteSpace(c)) { 1178 return -1; 1179 } 1180 // FALL THROUGH to skipWhitespace 1181 case '~': 1182 pos = PatternProps.skipWhiteSpace(rule, pos); 1183 break; 1184 case '#': 1185 p[0] = pos; 1186 parsedInts[intCount++] = parseInteger(rule, p, limit); 1187 if (p[0] == pos) { 1188 // Syntax error; failed to parse integer 1189 return -1; 1190 } 1191 pos = p[0]; 1192 break; 1193 default: 1194 if (pos >= limit) { 1195 return -1; 1196 } 1197 c = (char) UCharacter.toLowerCase(rule.charAt(pos++)); 1198 if (c != cpat) { 1199 return -1; 1200 } 1201 break; 1202 } 1203 } 1204 return pos; 1205 } 1206 1207 /** 1208 * Parse a pattern string within the given Replaceable and a parsing 1209 * pattern. Characters are matched literally and case-sensitively 1210 * except for the following special characters: 1211 * 1212 * ~ zero or more Pattern_White_Space chars 1213 * 1214 * If end of pattern is reached with all matches along the way, 1215 * pos is advanced to the first unparsed index and returned. 1216 * Otherwise -1 is returned. 1217 * @param pat pattern that controls parsing 1218 * @param text text to be parsed, starting at index 1219 * @param index offset to first character to parse 1220 * @param limit offset after last character to parse 1221 * @return index after last parsed character, or -1 on parse failure. 
1222 */ 1223 public static int parsePattern(String pat, 1224 Replaceable text, 1225 int index, 1226 int limit) { 1227 int ipat = 0; 1228 1229 // empty pattern matches immediately 1230 if (ipat == pat.length()) { 1231 return index; 1232 } 1233 1234 int cpat = Character.codePointAt(pat, ipat); 1235 1236 while (index < limit) { 1237 int c = text.char32At(index); 1238 1239 // parse \s* 1240 if (cpat == '~') { 1241 if (PatternProps.isWhiteSpace(c)) { 1242 index += UTF16.getCharCount(c); 1243 continue; 1244 } else { 1245 if (++ipat == pat.length()) { 1246 return index; // success; c unparsed 1247 } 1248 // fall thru; process c again with next cpat 1249 } 1250 } 1251 1252 // parse literal 1253 else if (c == cpat) { 1254 int n = UTF16.getCharCount(c); 1255 index += n; 1256 ipat += n; 1257 if (ipat == pat.length()) { 1258 return index; // success; c parsed 1259 } 1260 // fall thru; get next cpat 1261 } 1262 1263 // match failure of literal 1264 else { 1265 return -1; 1266 } 1267 1268 cpat = UTF16.charAt(pat, ipat); 1269 } 1270 1271 return -1; // text ended before end of pat 1272 } 1273 1274 /** 1275 * Parse an integer at pos, either of the form \d+ or of the form 1276 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 1277 * or octal format. 1278 * @param pos INPUT-OUTPUT parameter. On input, the first 1279 * character to parse. On output, the character after the last 1280 * parsed character. 
1281 */ 1282 public static int parseInteger(String rule, int[] pos, int limit) { 1283 int count = 0; 1284 int value = 0; 1285 int p = pos[0]; 1286 int radix = 10; 1287 1288 if (rule.regionMatches(true, p, "0x", 0, 2)) { 1289 p += 2; 1290 radix = 16; 1291 } else if (p < limit && rule.charAt(p) == '0') { 1292 p++; 1293 count = 1; 1294 radix = 8; 1295 } 1296 1297 while (p < limit) { 1298 int d = UCharacter.digit(rule.charAt(p++), radix); 1299 if (d < 0) { 1300 --p; 1301 break; 1302 } 1303 ++count; 1304 int v = (value * radix) + d; 1305 if (v <= value) { 1306 // If there are too many input digits, at some point 1307 // the value will go negative, e.g., if we have seen 1308 // "0x8000000" already and there is another '0', when 1309 // we parse the next 0 the value will go negative. 1310 return 0; 1311 } 1312 value = v; 1313 } 1314 if (count > 0) { 1315 pos[0] = p; 1316 } 1317 return value; 1318 } 1319 1320 /** 1321 * Parse a Unicode identifier from the given string at the given 1322 * position. Return the identifier, or null if there is no 1323 * identifier. 1324 * @param str the string to parse 1325 * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the 1326 * first character to examine. It must be less than str.length(), 1327 * and it must not point to a whitespace character. That is, must 1328 * have pos[0] < str.length(). On 1329 * OUTPUT, the position after the last parsed character. 1330 * @return the Unicode identifier, or null if there is no valid 1331 * identifier at pos[0]. 
1332 */ 1333 public static String parseUnicodeIdentifier(String str, int[] pos) { 1334 // assert(pos[0] < str.length()); 1335 StringBuilder buf = new StringBuilder(); 1336 int p = pos[0]; 1337 while (p < str.length()) { 1338 int ch = Character.codePointAt(str, p); 1339 if (buf.length() == 0) { 1340 if (UCharacter.isUnicodeIdentifierStart(ch)) { 1341 buf.appendCodePoint(ch); 1342 } else { 1343 return null; 1344 } 1345 } else { 1346 if (UCharacter.isUnicodeIdentifierPart(ch)) { 1347 buf.appendCodePoint(ch); 1348 } else { 1349 break; 1350 } 1351 } 1352 p += UTF16.getCharCount(ch); 1353 } 1354 pos[0] = p; 1355 return buf.toString(); 1356 } 1357 1358 static final char DIGITS[] = { 1359 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 1360 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 1361 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 1362 'U', 'V', 'W', 'X', 'Y', 'Z' 1363 }; 1364 1365 /** 1366 * Append the digits of a positive integer to the given 1367 * <code>Appendable</code> in the given radix. This is 1368 * done recursively since it is easiest to generate the low- 1369 * order digit first, but it must be appended last. 1370 * 1371 * @param result is the <code>Appendable</code> to append to 1372 * @param n is the positive integer 1373 * @param radix is the radix, from 2 to 36 inclusive 1374 * @param minDigits is the minimum number of digits to append. 1375 */ 1376 private static <T extends Appendable> void recursiveAppendNumber(T result, int n, 1377 int radix, int minDigits) 1378 { 1379 try { 1380 int digit = n % radix; 1381 1382 if (n >= radix || minDigits > 1) { 1383 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 1384 } 1385 result.append(DIGITS[digit]); 1386 } catch (IOException e) { 1387 throw new IllegalIcuArgumentException(e); 1388 } 1389 } 1390 1391 /** 1392 * Append a number to the given Appendable in the given radix. 1393 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 1394 * radices 11 through 36. 
1395 * @param result the digits of the number are appended here 1396 * @param n the number to be converted to digits; may be negative. 1397 * If negative, a '-' is prepended to the digits. 1398 * @param radix a radix from 2 to 36 inclusive. 1399 * @param minDigits the minimum number of digits, not including 1400 * any '-', to produce. Values less than 2 have no effect. One 1401 * digit is always emitted regardless of this parameter. 1402 * @return a reference to result 1403 */ 1404 public static <T extends Appendable> T appendNumber(T result, int n, 1405 int radix, int minDigits) 1406 { 1407 try { 1408 if (radix < 2 || radix > 36) { 1409 throw new IllegalArgumentException("Illegal radix " + radix); 1410 } 1411 1412 1413 int abs = n; 1414 1415 if (n < 0) { 1416 abs = -n; 1417 result.append("-"); 1418 } 1419 1420 recursiveAppendNumber(result, abs, radix, minDigits); 1421 1422 return result; 1423 } catch (IOException e) { 1424 throw new IllegalIcuArgumentException(e); 1425 } 1426 1427 } 1428 1429 /** 1430 * Parse an unsigned 31-bit integer at the given offset. Use 1431 * UCharacter.digit() to parse individual characters into digits. 1432 * @param text the text to be parsed 1433 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1434 * offset within text at which to start parsing; it should point 1435 * to a valid digit. On exit, pos[0] is the offset after the last 1436 * parsed character. If the parse failed, it will be unchanged on 1437 * exit. Must be >= 0 on entry. 1438 * @param radix the radix in which to parse; must be >= 2 and <= 1439 * 36. 1440 * @return a non-negative parsed number, or -1 upon parse failure. 1441 * Parse fails if there are no digits, that is, if pos[0] does not 1442 * point to a valid digit on entry, or if the number to be parsed 1443 * does not fit into a 31-bit unsigned integer. 
1444 */ 1445 public static int parseNumber(String text, int[] pos, int radix) { 1446 // assert(pos[0] >= 0); 1447 // assert(radix >= 2); 1448 // assert(radix <= 36); 1449 int n = 0; 1450 int p = pos[0]; 1451 while (p < text.length()) { 1452 int ch = Character.codePointAt(text, p); 1453 int d = UCharacter.digit(ch, radix); 1454 if (d < 0) { 1455 break; 1456 } 1457 n = radix*n + d; 1458 // ASSUME that when a 32-bit integer overflows it becomes 1459 // negative. E.g., 214748364 * 10 + 8 => negative value. 1460 if (n < 0) { 1461 return -1; 1462 } 1463 ++p; 1464 } 1465 if (p == pos[0]) { 1466 return -1; 1467 } 1468 pos[0] = p; 1469 return n; 1470 } 1471 1472 /** 1473 * Return true if the character is NOT printable ASCII. The tab, 1474 * newline and linefeed characters are considered unprintable. 1475 */ 1476 public static boolean isUnprintable(int c) { 1477 //0x20 = 32 and 0x7E = 126 1478 return !(c >= 0x20 && c <= 0x7E); 1479 } 1480 1481 /** 1482 * Escape unprintable characters using <backslash>uxxxx notation 1483 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and 1484 * above. If the character is printable ASCII, then do nothing 1485 * and return FALSE. Otherwise, append the escaped notation and 1486 * return TRUE. 
1487 */ 1488 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { 1489 try { 1490 if (isUnprintable(c)) { 1491 result.append('\\'); 1492 if ((c & ~0xFFFF) != 0) { 1493 result.append('U'); 1494 result.append(DIGITS[0xF&(c>>28)]); 1495 result.append(DIGITS[0xF&(c>>24)]); 1496 result.append(DIGITS[0xF&(c>>20)]); 1497 result.append(DIGITS[0xF&(c>>16)]); 1498 } else { 1499 result.append('u'); 1500 } 1501 result.append(DIGITS[0xF&(c>>12)]); 1502 result.append(DIGITS[0xF&(c>>8)]); 1503 result.append(DIGITS[0xF&(c>>4)]); 1504 result.append(DIGITS[0xF&c]); 1505 return true; 1506 } 1507 return false; 1508 } catch (IOException e) { 1509 throw new IllegalIcuArgumentException(e); 1510 } 1511 } 1512 1513 /** 1514 * Returns the index of the first character in a set, ignoring quoted text. 1515 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1516 * found by a search for "h". Unlike String.indexOf(), this method searches 1517 * not for a single character, but for any character of the string 1518 * <code>setOfChars</code>. 1519 * @param text text to be searched 1520 * @param start the beginning index, inclusive; <code>0 <= start 1521 * <= limit</code>. 1522 * @param limit the ending index, exclusive; <code>start <= limit 1523 * <= text.length()</code>. 1524 * @param setOfChars string with one or more distinct characters 1525 * @return Offset of the first character in <code>setOfChars</code> 1526 * found, or -1 if not found. 1527 * @see String#indexOf 1528 */ 1529 public static int quotedIndexOf(String text, int start, int limit, 1530 String setOfChars) { 1531 for (int i=start; i<limit; ++i) { 1532 char c = text.charAt(i); 1533 if (c == BACKSLASH) { 1534 ++i; 1535 } else if (c == APOSTROPHE) { 1536 while (++i < limit 1537 && text.charAt(i) != APOSTROPHE) {} 1538 } else if (setOfChars.indexOf(c) >= 0) { 1539 return i; 1540 } 1541 } 1542 return -1; 1543 } 1544 1545 /** 1546 * Append a character to a rule that is being built up. 
/**
 * Appends a character to a rule that is being built up, quoting or escaping
 * it as needed. Characters that need quoting are accumulated in
 * {@code quoteBuf} and only written to {@code rule} (wrapped in apostrophes)
 * when a literal or an escaped unprintable arrives. To flush the quoteBuf to
 * rule, make one final call with isLiteral == true. If there is no final
 * character, pass in (int)-1 as c.
 *
 * @param rule the string to append the character to
 * @param c the character to append, or (int)-1 if none.
 * @param isLiteral if true, then the given character should not be
 * quoted or escaped.  Usually this means it is a syntactic element
 * such as > or $
 * @param escapeUnprintable if true, then unprintable characters
 * should be escaped using escapeUnprintable().  These escapes will
 * appear outside of quotes.
 * @param quoteBuf a buffer which is used to build up quoted
 * substrings.  The caller should initially supply an empty buffer,
 * and thereafter should not modify the buffer.  The buffer should be
 * cleared out by, at the end, calling this method with a literal
 * character (which may be -1).
 */
public static void appendToRule(StringBuffer rule,
        int c,
        boolean isLiteral,
        boolean escapeUnprintable,
        StringBuffer quoteBuf) {
    // If we are escaping unprintables, then escape them outside
    // quotes.  \\u and \\U are not recognized within quotes.  The same
    // logic applies to literals, but literals are never escaped.
    if (isLiteral ||
            (escapeUnprintable && Utility.isUnprintable(c))) {
        // Step 1: flush any pending quoted text to the rule.
        if (quoteBuf.length() > 0) {
            // We prefer backslash APOSTROPHE to double APOSTROPHE
            // (more readable, less similar to ") so if there are
            // double APOSTROPHEs at the ends, we pull them outside
            // of the quote.

            // If the first thing in the quoteBuf is APOSTROPHE
            // (doubled) then pull it out.
            while (quoteBuf.length() >= 2 &&
                    quoteBuf.charAt(0) == APOSTROPHE &&
                    quoteBuf.charAt(1) == APOSTROPHE) {
                rule.append(BACKSLASH).append(APOSTROPHE);
                quoteBuf.delete(0, 2);
            }
            // If the last thing in the quoteBuf is APOSTROPHE
            // (doubled) then remove and count it and add it after.
            int trailingCount = 0;
            while (quoteBuf.length() >= 2 &&
                    quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
                    quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
                quoteBuf.setLength(quoteBuf.length()-2);
                ++trailingCount;
            }
            // Whatever remains goes out as one quoted run; the buffer is
            // then emptied so the next quotable character starts afresh.
            if (quoteBuf.length() > 0) {
                rule.append(APOSTROPHE);
                rule.append(quoteBuf);
                rule.append(APOSTROPHE);
                quoteBuf.setLength(0);
            }
            // Re-emit the trailing apostrophes, backslash-escaped, after the quote.
            while (trailingCount-- > 0) {
                rule.append(BACKSLASH).append(APOSTROPHE);
            }
        }
        // Step 2: emit c itself, unless this call was a pure flush (c == -1).
        if (c != -1) {
            /* Since spaces are ignored during parsing, they are
             * emitted only for readability.  We emit one here
             * only if there isn't already one at the end of the
             * rule.
             */
            if (c == ' ') {
                int len = rule.length();
                if (len > 0 && rule.charAt(len-1) != ' ') {
                    rule.append(' ');
                }
            } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
                // Either escaping is off, or escapeUnprintable() declined
                // because c is printable: append c verbatim.
                rule.appendCodePoint(c);
            }
        }
    }

    // Escape ' and '\' and don't begin a quote just for them
    else if (quoteBuf.length() == 0 &&
            (c == APOSTROPHE || c == BACKSLASH)) {
        rule.append(BACKSLASH).append((char)c);
    }

    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
    // whitespace need quoting.  Also append stuff to quotes if we are
    // building up a quoted substring already.
    else if (quoteBuf.length() > 0 ||
            (c >= 0x0021 && c <= 0x007E &&
                    !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
                            (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
                            (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
                            PatternProps.isWhiteSpace(c)) {
        quoteBuf.appendCodePoint(c);
        // Double ' within a quote
        if (c == APOSTROPHE) {
            quoteBuf.append((char)c);
        }
    }

    // Otherwise just append
    else {
        rule.appendCodePoint(c);
    }
}

/**
 * Appends the given string to the rule. Calls the single-character
 * version of appendToRule for each UTF-16 code unit of {@code text}, passing
 * the remaining arguments through unchanged.
 *
 * @param rule the string to append to
 * @param text the text to append, one code unit at a time
 * @param isLiteral forwarded to the single-character version
 * @param escapeUnprintable forwarded to the single-character version
 * @param quoteBuf forwarded to the single-character version
 */
public static void appendToRule(StringBuffer rule,
        String text,
        boolean isLiteral,
        boolean escapeUnprintable,
        StringBuffer quoteBuf) {
    for (int i=0; i<text.length(); ++i) {
        // Okay to process in 16-bit code units here
        appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
    }
}

/**
 * Given a matcher reference, which may be null, append its
 * pattern as a literal to the given rule. A null matcher appends nothing.
 *
 * @param rule the string to append to
 * @param matcher the matcher whose toPattern(escapeUnprintable) output is
 * appended, or null
 * @param escapeUnprintable passed both to toPattern() and to appendToRule()
 * @param quoteBuf forwarded to the string version of appendToRule
 */
public static void appendToRule(StringBuffer rule,
        UnicodeMatcher matcher,
        boolean escapeUnprintable,
        StringBuffer quoteBuf) {
    if (matcher != null) {
        appendToRule(rule, matcher.toPattern(escapeUnprintable),
                true, escapeUnprintable, quoteBuf);
    }
}

/**
 * Compares 2 unsigned integers
 * @param source 32 bit unsigned integer
 * @param target 32 bit unsigned integer
 * @return 0 if equals, 1 if source is greater than target and -1
 *         otherwise
 */
public static final int compareUnsigned(int source, int target)
{
    // Adding MAGIC_UNSIGNED (0x80000000) flips the sign bit of each operand,
    // so the signed comparisons below order the values as unsigned.
    source += MAGIC_UNSIGNED;
    target += MAGIC_UNSIGNED;
    if (source < target) {
        return -1;
    }
    else if (source > target) {
        return 1;
    }
    return 0;
}
This is done 1702 * by doing a binary search through the bits. 1703 * 1704 * @param n is the integer 1705 * 1706 * @return the bit number of the highest bit, with 0 being 1707 * the low order bit, or -1 if <code>n</code> is not positive 1708 */ 1709 public static final byte highBit(int n) 1710 { 1711 if (n <= 0) { 1712 return -1; 1713 } 1714 1715 byte bit = 0; 1716 1717 if (n >= 1 << 16) { 1718 n >>= 16; 1719 bit += 16; 1720 } 1721 1722 if (n >= 1 << 8) { 1723 n >>= 8; 1724 bit += 8; 1725 } 1726 1727 if (n >= 1 << 4) { 1728 n >>= 4; 1729 bit += 4; 1730 } 1731 1732 if (n >= 1 << 2) { 1733 n >>= 2; 1734 bit += 2; 1735 } 1736 1737 if (n >= 1 << 1) { 1738 n >>= 1; 1739 bit += 1; 1740 } 1741 1742 return bit; 1743 } 1744 /** 1745 * Utility method to take a int[] containing codepoints and return 1746 * a string representation with code units. 1747 */ 1748 public static String valueOf(int[]source){ 1749 // TODO: Investigate why this method is not on UTF16 class 1750 StringBuilder result = new StringBuilder(source.length); 1751 for(int i=0; i<source.length; i++){ 1752 result.appendCodePoint(source[i]); 1753 } 1754 return result.toString(); 1755 } 1756 1757 1758 /** 1759 * Utility to duplicate a string count times 1760 * @param s String to be duplicated. 1761 * @param count Number of times to duplicate a string. 1762 */ 1763 public static String repeat(String s, int count) { 1764 if (count <= 0) return ""; 1765 if (count == 1) return s; 1766 StringBuilder result = new StringBuilder(); 1767 for (int i = 0; i < count; ++i) { 1768 result.append(s); 1769 } 1770 return result.toString(); 1771 } 1772 1773 public static String[] splitString(String src, String target) { 1774 return src.split("\\Q" + target + "\\E"); 1775 } 1776 1777 /** 1778 * Split the string at runs of ascii whitespace characters. 
/**
 * Split the string at runs of ascii whitespace characters.
 *
 * @param src the string to split
 * @return the pieces of {@code src} between matches of the regex {@code \s+}
 */
public static String[] splitWhitespace(String src) {
    return Pattern.compile("\\s+").split(src);
}

/**
 * Parse a list of hex numbers and return a string
 * @param string String of hex numbers.
 * @param minLength Minimal length of each hex number, in characters.
 * @param separator Regular expression separating the numbers; if null,
 *        runs of whitespace are used.
 * @return A string from hex numbers.
 */
public static String fromHex(String string, int minLength, String separator) {
    final String regex = (separator == null) ? "\\s+" : separator;
    return fromHex(string, minLength, Pattern.compile(regex));
}

/**
 * Parse a list of hex numbers and return a string
 * @param string String of hex numbers.
 * @param minLength Minimal length of each hex number, in characters.
 * @param separator Pattern separating the numbers.
 * @return A string built by appending each parsed number as a code point.
 * @throws IllegalArgumentException if a number is shorter than minLength
 */
public static String fromHex(String string, int minLength, Pattern separator) {
    final StringBuilder decoded = new StringBuilder();
    for (final String token : separator.split(string)) {
        if (token.length() < minLength) {
            throw new IllegalArgumentException("code point too short: " + token);
        }
        decoded.appendCodePoint(Integer.parseInt(token, 16));
    }
    return decoded.toString();
}
}