1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9package com.ibm.icu.impl; 10 11import java.io.IOException; 12import java.util.ArrayList; 13import java.util.Locale; 14import java.util.regex.Pattern; 15 16import com.ibm.icu.lang.UCharacter; 17import com.ibm.icu.text.Replaceable; 18import com.ibm.icu.text.UTF16; 19import com.ibm.icu.text.UnicodeMatcher; 20 21public final class Utility { 22 23 private static final char APOSTROPHE = '\''; 24 private static final char BACKSLASH = '\\'; 25 private static final int MAGIC_UNSIGNED = 0x80000000; 26 27 /** 28 * Convenience utility to compare two Object[]s. 29 * Ought to be in System 30 */ 31 public final static boolean arrayEquals(Object[] source, Object target) { 32 if (source == null) return (target == null); 33 if (!(target instanceof Object[])) return false; 34 Object[] targ = (Object[]) target; 35 return (source.length == targ.length 36 && arrayRegionMatches(source, 0, targ, 0, source.length)); 37 } 38 39 /** 40 * Convenience utility to compare two int[]s 41 * Ought to be in System 42 */ 43 public final static boolean arrayEquals(int[] source, Object target) { 44 if (source == null) return (target == null); 45 if (!(target instanceof int[])) return false; 46 int[] targ = (int[]) target; 47 return (source.length == targ.length 48 && arrayRegionMatches(source, 0, targ, 0, source.length)); 49 } 50 51 /** 52 * Convenience utility to compare two double[]s 53 * Ought to be in System 54 */ 55 public final static boolean arrayEquals(double[] source, Object target) { 56 if (source == null) return (target == null); 57 if (!(target instanceof double[])) return false; 58 double[] targ = (double[]) target; 59 return (source.length == targ.length 60 && arrayRegionMatches(source, 0, targ, 0, source.length)); 61 } 62 public final static boolean arrayEquals(byte[] source, Object target) { 63 if (source == null) return (target == null); 64 if (!(target instanceof byte[])) return false; 65 byte[] targ = (byte[]) target; 66 return (source.length == targ.length 67 && arrayRegionMatches(source, 0, targ, 0, source.length)); 68 } 69 70 /** 71 * Convenience utility to compare two Object[]s 72 * Ought to be in System 73 */ 74 public final static boolean arrayEquals(Object source, Object target) { 75 if (source == null) return (target == null); 76 // for some reason, the correct arrayEquals is not being called 77 // so do it by hand for now. 78 if (source instanceof Object[]) 79 return(arrayEquals((Object[]) source,target)); 80 if (source instanceof int[]) 81 return(arrayEquals((int[]) source,target)); 82 if (source instanceof double[]) 83 return(arrayEquals((double[]) source, target)); 84 if (source instanceof byte[]) 85 return(arrayEquals((byte[]) source,target)); 86 return source.equals(target); 87 } 88 89 /** 90 * Convenience utility to compare two Object[]s 91 * Ought to be in System. 92 * @param len the length to compare. 93 * The start indices and start+len must be valid. 94 */ 95 public final static boolean arrayRegionMatches(Object[] source, int sourceStart, 96 Object[] target, int targetStart, 97 int len) 98 { 99 int sourceEnd = sourceStart + len; 100 int delta = targetStart - sourceStart; 101 for (int i = sourceStart; i < sourceEnd; i++) { 102 if (!arrayEquals(source[i],target[i + delta])) 103 return false; 104 } 105 return true; 106 } 107 108 /** 109 * Convenience utility to compare two Object[]s 110 * Ought to be in System. 111 * @param len the length to compare. 112 * The start indices and start+len must be valid. 113 */ 114 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 115 char[] target, int targetStart, 116 int len) 117 { 118 int sourceEnd = sourceStart + len; 119 int delta = targetStart - sourceStart; 120 for (int i = sourceStart; i < sourceEnd; i++) { 121 if (source[i]!=target[i + delta]) 122 return false; 123 } 124 return true; 125 } 126 127 /** 128 * Convenience utility to compare two int[]s. 129 * @param len the length to compare. 130 * The start indices and start+len must be valid. 131 * Ought to be in System 132 */ 133 public final static boolean arrayRegionMatches(int[] source, int sourceStart, 134 int[] target, int targetStart, 135 int len) 136 { 137 int sourceEnd = sourceStart + len; 138 int delta = targetStart - sourceStart; 139 for (int i = sourceStart; i < sourceEnd; i++) { 140 if (source[i] != target[i + delta]) 141 return false; 142 } 143 return true; 144 } 145 146 /** 147 * Convenience utility to compare two arrays of doubles. 148 * @param len the length to compare. 149 * The start indices and start+len must be valid. 150 * Ought to be in System 151 */ 152 public final static boolean arrayRegionMatches(double[] source, int sourceStart, 153 double[] target, int targetStart, 154 int len) 155 { 156 int sourceEnd = sourceStart + len; 157 int delta = targetStart - sourceStart; 158 for (int i = sourceStart; i < sourceEnd; i++) { 159 if (source[i] != target[i + delta]) 160 return false; 161 } 162 return true; 163 } 164 public final static boolean arrayRegionMatches(byte[] source, int sourceStart, 165 byte[] target, int targetStart, int len){ 166 int sourceEnd = sourceStart + len; 167 int delta = targetStart - sourceStart; 168 for (int i = sourceStart; i < sourceEnd; i++) { 169 if (source[i] != target[i + delta]) 170 return false; 171 } 172 return true; 173 } 174 175 /** 176 * Trivial reference equality. 177 * This method should help document that we really want == not equals(), 178 * and to have a single place to suppress warnings from static analysis tools. 179 */ 180 public static final boolean sameObjects(Object a, Object b) { 181 return a == b; 182 } 183 184 /** 185 * Convenience utility. Does null checks on objects, then calls equals. 186 */ 187 public final static boolean objectEquals(Object a, Object b) { 188 return a == null ? 189 b == null ? true : false : 190 b == null ? false : a.equals(b); 191 } 192 193 /** 194 * Convenience utility. Does null checks on objects, then calls compare. 195 */ 196 public static <T extends Comparable<T>> int checkCompare(T a, T b) { 197 return a == null ? 198 b == null ? 0 : -1 : 199 b == null ? 1 : a.compareTo(b); 200 } 201 202 /** 203 * Convenience utility. Does null checks on object, then calls hashCode. 204 */ 205 public static int checkHash(Object a) { 206 return a == null ? 0 : a.hashCode(); 207 } 208 209 /** 210 * The ESCAPE character is used during run-length encoding. It signals 211 * a run of identical chars. 212 */ 213 private static final char ESCAPE = '\uA5A5'; 214 215 /** 216 * The ESCAPE_BYTE character is used during run-length encoding. It signals 217 * a run of identical bytes. 218 */ 219 static final byte ESCAPE_BYTE = (byte)0xA5; 220 221 /** 222 * Construct a string representing an int array. Use run-length encoding. 223 * A character represents itself, unless it is the ESCAPE character. Then 224 * the following notations are possible: 225 * ESCAPE ESCAPE ESCAPE literal 226 * ESCAPE n c n instances of character c 227 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 228 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 229 * If we encounter a run where n == ESCAPE, we represent this as: 230 * c ESCAPE n-1 c 231 * The ESCAPE value is chosen so as not to collide with commonly 232 * seen values. 233 */ 234 static public final String arrayToRLEString(int[] a) { 235 StringBuilder buffer = new StringBuilder(); 236 237 appendInt(buffer, a.length); 238 int runValue = a[0]; 239 int runLength = 1; 240 for (int i=1; i<a.length; ++i) { 241 int s = a[i]; 242 if (s == runValue && runLength < 0xFFFF) { 243 ++runLength; 244 } else { 245 encodeRun(buffer, runValue, runLength); 246 runValue = s; 247 runLength = 1; 248 } 249 } 250 encodeRun(buffer, runValue, runLength); 251 return buffer.toString(); 252 } 253 254 /** 255 * Construct a string representing a short array. Use run-length encoding. 256 * A character represents itself, unless it is the ESCAPE character. Then 257 * the following notations are possible: 258 * ESCAPE ESCAPE ESCAPE literal 259 * ESCAPE n c n instances of character c 260 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 261 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 262 * If we encounter a run where n == ESCAPE, we represent this as: 263 * c ESCAPE n-1 c 264 * The ESCAPE value is chosen so as not to collide with commonly 265 * seen values. 266 */ 267 static public final String arrayToRLEString(short[] a) { 268 StringBuilder buffer = new StringBuilder(); 269 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]); 270 buffer.append((char) (a.length >> 16)); 271 buffer.append((char) a.length); 272 short runValue = a[0]; 273 int runLength = 1; 274 for (int i=1; i<a.length; ++i) { 275 short s = a[i]; 276 if (s == runValue && runLength < 0xFFFF) ++runLength; 277 else { 278 encodeRun(buffer, runValue, runLength); 279 runValue = s; 280 runLength = 1; 281 } 282 } 283 encodeRun(buffer, runValue, runLength); 284 return buffer.toString(); 285 } 286 287 /** 288 * Construct a string representing a char array. Use run-length encoding. 289 * A character represents itself, unless it is the ESCAPE character. Then 290 * the following notations are possible: 291 * ESCAPE ESCAPE ESCAPE literal 292 * ESCAPE n c n instances of character c 293 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 294 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 295 * If we encounter a run where n == ESCAPE, we represent this as: 296 * c ESCAPE n-1 c 297 * The ESCAPE value is chosen so as not to collide with commonly 298 * seen values. 299 */ 300 static public final String arrayToRLEString(char[] a) { 301 StringBuilder buffer = new StringBuilder(); 302 buffer.append((char) (a.length >> 16)); 303 buffer.append((char) a.length); 304 char runValue = a[0]; 305 int runLength = 1; 306 for (int i=1; i<a.length; ++i) { 307 char s = a[i]; 308 if (s == runValue && runLength < 0xFFFF) ++runLength; 309 else { 310 encodeRun(buffer, (short)runValue, runLength); 311 runValue = s; 312 runLength = 1; 313 } 314 } 315 encodeRun(buffer, (short)runValue, runLength); 316 return buffer.toString(); 317 } 318 319 /** 320 * Construct a string representing a byte array. Use run-length encoding. 321 * Two bytes are packed into a single char, with a single extra zero byte at 322 * the end if needed. A byte represents itself, unless it is the 323 * ESCAPE_BYTE. Then the following notations are possible: 324 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal 325 * ESCAPE_BYTE n b n instances of byte b 326 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or 327 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. 328 * If we encounter a run where n == ESCAPE_BYTE, we represent this as: 329 * b ESCAPE_BYTE n-1 b 330 * The ESCAPE_BYTE value is chosen so as not to collide with commonly 331 * seen values. 332 */ 333 static public final String arrayToRLEString(byte[] a) { 334 StringBuilder buffer = new StringBuilder(); 335 buffer.append((char) (a.length >> 16)); 336 buffer.append((char) a.length); 337 byte runValue = a[0]; 338 int runLength = 1; 339 byte[] state = new byte[2]; 340 for (int i=1; i<a.length; ++i) { 341 byte b = a[i]; 342 if (b == runValue && runLength < 0xFF) ++runLength; 343 else { 344 encodeRun(buffer, runValue, runLength, state); 345 runValue = b; 346 runLength = 1; 347 } 348 } 349 encodeRun(buffer, runValue, runLength, state); 350 351 // We must save the final byte, if there is one, by padding 352 // an extra zero. 353 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state); 354 355 return buffer.toString(); 356 } 357 358 /** 359 * Encode a run, possibly a degenerate run (of < 4 values). 360 * @param length The length of the run; must be > 0 && <= 0xFFFF. 361 */ 362 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) { 363 if (length < 4) { 364 for (int j=0; j<length; ++j) { 365 if (value == ESCAPE) { 366 appendInt(buffer, value); 367 } 368 appendInt(buffer, value); 369 } 370 } 371 else { 372 if (length == ESCAPE) { 373 if (value == ESCAPE) { 374 appendInt(buffer, ESCAPE); 375 } 376 appendInt(buffer, value); 377 --length; 378 } 379 appendInt(buffer, ESCAPE); 380 appendInt(buffer, length); 381 appendInt(buffer, value); // Don't need to escape this value 382 } 383 } 384 385 private static final <T extends Appendable> void appendInt(T buffer, int value) { 386 try { 387 buffer.append((char)(value >>> 16)); 388 buffer.append((char)(value & 0xFFFF)); 389 } catch (IOException e) { 390 throw new IllegalIcuArgumentException(e); 391 } 392 } 393 394 /** 395 * Encode a run, possibly a degenerate run (of < 4 values). 396 * @param length The length of the run; must be > 0 && <= 0xFFFF. 397 */ 398 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) { 399 try { 400 char valueChar = (char) value; 401 if (length < 4) { 402 for (int j=0; j<length; ++j) { 403 if (valueChar == ESCAPE) { 404 buffer.append(ESCAPE); 405 } 406 buffer.append(valueChar); 407 } 408 } 409 else { 410 if (length == ESCAPE) { 411 if (valueChar == ESCAPE) { 412 buffer.append(ESCAPE); 413 } 414 buffer.append(valueChar); 415 --length; 416 } 417 buffer.append(ESCAPE); 418 buffer.append((char) length); 419 buffer.append(valueChar); // Don't need to escape this value 420 } 421 } catch (IOException e) { 422 throw new IllegalIcuArgumentException(e); 423 } 424 } 425 426 /** 427 * Encode a run, possibly a degenerate run (of < 4 values). 428 * @param length The length of the run; must be > 0 && <= 0xFF. 429 */ 430 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length, 431 byte[] state) { 432 if (length < 4) { 433 for (int j=0; j<length; ++j) { 434 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 435 appendEncodedByte(buffer, value, state); 436 } 437 } 438 else { 439 if ((byte)length == ESCAPE_BYTE) { 440 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 441 appendEncodedByte(buffer, value, state); 442 --length; 443 } 444 appendEncodedByte(buffer, ESCAPE_BYTE, state); 445 appendEncodedByte(buffer, (byte)length, state); 446 appendEncodedByte(buffer, value, state); // Don't need to escape this value 447 } 448 } 449 450 /** 451 * Append a byte to the given Appendable, packing two bytes into each 452 * character. The state parameter maintains intermediary data between 453 * calls. 454 * @param state A two-element array, with state[0] == 0 if this is the 455 * first byte of a pair, or state[0] != 0 if this is the second byte 456 * of a pair, in which case state[1] is the first byte. 457 */ 458 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value, 459 byte[] state) { 460 try { 461 if (state[0] != 0) { 462 char c = (char) ((state[1] << 8) | ((value) & 0xFF)); 463 buffer.append(c); 464 state[0] = 0; 465 } 466 else { 467 state[0] = 1; 468 state[1] = value; 469 } 470 } catch (IOException e) { 471 throw new IllegalIcuArgumentException(e); 472 } 473 } 474 475 /** 476 * Construct an array of ints from a run-length encoded string. 477 */ 478 static public final int[] RLEStringToIntArray(String s) { 479 int length = getInt(s, 0); 480 int[] array = new int[length]; 481 int ai = 0, i = 1; 482 483 int maxI = s.length() / 2; 484 while (ai < length && i < maxI) { 485 int c = getInt(s, i++); 486 487 if (c == ESCAPE) { 488 c = getInt(s, i++); 489 if (c == ESCAPE) { 490 array[ai++] = c; 491 } else { 492 int runLength = c; 493 int runValue = getInt(s, i++); 494 for (int j=0; j<runLength; ++j) { 495 array[ai++] = runValue; 496 } 497 } 498 } 499 else { 500 array[ai++] = c; 501 } 502 } 503 504 if (ai != length || i != maxI) { 505 throw new IllegalStateException("Bad run-length encoded int array"); 506 } 507 508 return array; 509 } 510 static final int getInt(String s, int i) { 511 return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1); 512 } 513 514 /** 515 * Construct an array of shorts from a run-length encoded string. 516 */ 517 static public final short[] RLEStringToShortArray(String s) { 518 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 519 short[] array = new short[length]; 520 int ai = 0; 521 for (int i=2; i<s.length(); ++i) { 522 char c = s.charAt(i); 523 if (c == ESCAPE) { 524 c = s.charAt(++i); 525 if (c == ESCAPE) { 526 array[ai++] = (short) c; 527 } else { 528 int runLength = c; 529 short runValue = (short) s.charAt(++i); 530 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 531 } 532 } 533 else { 534 array[ai++] = (short) c; 535 } 536 } 537 538 if (ai != length) 539 throw new IllegalStateException("Bad run-length encoded short array"); 540 541 return array; 542 } 543 544 /** 545 * Construct an array of shorts from a run-length encoded string. 546 */ 547 static public final char[] RLEStringToCharArray(String s) { 548 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 549 char[] array = new char[length]; 550 int ai = 0; 551 for (int i=2; i<s.length(); ++i) { 552 char c = s.charAt(i); 553 if (c == ESCAPE) { 554 c = s.charAt(++i); 555 if (c == ESCAPE) { 556 array[ai++] = c; 557 } else { 558 int runLength = c; 559 char runValue = s.charAt(++i); 560 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 561 } 562 } 563 else { 564 array[ai++] = c; 565 } 566 } 567 568 if (ai != length) 569 throw new IllegalStateException("Bad run-length encoded short array"); 570 571 return array; 572 } 573 574 /** 575 * Construct an array of bytes from a run-length encoded string. 576 */ 577 static public final byte[] RLEStringToByteArray(String s) { 578 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 579 byte[] array = new byte[length]; 580 boolean nextChar = true; 581 char c = 0; 582 int node = 0; 583 int runLength = 0; 584 int i = 2; 585 for (int ai=0; ai<length; ) { 586 // This part of the loop places the next byte into the local 587 // variable 'b' each time through the loop. It keeps the 588 // current character in 'c' and uses the boolean 'nextChar' 589 // to see if we've taken both bytes out of 'c' yet. 590 byte b; 591 if (nextChar) { 592 c = s.charAt(i++); 593 b = (byte) (c >> 8); 594 nextChar = false; 595 } 596 else { 597 b = (byte) (c & 0xFF); 598 nextChar = true; 599 } 600 601 // This part of the loop is a tiny state machine which handles 602 // the parsing of the run-length encoding. This would be simpler 603 // if we could look ahead, but we can't, so we use 'node' to 604 // move between three nodes in the state machine. 605 switch (node) { 606 case 0: 607 // Normal idle node 608 if (b == ESCAPE_BYTE) { 609 node = 1; 610 } 611 else { 612 array[ai++] = b; 613 } 614 break; 615 case 1: 616 // We have seen one ESCAPE_BYTE; we expect either a second 617 // one, or a run length and value. 618 if (b == ESCAPE_BYTE) { 619 array[ai++] = ESCAPE_BYTE; 620 node = 0; 621 } 622 else { 623 runLength = b; 624 // Interpret signed byte as unsigned 625 if (runLength < 0) runLength += 0x100; 626 node = 2; 627 } 628 break; 629 case 2: 630 // We have seen an ESCAPE_BYTE and length byte. We interpret 631 // the next byte as the value to be repeated. 632 for (int j=0; j<runLength; ++j) array[ai++] = b; 633 node = 0; 634 break; 635 } 636 } 637 638 if (node != 0) 639 throw new IllegalStateException("Bad run-length encoded byte array"); 640 641 if (i != s.length()) 642 throw new IllegalStateException("Excess data in RLE byte array string"); 643 644 return array; 645 } 646 647 static public String LINE_SEPARATOR = System.getProperty("line.separator"); 648 649 /** 650 * Format a String for representation in a source file. This includes 651 * breaking it into lines and escaping characters using octal notation 652 * when necessary (control characters and double quotes). 653 */ 654 static public final String formatForSource(String s) { 655 StringBuilder buffer = new StringBuilder(); 656 for (int i=0; i<s.length();) { 657 if (i > 0) buffer.append('+').append(LINE_SEPARATOR); 658 buffer.append(" \""); 659 int count = 11; 660 while (i<s.length() && count<80) { 661 char c = s.charAt(i++); 662 if (c < '\u0020' || c == '"' || c == '\\') { 663 if (c == '\n') { 664 buffer.append("\\n"); 665 count += 2; 666 } else if (c == '\t') { 667 buffer.append("\\t"); 668 count += 2; 669 } else if (c == '\r') { 670 buffer.append("\\r"); 671 count += 2; 672 } else { 673 // Represent control characters, backslash and double quote 674 // using octal notation; otherwise the string we form 675 // won't compile, since Unicode escape sequences are 676 // processed before tokenization. 677 buffer.append('\\'); 678 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 679 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 680 buffer.append(HEX_DIGIT[(c & 0007)]); 681 count += 4; 682 } 683 } 684 else if (c <= '\u007E') { 685 buffer.append(c); 686 count += 1; 687 } 688 else { 689 buffer.append("\\u"); 690 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 691 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 692 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 693 buffer.append(HEX_DIGIT[(c & 0x000F)]); 694 count += 6; 695 } 696 } 697 buffer.append('"'); 698 } 699 return buffer.toString(); 700 } 701 702 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', 703 '8','9','A','B','C','D','E','F'}; 704 705 /** 706 * Format a String for representation in a source file. Like 707 * formatForSource but does not do line breaking. 708 */ 709 static public final String format1ForSource(String s) { 710 StringBuilder buffer = new StringBuilder(); 711 buffer.append("\""); 712 for (int i=0; i<s.length();) { 713 char c = s.charAt(i++); 714 if (c < '\u0020' || c == '"' || c == '\\') { 715 if (c == '\n') { 716 buffer.append("\\n"); 717 } else if (c == '\t') { 718 buffer.append("\\t"); 719 } else if (c == '\r') { 720 buffer.append("\\r"); 721 } else { 722 // Represent control characters, backslash and double quote 723 // using octal notation; otherwise the string we form 724 // won't compile, since Unicode escape sequences are 725 // processed before tokenization. 726 buffer.append('\\'); 727 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 728 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 729 buffer.append(HEX_DIGIT[(c & 0007)]); 730 } 731 } 732 else if (c <= '\u007E') { 733 buffer.append(c); 734 } 735 else { 736 buffer.append("\\u"); 737 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 738 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 739 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 740 buffer.append(HEX_DIGIT[(c & 0x000F)]); 741 } 742 } 743 buffer.append('"'); 744 return buffer.toString(); 745 } 746 747 /** 748 * Convert characters outside the range U+0020 to U+007F to 749 * Unicode escapes, and convert backslash to a double backslash. 750 */ 751 public static final String escape(String s) { 752 StringBuilder buf = new StringBuilder(); 753 for (int i=0; i<s.length(); ) { 754 int c = Character.codePointAt(s, i); 755 i += UTF16.getCharCount(c); 756 if (c >= ' ' && c <= 0x007F) { 757 if (c == '\\') { 758 buf.append("\\\\"); // That is, "\\" 759 } else { 760 buf.append((char)c); 761 } 762 } else { 763 boolean four = c <= 0xFFFF; 764 buf.append(four ? "\\u" : "\\U"); 765 buf.append(hex(c, four ? 4 : 8)); 766 } 767 } 768 return buf.toString(); 769 } 770 771 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 772 static private final char[] UNESCAPE_MAP = { 773 /*" 0x22, 0x22 */ 774 /*' 0x27, 0x27 */ 775 /*? 0x3F, 0x3F */ 776 /*\ 0x5C, 0x5C */ 777 /*a*/ 0x61, 0x07, 778 /*b*/ 0x62, 0x08, 779 /*e*/ 0x65, 0x1b, 780 /*f*/ 0x66, 0x0c, 781 /*n*/ 0x6E, 0x0a, 782 /*r*/ 0x72, 0x0d, 783 /*t*/ 0x74, 0x09, 784 /*v*/ 0x76, 0x0b 785 }; 786 787 /** 788 * Convert an escape to a 32-bit code point value. We attempt 789 * to parallel the icu4c unescapeAt() function. 790 * @param offset16 an array containing offset to the character 791 * <em>after</em> the backslash. Upon return offset16[0] will 792 * be updated to point after the escape sequence. 793 * @return character value from 0 to 10FFFF, or -1 on error. 794 */ 795 public static int unescapeAt(String s, int[] offset16) { 796 int c; 797 int result = 0; 798 int n = 0; 799 int minDig = 0; 800 int maxDig = 0; 801 int bitsPerDigit = 4; 802 int dig; 803 int i; 804 boolean braces = false; 805 806 /* Check that offset is in range */ 807 int offset = offset16[0]; 808 int length = s.length(); 809 if (offset < 0 || offset >= length) { 810 return -1; 811 } 812 813 /* Fetch first UChar after '\\' */ 814 c = Character.codePointAt(s, offset); 815 offset += UTF16.getCharCount(c); 816 817 /* Convert hexadecimal and octal escapes */ 818 switch (c) { 819 case 'u': 820 minDig = maxDig = 4; 821 break; 822 case 'U': 823 minDig = maxDig = 8; 824 break; 825 case 'x': 826 minDig = 1; 827 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { 828 ++offset; 829 braces = true; 830 maxDig = 8; 831 } else { 832 maxDig = 2; 833 } 834 break; 835 default: 836 dig = UCharacter.digit(c, 8); 837 if (dig >= 0) { 838 minDig = 1; 839 maxDig = 3; 840 n = 1; /* Already have first octal digit */ 841 bitsPerDigit = 3; 842 result = dig; 843 } 844 break; 845 } 846 if (minDig != 0) { 847 while (offset < length && n < maxDig) { 848 c = UTF16.charAt(s, offset); 849 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 850 if (dig < 0) { 851 break; 852 } 853 result = (result << bitsPerDigit) | dig; 854 offset += UTF16.getCharCount(c); 855 ++n; 856 } 857 if (n < minDig) { 858 return -1; 859 } 860 if (braces) { 861 if (c != 0x7D /*}*/) { 862 return -1; 863 } 864 ++offset; 865 } 866 if (result < 0 || result >= 0x110000) { 867 return -1; 868 } 869 // If an escape sequence specifies a lead surrogate, see 870 // if there is a trail surrogate after it, either as an 871 // escape or as a literal. If so, join them up into a 872 // supplementary. 873 if (offset < length && 874 UTF16.isLeadSurrogate((char) result)) { 875 int ahead = offset+1; 876 c = s.charAt(offset); // [sic] get 16-bit code unit 877 if (c == '\\' && ahead < length) { 878 int o[] = new int[] { ahead }; 879 c = unescapeAt(s, o); 880 ahead = o[0]; 881 } 882 if (UTF16.isTrailSurrogate((char) c)) { 883 offset = ahead; 884 result = Character.toCodePoint((char) result, (char) c); 885 } 886 } 887 offset16[0] = offset; 888 return result; 889 } 890 891 /* Convert C-style escapes in table */ 892 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 893 if (c == UNESCAPE_MAP[i]) { 894 offset16[0] = offset; 895 return UNESCAPE_MAP[i+1]; 896 } else if (c < UNESCAPE_MAP[i]) { 897 break; 898 } 899 } 900 901 /* Map \cX to control-X: X & 0x1F */ 902 if (c == 'c' && offset < length) { 903 c = UTF16.charAt(s, offset); 904 offset16[0] = offset + UTF16.getCharCount(c); 905 return 0x1F & c; 906 } 907 908 /* If no special forms are recognized, then consider 909 * the backslash to generically escape the next character. */ 910 offset16[0] = offset; 911 return c; 912 } 913 914 /** 915 * Convert all escapes in a given string using unescapeAt(). 916 * @exception IllegalArgumentException if an invalid escape is 917 * seen. 918 */ 919 public static String unescape(String s) { 920 StringBuilder buf = new StringBuilder(); 921 int[] pos = new int[1]; 922 for (int i=0; i<s.length(); ) { 923 char c = s.charAt(i++); 924 if (c == '\\') { 925 pos[0] = i; 926 int e = unescapeAt(s, pos); 927 if (e < 0) { 928 throw new IllegalArgumentException("Invalid escape sequence " + 929 s.substring(i-1, Math.min(i+8, s.length()))); 930 } 931 buf.appendCodePoint(e); 932 i = pos[0]; 933 } else { 934 buf.append(c); 935 } 936 } 937 return buf.toString(); 938 } 939 940 /** 941 * Convert all escapes in a given string using unescapeAt(). 942 * Leave invalid escape sequences unchanged. 943 */ 944 public static String unescapeLeniently(String s) { 945 StringBuilder buf = new StringBuilder(); 946 int[] pos = new int[1]; 947 for (int i=0; i<s.length(); ) { 948 char c = s.charAt(i++); 949 if (c == '\\') { 950 pos[0] = i; 951 int e = unescapeAt(s, pos); 952 if (e < 0) { 953 buf.append(c); 954 } else { 955 buf.appendCodePoint(e); 956 i = pos[0]; 957 } 958 } else { 959 buf.append(c); 960 } 961 } 962 return buf.toString(); 963 } 964 965 /** 966 * Convert a char to 4 hex uppercase digits. E.g., hex('a') => 967 * "0041". 968 */ 969 public static String hex(long ch) { 970 return hex(ch, 4); 971 } 972 973 /** 974 * Supplies a zero-padded hex representation of an integer (without 0x) 975 */ 976 static public String hex(long i, int places) { 977 if (i == Long.MIN_VALUE) return "-8000000000000000"; 978 boolean negative = i < 0; 979 if (negative) { 980 i = -i; 981 } 982 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); 983 if (result.length() < places) { 984 result = "0000000000000000".substring(result.length(),places) + result; 985 } 986 if (negative) { 987 return '-' + result; 988 } 989 return result; 990 } 991 992 /** 993 * Convert a string to comma-separated groups of 4 hex uppercase 994 * digits. E.g., hex('ab') => "0041,0042". 995 */ 996 public static String hex(CharSequence s) { 997 return hex(s, 4, ",", true, new StringBuilder()).toString(); 998 } 999 1000 /** 1001 * Convert a string to separated groups of hex uppercase 1002 * digits. E.g., hex('ab'...) => "0041,0042". Append the output 1003 * to the given Appendable. 1004 */ 1005 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) { 1006 try { 1007 if (useCodePoints) { 1008 int cp; 1009 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1010 cp = Character.codePointAt(s, i); 1011 if (i != 0) { 1012 result.append(separator); 1013 } 1014 result.append(hex(cp,width)); 1015 } 1016 } else { 1017 for (int i = 0; i < s.length(); ++i) { 1018 if (i != 0) { 1019 result.append(separator); 1020 } 1021 result.append(hex(s.charAt(i),width)); 1022 } 1023 } 1024 return result; 1025 } catch (IOException e) { 1026 throw new IllegalIcuArgumentException(e); 1027 } 1028 } 1029 1030 public static String hex(byte[] o, int start, int end, String separator) { 1031 StringBuilder result = new StringBuilder(); 1032 //int ch; 1033 for (int i = start; i < end; ++i) { 1034 if (i != 0) result.append(separator); 1035 result.append(hex(o[i])); 1036 } 1037 return result.toString(); 1038 } 1039 1040 /** 1041 * Convert a string to comma-separated groups of 4 hex uppercase 1042 * digits. E.g., hex('ab') => "0041,0042". 1043 */ 1044 public static <S extends CharSequence> String hex(S s, int width, S separator) { 1045 return hex(s, width, separator, true, new StringBuilder()).toString(); 1046 } 1047 1048 /** 1049 * Split a string into pieces based on the given divider character 1050 * @param s the string to split 1051 * @param divider the character on which to split. Occurrences of 1052 * this character are not included in the output 1053 * @param output an array to receive the substrings between 1054 * instances of divider. It must be large enough on entry to 1055 * accomodate all output. Adjacent instances of the divider 1056 * character will place empty strings into output. Before 1057 * returning, output is padded out with empty strings. 1058 */ 1059 public static void split(String s, char divider, String[] output) { 1060 int last = 0; 1061 int current = 0; 1062 int i; 1063 for (i = 0; i < s.length(); ++i) { 1064 if (s.charAt(i) == divider) { 1065 output[current++] = s.substring(last,i); 1066 last = i+1; 1067 } 1068 } 1069 output[current++] = s.substring(last,i); 1070 while (current < output.length) { 1071 output[current++] = ""; 1072 } 1073 } 1074 1075 /** 1076 * Split a string into pieces based on the given divider character 1077 * @param s the string to split 1078 * @param divider the character on which to split. Occurrences of 1079 * this character are not included in the output 1080 * @return output an array to receive the substrings between 1081 * instances of divider. Adjacent instances of the divider 1082 * character will place empty strings into output. 1083 */ 1084 public static String[] split(String s, char divider) { 1085 int last = 0; 1086 int i; 1087 ArrayList<String> output = new ArrayList<String>(); 1088 for (i = 0; i < s.length(); ++i) { 1089 if (s.charAt(i) == divider) { 1090 output.add(s.substring(last,i)); 1091 last = i+1; 1092 } 1093 } 1094 output.add( s.substring(last,i)); 1095 return output.toArray(new String[output.size()]); 1096 } 1097 1098 /** 1099 * Look up a given string in a string array. Returns the index at 1100 * which the first occurrence of the string was found in the 1101 * array, or -1 if it was not found. 1102 * @param source the string to search for 1103 * @param target the array of zero or more strings in which to 1104 * look for source 1105 * @return the index of target at which source first occurs, or -1 1106 * if not found 1107 */ 1108 public static int lookup(String source, String[] target) { 1109 for (int i = 0; i < target.length; ++i) { 1110 if (source.equals(target[i])) return i; 1111 } 1112 return -1; 1113 } 1114 1115 /** 1116 * Parse a single non-whitespace character 'ch', optionally 1117 * preceded by whitespace. 1118 * @param id the string to be parsed 1119 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1120 * offset of the first character to be parsed. On output, pos[0] 1121 * is the index after the last parsed character. If the parse 1122 * fails, pos[0] will be unchanged. 1123 * @param ch the non-whitespace character to be parsed. 1124 * @return true if 'ch' is seen preceded by zero or more 1125 * whitespace characters. 1126 */ 1127 public static boolean parseChar(String id, int[] pos, char ch) { 1128 int start = pos[0]; 1129 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]); 1130 if (pos[0] == id.length() || 1131 id.charAt(pos[0]) != ch) { 1132 pos[0] = start; 1133 return false; 1134 } 1135 ++pos[0]; 1136 return true; 1137 } 1138 1139 /** 1140 * Parse a pattern string starting at offset pos. Keywords are 1141 * matched case-insensitively. Spaces may be skipped and may be 1142 * optional or required. Integer values may be parsed, and if 1143 * they are, they will be returned in the given array. If 1144 * successful, the offset of the next non-space character is 1145 * returned. On failure, -1 is returned. 1146 * @param pattern must only contain lowercase characters, which 1147 * will match their uppercase equivalents as well. A space 1148 * character matches one or more required spaces. A '~' character 1149 * matches zero or more optional spaces. A '#' character matches 1150 * an integer and stores it in parsedInts, which the caller must 1151 * ensure has enough capacity. 1152 * @param parsedInts array to receive parsed integers. Caller 1153 * must ensure that parsedInts.length is >= the number of '#' 1154 * signs in 'pattern'. 1155 * @return the position after the last character parsed, or -1 if 1156 * the parse failed 1157 */ 1158 @SuppressWarnings("fallthrough") 1159 public static int parsePattern(String rule, int pos, int limit, 1160 String pattern, int[] parsedInts) { 1161 // TODO Update this to handle surrogates 1162 int[] p = new int[1]; 1163 int intCount = 0; // number of integers parsed 1164 for (int i=0; i<pattern.length(); ++i) { 1165 char cpat = pattern.charAt(i); 1166 char c; 1167 switch (cpat) { 1168 case ' ': 1169 if (pos >= limit) { 1170 return -1; 1171 } 1172 c = rule.charAt(pos++); 1173 if (!PatternProps.isWhiteSpace(c)) { 1174 return -1; 1175 } 1176 // FALL THROUGH to skipWhitespace 1177 case '~': 1178 pos = PatternProps.skipWhiteSpace(rule, pos); 1179 break; 1180 case '#': 1181 p[0] = pos; 1182 parsedInts[intCount++] = parseInteger(rule, p, limit); 1183 if (p[0] == pos) { 1184 // Syntax error; failed to parse integer 1185 return -1; 1186 } 1187 pos = p[0]; 1188 break; 1189 default: 1190 if (pos >= limit) { 1191 return -1; 1192 } 1193 c = (char) UCharacter.toLowerCase(rule.charAt(pos++)); 1194 if (c != cpat) { 1195 return -1; 1196 } 1197 break; 1198 } 1199 } 1200 return pos; 1201 } 1202 1203 /** 1204 * Parse a pattern string within the given Replaceable and a parsing 1205 * pattern. Characters are matched literally and case-sensitively 1206 * except for the following special characters: 1207 * 1208 * ~ zero or more Pattern_White_Space chars 1209 * 1210 * If end of pattern is reached with all matches along the way, 1211 * pos is advanced to the first unparsed index and returned. 1212 * Otherwise -1 is returned. 1213 * @param pat pattern that controls parsing 1214 * @param text text to be parsed, starting at index 1215 * @param index offset to first character to parse 1216 * @param limit offset after last character to parse 1217 * @return index after last parsed character, or -1 on parse failure. 1218 */ 1219 public static int parsePattern(String pat, 1220 Replaceable text, 1221 int index, 1222 int limit) { 1223 int ipat = 0; 1224 1225 // empty pattern matches immediately 1226 if (ipat == pat.length()) { 1227 return index; 1228 } 1229 1230 int cpat = Character.codePointAt(pat, ipat); 1231 1232 while (index < limit) { 1233 int c = text.char32At(index); 1234 1235 // parse \s* 1236 if (cpat == '~') { 1237 if (PatternProps.isWhiteSpace(c)) { 1238 index += UTF16.getCharCount(c); 1239 continue; 1240 } else { 1241 if (++ipat == pat.length()) { 1242 return index; // success; c unparsed 1243 } 1244 // fall thru; process c again with next cpat 1245 } 1246 } 1247 1248 // parse literal 1249 else if (c == cpat) { 1250 int n = UTF16.getCharCount(c); 1251 index += n; 1252 ipat += n; 1253 if (ipat == pat.length()) { 1254 return index; // success; c parsed 1255 } 1256 // fall thru; get next cpat 1257 } 1258 1259 // match failure of literal 1260 else { 1261 return -1; 1262 } 1263 1264 cpat = UTF16.charAt(pat, ipat); 1265 } 1266 1267 return -1; // text ended before end of pat 1268 } 1269 1270 /** 1271 * Parse an integer at pos, either of the form \d+ or of the form 1272 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 1273 * or octal format. 1274 * @param pos INPUT-OUTPUT parameter. On input, the first 1275 * character to parse. On output, the character after the last 1276 * parsed character. 1277 */ 1278 public static int parseInteger(String rule, int[] pos, int limit) { 1279 int count = 0; 1280 int value = 0; 1281 int p = pos[0]; 1282 int radix = 10; 1283 1284 if (rule.regionMatches(true, p, "0x", 0, 2)) { 1285 p += 2; 1286 radix = 16; 1287 } else if (p < limit && rule.charAt(p) == '0') { 1288 p++; 1289 count = 1; 1290 radix = 8; 1291 } 1292 1293 while (p < limit) { 1294 int d = UCharacter.digit(rule.charAt(p++), radix); 1295 if (d < 0) { 1296 --p; 1297 break; 1298 } 1299 ++count; 1300 int v = (value * radix) + d; 1301 if (v <= value) { 1302 // If there are too many input digits, at some point 1303 // the value will go negative, e.g., if we have seen 1304 // "0x8000000" already and there is another '0', when 1305 // we parse the next 0 the value will go negative. 1306 return 0; 1307 } 1308 value = v; 1309 } 1310 if (count > 0) { 1311 pos[0] = p; 1312 } 1313 return value; 1314 } 1315 1316 /** 1317 * Parse a Unicode identifier from the given string at the given 1318 * position. Return the identifier, or null if there is no 1319 * identifier. 1320 * @param str the string to parse 1321 * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the 1322 * first character to examine. It must be less than str.length(), 1323 * and it must not point to a whitespace character. That is, must 1324 * have pos[0] < str.length(). On 1325 * OUTPUT, the position after the last parsed character. 1326 * @return the Unicode identifier, or null if there is no valid 1327 * identifier at pos[0]. 1328 */ 1329 public static String parseUnicodeIdentifier(String str, int[] pos) { 1330 // assert(pos[0] < str.length()); 1331 StringBuilder buf = new StringBuilder(); 1332 int p = pos[0]; 1333 while (p < str.length()) { 1334 int ch = Character.codePointAt(str, p); 1335 if (buf.length() == 0) { 1336 if (UCharacter.isUnicodeIdentifierStart(ch)) { 1337 buf.appendCodePoint(ch); 1338 } else { 1339 return null; 1340 } 1341 } else { 1342 if (UCharacter.isUnicodeIdentifierPart(ch)) { 1343 buf.appendCodePoint(ch); 1344 } else { 1345 break; 1346 } 1347 } 1348 p += UTF16.getCharCount(ch); 1349 } 1350 pos[0] = p; 1351 return buf.toString(); 1352 } 1353 1354 static final char DIGITS[] = { 1355 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 1356 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 1357 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 1358 'U', 'V', 'W', 'X', 'Y', 'Z' 1359 }; 1360 1361 /** 1362 * Append the digits of a positive integer to the given 1363 * <code>Appendable</code> in the given radix. This is 1364 * done recursively since it is easiest to generate the low- 1365 * order digit first, but it must be appended last. 1366 * 1367 * @param result is the <code>Appendable</code> to append to 1368 * @param n is the positive integer 1369 * @param radix is the radix, from 2 to 36 inclusive 1370 * @param minDigits is the minimum number of digits to append. 1371 */ 1372 private static <T extends Appendable> void recursiveAppendNumber(T result, int n, 1373 int radix, int minDigits) 1374 { 1375 try { 1376 int digit = n % radix; 1377 1378 if (n >= radix || minDigits > 1) { 1379 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 1380 } 1381 result.append(DIGITS[digit]); 1382 } catch (IOException e) { 1383 throw new IllegalIcuArgumentException(e); 1384 } 1385 } 1386 1387 /** 1388 * Append a number to the given Appendable in the given radix. 1389 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 1390 * radices 11 through 36. 1391 * @param result the digits of the number are appended here 1392 * @param n the number to be converted to digits; may be negative. 1393 * If negative, a '-' is prepended to the digits. 1394 * @param radix a radix from 2 to 36 inclusive. 1395 * @param minDigits the minimum number of digits, not including 1396 * any '-', to produce. Values less than 2 have no effect. One 1397 * digit is always emitted regardless of this parameter. 1398 * @return a reference to result 1399 */ 1400 public static <T extends Appendable> T appendNumber(T result, int n, 1401 int radix, int minDigits) 1402 { 1403 try { 1404 if (radix < 2 || radix > 36) { 1405 throw new IllegalArgumentException("Illegal radix " + radix); 1406 } 1407 1408 1409 int abs = n; 1410 1411 if (n < 0) { 1412 abs = -n; 1413 result.append("-"); 1414 } 1415 1416 recursiveAppendNumber(result, abs, radix, minDigits); 1417 1418 return result; 1419 } catch (IOException e) { 1420 throw new IllegalIcuArgumentException(e); 1421 } 1422 1423 } 1424 1425 /** 1426 * Parse an unsigned 31-bit integer at the given offset. Use 1427 * UCharacter.digit() to parse individual characters into digits. 1428 * @param text the text to be parsed 1429 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1430 * offset within text at which to start parsing; it should point 1431 * to a valid digit. On exit, pos[0] is the offset after the last 1432 * parsed character. If the parse failed, it will be unchanged on 1433 * exit. Must be >= 0 on entry. 1434 * @param radix the radix in which to parse; must be >= 2 and <= 1435 * 36. 1436 * @return a non-negative parsed number, or -1 upon parse failure. 1437 * Parse fails if there are no digits, that is, if pos[0] does not 1438 * point to a valid digit on entry, or if the number to be parsed 1439 * does not fit into a 31-bit unsigned integer. 1440 */ 1441 public static int parseNumber(String text, int[] pos, int radix) { 1442 // assert(pos[0] >= 0); 1443 // assert(radix >= 2); 1444 // assert(radix <= 36); 1445 int n = 0; 1446 int p = pos[0]; 1447 while (p < text.length()) { 1448 int ch = Character.codePointAt(text, p); 1449 int d = UCharacter.digit(ch, radix); 1450 if (d < 0) { 1451 break; 1452 } 1453 n = radix*n + d; 1454 // ASSUME that when a 32-bit integer overflows it becomes 1455 // negative. E.g., 214748364 * 10 + 8 => negative value. 1456 if (n < 0) { 1457 return -1; 1458 } 1459 ++p; 1460 } 1461 if (p == pos[0]) { 1462 return -1; 1463 } 1464 pos[0] = p; 1465 return n; 1466 } 1467 1468 /** 1469 * Return true if the character is NOT printable ASCII. The tab, 1470 * newline and linefeed characters are considered unprintable. 1471 */ 1472 public static boolean isUnprintable(int c) { 1473 //0x20 = 32 and 0x7E = 126 1474 return !(c >= 0x20 && c <= 0x7E); 1475 } 1476 1477 /** 1478 * Escape unprintable characters using <backslash>uxxxx notation 1479 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and 1480 * above. If the character is printable ASCII, then do nothing 1481 * and return FALSE. Otherwise, append the escaped notation and 1482 * return TRUE. 1483 */ 1484 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { 1485 try { 1486 if (isUnprintable(c)) { 1487 result.append('\\'); 1488 if ((c & ~0xFFFF) != 0) { 1489 result.append('U'); 1490 result.append(DIGITS[0xF&(c>>28)]); 1491 result.append(DIGITS[0xF&(c>>24)]); 1492 result.append(DIGITS[0xF&(c>>20)]); 1493 result.append(DIGITS[0xF&(c>>16)]); 1494 } else { 1495 result.append('u'); 1496 } 1497 result.append(DIGITS[0xF&(c>>12)]); 1498 result.append(DIGITS[0xF&(c>>8)]); 1499 result.append(DIGITS[0xF&(c>>4)]); 1500 result.append(DIGITS[0xF&c]); 1501 return true; 1502 } 1503 return false; 1504 } catch (IOException e) { 1505 throw new IllegalIcuArgumentException(e); 1506 } 1507 } 1508 1509 /** 1510 * Returns the index of the first character in a set, ignoring quoted text. 1511 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1512 * found by a search for "h". Unlike String.indexOf(), this method searches 1513 * not for a single character, but for any character of the string 1514 * <code>setOfChars</code>. 1515 * @param text text to be searched 1516 * @param start the beginning index, inclusive; <code>0 <= start 1517 * <= limit</code>. 1518 * @param limit the ending index, exclusive; <code>start <= limit 1519 * <= text.length()</code>. 1520 * @param setOfChars string with one or more distinct characters 1521 * @return Offset of the first character in <code>setOfChars</code> 1522 * found, or -1 if not found. 1523 * @see String#indexOf 1524 */ 1525 public static int quotedIndexOf(String text, int start, int limit, 1526 String setOfChars) { 1527 for (int i=start; i<limit; ++i) { 1528 char c = text.charAt(i); 1529 if (c == BACKSLASH) { 1530 ++i; 1531 } else if (c == APOSTROPHE) { 1532 while (++i < limit 1533 && text.charAt(i) != APOSTROPHE) {} 1534 } else if (setOfChars.indexOf(c) >= 0) { 1535 return i; 1536 } 1537 } 1538 return -1; 1539 } 1540 1541 /** 1542 * Append a character to a rule that is being built up. To flush 1543 * the quoteBuf to rule, make one final call with isLiteral == true. 1544 * If there is no final character, pass in (int)-1 as c. 1545 * @param rule the string to append the character to 1546 * @param c the character to append, or (int)-1 if none. 1547 * @param isLiteral if true, then the given character should not be 1548 * quoted or escaped. Usually this means it is a syntactic element 1549 * such as > or $ 1550 * @param escapeUnprintable if true, then unprintable characters 1551 * should be escaped using escapeUnprintable(). These escapes will 1552 * appear outside of quotes. 1553 * @param quoteBuf a buffer which is used to build up quoted 1554 * substrings. The caller should initially supply an empty buffer, 1555 * and thereafter should not modify the buffer. The buffer should be 1556 * cleared out by, at the end, calling this method with a literal 1557 * character (which may be -1). 1558 */ 1559 public static void appendToRule(StringBuffer rule, 1560 int c, 1561 boolean isLiteral, 1562 boolean escapeUnprintable, 1563 StringBuffer quoteBuf) { 1564 // If we are escaping unprintables, then escape them outside 1565 // quotes. \\u and \\U are not recognized within quotes. The same 1566 // logic applies to literals, but literals are never escaped. 1567 if (isLiteral || 1568 (escapeUnprintable && Utility.isUnprintable(c))) { 1569 if (quoteBuf.length() > 0) { 1570 // We prefer backslash APOSTROPHE to double APOSTROPHE 1571 // (more readable, less similar to ") so if there are 1572 // double APOSTROPHEs at the ends, we pull them outside 1573 // of the quote. 1574 1575 // If the first thing in the quoteBuf is APOSTROPHE 1576 // (doubled) then pull it out. 1577 while (quoteBuf.length() >= 2 && 1578 quoteBuf.charAt(0) == APOSTROPHE && 1579 quoteBuf.charAt(1) == APOSTROPHE) { 1580 rule.append(BACKSLASH).append(APOSTROPHE); 1581 quoteBuf.delete(0, 2); 1582 } 1583 // If the last thing in the quoteBuf is APOSTROPHE 1584 // (doubled) then remove and count it and add it after. 1585 int trailingCount = 0; 1586 while (quoteBuf.length() >= 2 && 1587 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && 1588 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { 1589 quoteBuf.setLength(quoteBuf.length()-2); 1590 ++trailingCount; 1591 } 1592 if (quoteBuf.length() > 0) { 1593 rule.append(APOSTROPHE); 1594 rule.append(quoteBuf); 1595 rule.append(APOSTROPHE); 1596 quoteBuf.setLength(0); 1597 } 1598 while (trailingCount-- > 0) { 1599 rule.append(BACKSLASH).append(APOSTROPHE); 1600 } 1601 } 1602 if (c != -1) { 1603 /* Since spaces are ignored during parsing, they are 1604 * emitted only for readability. We emit one here 1605 * only if there isn't already one at the end of the 1606 * rule. 1607 */ 1608 if (c == ' ') { 1609 int len = rule.length(); 1610 if (len > 0 && rule.charAt(len-1) != ' ') { 1611 rule.append(' '); 1612 } 1613 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) { 1614 rule.appendCodePoint(c); 1615 } 1616 } 1617 } 1618 1619 // Escape ' and '\' and don't begin a quote just for them 1620 else if (quoteBuf.length() == 0 && 1621 (c == APOSTROPHE || c == BACKSLASH)) { 1622 rule.append(BACKSLASH).append((char)c); 1623 } 1624 1625 // Specials (printable ascii that isn't [0-9a-zA-Z]) and 1626 // whitespace need quoting. Also append stuff to quotes if we are 1627 // building up a quoted substring already. 1628 else if (quoteBuf.length() > 0 || 1629 (c >= 0x0021 && c <= 0x007E && 1630 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 1631 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 1632 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || 1633 PatternProps.isWhiteSpace(c)) { 1634 quoteBuf.appendCodePoint(c); 1635 // Double ' within a quote 1636 if (c == APOSTROPHE) { 1637 quoteBuf.append((char)c); 1638 } 1639 } 1640 1641 // Otherwise just append 1642 else { 1643 rule.appendCodePoint(c); 1644 } 1645 } 1646 1647 /** 1648 * Append the given string to the rule. Calls the single-character 1649 * version of appendToRule for each character. 1650 */ 1651 public static void appendToRule(StringBuffer rule, 1652 String text, 1653 boolean isLiteral, 1654 boolean escapeUnprintable, 1655 StringBuffer quoteBuf) { 1656 for (int i=0; i<text.length(); ++i) { 1657 // Okay to process in 16-bit code units here 1658 appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf); 1659 } 1660 } 1661 1662 /** 1663 * Given a matcher reference, which may be null, append its 1664 * pattern as a literal to the given rule. 1665 */ 1666 public static void appendToRule(StringBuffer rule, 1667 UnicodeMatcher matcher, 1668 boolean escapeUnprintable, 1669 StringBuffer quoteBuf) { 1670 if (matcher != null) { 1671 appendToRule(rule, matcher.toPattern(escapeUnprintable), 1672 true, escapeUnprintable, quoteBuf); 1673 } 1674 } 1675 1676 /** 1677 * Compares 2 unsigned integers 1678 * @param source 32 bit unsigned integer 1679 * @param target 32 bit unsigned integer 1680 * @return 0 if equals, 1 if source is greater than target and -1 1681 * otherwise 1682 */ 1683 public static final int compareUnsigned(int source, int target) 1684 { 1685 source += MAGIC_UNSIGNED; 1686 target += MAGIC_UNSIGNED; 1687 if (source < target) { 1688 return -1; 1689 } 1690 else if (source > target) { 1691 return 1; 1692 } 1693 return 0; 1694 } 1695 1696 /** 1697 * Find the highest bit in a positive integer. This is done 1698 * by doing a binary search through the bits. 1699 * 1700 * @param n is the integer 1701 * 1702 * @return the bit number of the highest bit, with 0 being 1703 * the low order bit, or -1 if <code>n</code> is not positive 1704 */ 1705 public static final byte highBit(int n) 1706 { 1707 if (n <= 0) { 1708 return -1; 1709 } 1710 1711 byte bit = 0; 1712 1713 if (n >= 1 << 16) { 1714 n >>= 16; 1715 bit += 16; 1716 } 1717 1718 if (n >= 1 << 8) { 1719 n >>= 8; 1720 bit += 8; 1721 } 1722 1723 if (n >= 1 << 4) { 1724 n >>= 4; 1725 bit += 4; 1726 } 1727 1728 if (n >= 1 << 2) { 1729 n >>= 2; 1730 bit += 2; 1731 } 1732 1733 if (n >= 1 << 1) { 1734 n >>= 1; 1735 bit += 1; 1736 } 1737 1738 return bit; 1739 } 1740 /** 1741 * Utility method to take a int[] containing codepoints and return 1742 * a string representation with code units. 1743 */ 1744 public static String valueOf(int[]source){ 1745 // TODO: Investigate why this method is not on UTF16 class 1746 StringBuilder result = new StringBuilder(source.length); 1747 for(int i=0; i<source.length; i++){ 1748 result.appendCodePoint(source[i]); 1749 } 1750 return result.toString(); 1751 } 1752 1753 1754 /** 1755 * Utility to duplicate a string count times 1756 * @param s String to be duplicated. 1757 * @param count Number of times to duplicate a string. 1758 */ 1759 public static String repeat(String s, int count) { 1760 if (count <= 0) return ""; 1761 if (count == 1) return s; 1762 StringBuilder result = new StringBuilder(); 1763 for (int i = 0; i < count; ++i) { 1764 result.append(s); 1765 } 1766 return result.toString(); 1767 } 1768 1769 public static String[] splitString(String src, String target) { 1770 return src.split("\\Q" + target + "\\E"); 1771 } 1772 1773 /** 1774 * Split the string at runs of ascii whitespace characters. 1775 */ 1776 public static String[] splitWhitespace(String src) { 1777 return src.split("\\s+"); 1778 } 1779 1780 /** 1781 * Parse a list of hex numbers and return a string 1782 * @param string String of hex numbers. 1783 * @param minLength Minimal length. 1784 * @param separator Separator. 1785 * @return A string from hex numbers. 1786 */ 1787 public static String fromHex(String string, int minLength, String separator) { 1788 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+")); 1789 } 1790 1791 /** 1792 * Parse a list of hex numbers and return a string 1793 * @param string String of hex numbers. 1794 * @param minLength Minimal length. 1795 * @param separator Separator. 1796 * @return A string from hex numbers. 1797 */ 1798 public static String fromHex(String string, int minLength, Pattern separator) { 1799 StringBuilder buffer = new StringBuilder(); 1800 String[] parts = separator.split(string); 1801 for (String part : parts) { 1802 if (part.length() < minLength) { 1803 throw new IllegalArgumentException("code point too short: " + part); 1804 } 1805 int cp = Integer.parseInt(part, 16); 1806 buffer.appendCodePoint(cp); 1807 } 1808 return buffer.toString(); 1809 } 1810} 1811