1/* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 */ 9 10package com.ibm.icu.lang; 11 12import com.ibm.icu.text.UTF16; 13 14/** 15 * <code>UScriptRun</code> is used to find runs of characters in 16 * the same script, as defined in the <code>UScript</code> class. 17 * It implements a simple iterator over an array of characters. 18 * The iterator will assign <code>COMMON</code> and <code>INHERITED</code> 19 * characters to the same script as the preceeding characters. If the 20 * COMMON and INHERITED characters are first, they will be assigned to 21 * the same script as the following characters. 22 * 23 * The iterator will try to match paired punctuation. If it sees an 24 * opening punctuation character, it will remember the script that 25 * was assigned to that character, and assign the same script to the 26 * matching closing punctuation. 27 * 28 * No attempt is made to combine related scripts into a single run. In 29 * particular, Hiragana, Katakana, and Han characters will appear in separate 30 * runs. 31 32 * Here is an example of how to iterate over script runs: 33 * <pre> 34 * void printScriptRuns(char[] text) 35 * { 36 * UScriptRun scriptRun = new UScriptRun(text); 37 * 38 * while (scriptRun.next()) { 39 * int start = scriptRun.getScriptStart(); 40 * int limit = scriptRun.getScriptLimit(); 41 * int script = scriptRun.getScriptCode(); 42 * 43 * System.out.println("Script \"" + UScript.getName(script) + "\" from " + 44 * start + " to " + limit + "."); 45 * } 46 * } 47 * </pre> 48 * 49 * @internal 50 * @deprecated This API is ICU internal only. 51 */ 52@Deprecated 53public final class UScriptRun 54{ 55 /** 56 * Construct an empty <code>UScriptRun</code> object. The <code>next()</code> 57 * method will return <code>false</code> the first time it is called. 58 * 59 * @internal 60 * @deprecated This API is ICU internal only. 61 */ 62 @Deprecated 63 public UScriptRun() 64 { 65 char[] nullChars = null; 66 67 reset(nullChars, 0, 0); 68 } 69 70 /** 71 * Construct a <code>UScriptRun</code> object which iterates over the 72 * characters in the given string. 73 * 74 * @param text the string of characters over which to iterate. 75 * 76 * @internal 77 * @deprecated This API is ICU internal only. 78 */ 79 @Deprecated 80 public UScriptRun(String text) 81 { 82 reset (text); 83 } 84 85 /** 86 * Construct a <code>UScriptRun</code> object which iterates over a subrange 87 * of the characetrs in the given string. 88 * 89 * @param text the string of characters over which to iterate. 90 * @param start the index of the first character over which to iterate 91 * @param count the number of characters over which to iterate 92 * 93 * @internal 94 * @deprecated This API is ICU internal only. 95 */ 96 @Deprecated 97 public UScriptRun(String text, int start, int count) 98 { 99 reset(text, start, count); 100 } 101 102 /** 103 * Construct a <code>UScriptRun</code> object which iterates over the given 104 * characetrs. 105 * 106 * @param chars the array of characters over which to iterate. 107 * 108 * @internal 109 * @deprecated This API is ICU internal only. 110 */ 111 @Deprecated 112 public UScriptRun(char[] chars) 113 { 114 reset(chars); 115 } 116 117 /** 118 * Construct a <code>UScriptRun</code> object which iterates over a subrange 119 * of the given characetrs. 120 * 121 * @param chars the array of characters over which to iterate. 122 * @param start the index of the first character over which to iterate 123 * @param count the number of characters over which to iterate 124 * 125 * @internal 126 * @deprecated This API is ICU internal only. 127 */ 128 @Deprecated 129 public UScriptRun(char[] chars, int start, int count) 130 { 131 reset(chars, start, count); 132 } 133 134 135 /** 136 * Reset the iterator to the start of the text. 137 * 138 * @internal 139 * @deprecated This API is ICU internal only. 140 */ 141 @Deprecated 142 public final void reset() 143 { 144 // empty any old parenStack contents. 145 // NOTE: this is not the most efficient way 146 // to do this, but it's the easiest to write... 147 while (stackIsNotEmpty()) { 148 pop(); 149 } 150 151 scriptStart = textStart; 152 scriptLimit = textStart; 153 scriptCode = UScript.INVALID_CODE; 154 parenSP = -1; 155 pushCount = 0; 156 fixupCount = 0; 157 158 textIndex = textStart; 159 } 160 161 /** 162 * Reset the iterator to iterate over the given range of the text. Throws 163 * IllegalArgumentException if the range is outside of the bounds of the 164 * character array. 165 * 166 * @param start the index of the new first character over which to iterate 167 * @param count the new number of characters over which to iterate. 168 * @exception IllegalArgumentException If invalid arguments are passed. 169 * 170 * @internal 171 * @deprecated This API is ICU internal only. 172 */ 173 @Deprecated 174 public final void reset(int start, int count) 175 throws IllegalArgumentException 176 { 177 int len = 0; 178 179 if (text != null) { 180 len = text.length; 181 } 182 183 if (start < 0 || count < 0 || start > len - count) { 184 throw new IllegalArgumentException(); 185 } 186 187 textStart = start; 188 textLimit = start + count; 189 190 reset(); 191 } 192 193 /** 194 * Reset the iterator to iterate over <code>count</code> characters 195 * in <code>chars</code> starting at <code>start</code>. This allows 196 * clients to reuse an iterator. 197 * 198 * @param chars the new array of characters over which to iterate. 199 * @param start the index of the first character over which to iterate. 200 * @param count the number of characters over which to iterate. 201 * 202 * @internal 203 * @deprecated This API is ICU internal only. 204 */ 205 @Deprecated 206 public final void reset(char[] chars, int start, int count) 207 { 208 if (chars == null) { 209 chars = emptyCharArray; 210 } 211 212 text = chars; 213 214 reset(start, count); 215 } 216 217 /** 218 * Reset the iterator to iterate over the characters 219 * in <code>chars</code>. This allows clients to reuse an iterator. 220 * 221 * @param chars the new array of characters over which to iterate. 222 * 223 * @internal 224 * @deprecated This API is ICU internal only. 225 */ 226 @Deprecated 227 public final void reset(char[] chars) 228 { 229 int length = 0; 230 231 if (chars != null) { 232 length = chars.length; 233 } 234 235 reset(chars, 0, length); 236 } 237 238 /** 239 * Reset the iterator to iterate over <code>count</code> characters 240 * in <code>text</code> starting at <code>start</code>. This allows 241 * clients to reuse an iterator. 242 * 243 * @param str the new string of characters over which to iterate. 244 * @param start the index of the first character over which to iterate. 245 * @param count the nuber of characters over which to iterate. 246 * 247 * @internal 248 * @deprecated This API is ICU internal only. 249 */ 250 @Deprecated 251 public final void reset(String str, int start, int count) 252 { 253 char[] chars = null; 254 255 if (str != null) { 256 chars = str.toCharArray(); 257 } 258 259 reset(chars, start, count); 260 } 261 262 /** 263 * Reset the iterator to iterate over the characters 264 * in <code>text</code>. This allows clients to reuse an iterator. 265 * 266 * @param str the new string of characters over which to iterate. 267 * 268 * @internal 269 * @deprecated This API is ICU internal only. 270 */ 271 @Deprecated 272 public final void reset(String str) 273 { 274 int length = 0; 275 276 if (str != null) { 277 length = str.length(); 278 } 279 280 reset(str, 0, length); 281 } 282 283 284 285 /** 286 * Get the starting index of the current script run. 287 * 288 * @return the index of the first character in the current script run. 289 * 290 * @internal 291 * @deprecated This API is ICU internal only. 292 */ 293 @Deprecated 294 public final int getScriptStart() 295 { 296 return scriptStart; 297 } 298 299 /** 300 * Get the index of the first character after the current script run. 301 * 302 * @return the index of the first character after the current script run. 303 * 304 * @internal 305 * @deprecated This API is ICU internal only. 306 */ 307 @Deprecated 308 public final int getScriptLimit() 309 { 310 return scriptLimit; 311 } 312 313 /** 314 * Get the script code for the script of the current script run. 315 * 316 * @return the script code for the script of the current script run. 317 * @see com.ibm.icu.lang.UScript 318 * 319 * @internal 320 * @deprecated This API is ICU internal only. 321 */ 322 @Deprecated 323 public final int getScriptCode() 324 { 325 return scriptCode; 326 } 327 328 /** 329 * Find the next script run. Returns <code>false</code> if there 330 * isn't another run, returns <code>true</code> if there is. 331 * 332 * @return <code>false</code> if there isn't another run, <code>true</code> if there is. 333 * 334 * @internal 335 * @deprecated This API is ICU internal only. 336 */ 337 @Deprecated 338 public final boolean next() 339 { 340 // if we've fallen off the end of the text, we're done 341 if (scriptLimit >= textLimit) { 342 return false; 343 } 344 345 scriptCode = UScript.COMMON; 346 scriptStart = scriptLimit; 347 348 syncFixup(); 349 350 while (textIndex < textLimit) { 351 int ch = UTF16.charAt(text, textStart, textLimit, textIndex - textStart); 352 int codePointCount = UTF16.getCharCount(ch); 353 int sc = UScript.getScript(ch); 354 int pairIndex = getPairIndex(ch); 355 356 textIndex += codePointCount; 357 358 // Paired character handling: 359 // 360 // if it's an open character, push it onto the stack. 361 // if it's a close character, find the matching open on the 362 // stack, and use that script code. Any non-matching open 363 // characters above it on the stack will be poped. 364 if (pairIndex >= 0) { 365 if ((pairIndex & 1) == 0) { 366 push(pairIndex, scriptCode); 367 } else { 368 int pi = pairIndex & ~1; 369 370 while (stackIsNotEmpty() && top().pairIndex != pi) { 371 pop(); 372 } 373 374 if (stackIsNotEmpty()) { 375 sc = top().scriptCode; 376 } 377 } 378 } 379 380 if (sameScript(scriptCode, sc)) { 381 if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { 382 scriptCode = sc; 383 384 fixup(scriptCode); 385 } 386 387 // if this character is a close paired character, 388 // pop the matching open character from the stack 389 if (pairIndex >= 0 && (pairIndex & 1) != 0) { 390 pop(); 391 } 392 } else { 393 // We've just seen the first character of 394 // the next run. Back over it so we'll see 395 // it again the next time. 396 textIndex -= codePointCount; 397 break; 398 } 399 } 400 401 scriptLimit = textIndex; 402 return true; 403 } 404 405 /** 406 * Compare two script codes to see if they are in the same script. If one script is 407 * a strong script, and the other is INHERITED or COMMON, it will compare equal. 408 * 409 * @param scriptOne one of the script codes. 410 * @param scriptTwo the other script code. 411 * @return <code>true</code> if the two scripts are the same. 412 * @see com.ibm.icu.lang.UScript 413 */ 414 private static boolean sameScript(int scriptOne, int scriptTwo) 415 { 416 return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo; 417 } 418 419 /* 420 * An internal class which holds entries on the paren stack. 421 */ 422 private static final class ParenStackEntry 423 { 424 int pairIndex; 425 int scriptCode; 426 427 public ParenStackEntry(int thePairIndex, int theScriptCode) 428 { 429 pairIndex = thePairIndex; 430 scriptCode = theScriptCode; 431 } 432 } 433 434 private static final int mod(int sp) 435 { 436 return sp % PAREN_STACK_DEPTH; 437 } 438 439 private static final int inc(int sp, int count) 440 { 441 return mod(sp + count); 442 } 443 444 private static final int inc(int sp) 445 { 446 return inc(sp, 1); 447 } 448 449 private static final int dec(int sp, int count) 450 { 451 return mod(sp + PAREN_STACK_DEPTH - count); 452 } 453 454 private static final int dec(int sp) 455 { 456 return dec(sp, 1); 457 } 458 459 private static final int limitInc(int count) 460 { 461 if (count < PAREN_STACK_DEPTH) { 462 count += 1; 463 } 464 465 return count; 466 } 467 468 private final boolean stackIsEmpty() 469 { 470 return pushCount <= 0; 471 } 472 473 private final boolean stackIsNotEmpty() 474 { 475 return ! stackIsEmpty(); 476 } 477 478 private final void push(int pairIndex, int scrptCode) 479 { 480 pushCount = limitInc(pushCount); 481 fixupCount = limitInc(fixupCount); 482 483 parenSP = inc(parenSP); 484 parenStack[parenSP] = new ParenStackEntry(pairIndex, scrptCode); 485 } 486 487 private final void pop() 488 { 489 490 if (stackIsEmpty()) { 491 return; 492 } 493 494 parenStack[parenSP] = null; 495 496 if (fixupCount > 0) { 497 fixupCount -= 1; 498 } 499 500 pushCount -= 1; 501 parenSP = dec(parenSP); 502 503 // If the stack is now empty, reset the stack 504 // pointers to their initial values. 505 if (stackIsEmpty()) { 506 parenSP = -1; 507 } 508 } 509 510 private final ParenStackEntry top() 511 { 512 return parenStack[parenSP]; 513 } 514 515 private final void syncFixup() 516 { 517 fixupCount = 0; 518 } 519 520 private final void fixup(int scrptCode) 521 { 522 int fixupSP = dec(parenSP, fixupCount); 523 524 while (fixupCount-- > 0) { 525 fixupSP = inc(fixupSP); 526 parenStack[fixupSP].scriptCode = scrptCode; 527 } 528 } 529 530 private char[] emptyCharArray = {}; 531 532 private char[] text; 533 534 private int textIndex; 535 private int textStart; 536 private int textLimit; 537 538 private int scriptStart; 539 private int scriptLimit; 540 private int scriptCode; 541 542 private static int PAREN_STACK_DEPTH = 32; 543 private static ParenStackEntry parenStack[] = new ParenStackEntry[PAREN_STACK_DEPTH]; 544 private int parenSP = -1; 545 private int pushCount = 0; 546 private int fixupCount = 0; 547 548 /** 549 * Find the highest bit that's set in a word. Uses a binary search through 550 * the bits. 551 * 552 * @param n the word in which to find the highest bit that's set. 553 * @return the bit number (counting from the low order bit) of the highest bit. 554 */ 555 private static final byte highBit(int n) 556 { 557 if (n <= 0) { 558 return -32; 559 } 560 561 byte bit = 0; 562 563 if (n >= 1 << 16) { 564 n >>= 16; 565 bit += 16; 566 } 567 568 if (n >= 1 << 8) { 569 n >>= 8; 570 bit += 8; 571 } 572 573 if (n >= 1 << 4) { 574 n >>= 4; 575 bit += 4; 576 } 577 578 if (n >= 1 << 2) { 579 n >>= 2; 580 bit += 2; 581 } 582 583 if (n >= 1 << 1) { 584 n >>= 1; 585 bit += 1; 586 } 587 588 return bit; 589 } 590 591 /** 592 * Search the pairedChars array for the given character. 593 * 594 * @param ch the character for which to search. 595 * @return the index of the character in the table, or -1 if it's not there. 596 */ 597 private static int getPairIndex(int ch) 598 { 599 int probe = pairedCharPower; 600 int index = 0; 601 602 if (ch >= pairedChars[pairedCharExtra]) { 603 index = pairedCharExtra; 604 } 605 606 while (probe > (1 << 0)) { 607 probe >>= 1; 608 609 if (ch >= pairedChars[index + probe]) { 610 index += probe; 611 } 612 } 613 614 if (pairedChars[index] != ch) { 615 index = -1; 616 } 617 618 return index; 619 } 620 621 private static int pairedChars[] = { 622 0x0028, 0x0029, // ascii paired punctuation 623 0x003c, 0x003e, 624 0x005b, 0x005d, 625 0x007b, 0x007d, 626 0x00ab, 0x00bb, // guillemets 627 0x2018, 0x2019, // general punctuation 628 0x201c, 0x201d, 629 0x2039, 0x203a, 630 0x3008, 0x3009, // chinese paired punctuation 631 0x300a, 0x300b, 632 0x300c, 0x300d, 633 0x300e, 0x300f, 634 0x3010, 0x3011, 635 0x3014, 0x3015, 636 0x3016, 0x3017, 637 0x3018, 0x3019, 638 0x301a, 0x301b 639 }; 640 641 private static int pairedCharPower = 1 << highBit(pairedChars.length); 642 private static int pairedCharExtra = pairedChars.length - pairedCharPower; 643} 644 645