1// © 2017 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3package com.ibm.icu.text; 4 5import java.nio.BufferOverflowException; 6import java.util.Arrays; 7 8/** 9 * Records lengths of string edits but not replacement text. 10 * Supports replacements, insertions, deletions in linear progression. 11 * Does not support moving/reordering of text. 12 * 13 * @draft ICU 59 14 * @provisional This API might change or be removed in a future release. 15 */ 16public final class Edits { 17 // 0000uuuuuuuuuuuu records u+1 unchanged text units. 18 private static final int MAX_UNCHANGED_LENGTH = 0x1000; 19 private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; 20 21 // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units. 22 private static final int MAX_SHORT_CHANGE_OLD_LENGTH = 6; 23 private static final int MAX_SHORT_CHANGE_NEW_LENGTH = 7; 24 private static final int SHORT_CHANGE_NUM_MASK = 0x1ff; 25 private static final int MAX_SHORT_CHANGE = 0x6fff; 26 27 // 0111mmmmmmnnnnnn records a replacement of m text units with n. 28 // m or n = 61: actual length follows in the next edits array unit. 29 // m or n = 62..63: actual length follows in the next two edits array units. 30 // Bit 30 of the actual length is in the head unit. 31 // Trailing units have bit 15 set. 32 private static final int LENGTH_IN_1TRAIL = 61; 33 private static final int LENGTH_IN_2TRAIL = 62; 34 35 private static final int STACK_CAPACITY = 100; 36 private char[] array; 37 private int length; 38 private int delta; 39 private int numChanges; 40 41 /** 42 * Constructs an empty object. 43 * @draft ICU 59 44 * @provisional This API might change or be removed in a future release. 45 */ 46 public Edits() { 47 array = new char[STACK_CAPACITY]; 48 } 49 50 /** 51 * Resets the data but may not release memory. 52 * @draft ICU 59 53 * @provisional This API might change or be removed in a future release. 54 */ 55 public void reset() { 56 length = delta = numChanges = 0; 57 } 58 59 private void setLastUnit(int last) { 60 array[length - 1] = (char)last; 61 } 62 private int lastUnit() { 63 return length > 0 ? array[length - 1] : 0xffff; 64 } 65 66 /** 67 * Adds a record for an unchanged segment of text. 68 * Normally called from inside ICU string transformation functions, not user code. 69 * @draft ICU 59 70 * @provisional This API might change or be removed in a future release. 71 */ 72 public void addUnchanged(int unchangedLength) { 73 if(unchangedLength < 0) { 74 throw new IllegalArgumentException( 75 "addUnchanged(" + unchangedLength + "): length must not be negative"); 76 } 77 // Merge into previous unchanged-text record, if any. 78 int last = lastUnit(); 79 if(last < MAX_UNCHANGED) { 80 int remaining = MAX_UNCHANGED - last; 81 if (remaining >= unchangedLength) { 82 setLastUnit(last + unchangedLength); 83 return; 84 } 85 setLastUnit(MAX_UNCHANGED); 86 unchangedLength -= remaining; 87 } 88 // Split large lengths into multiple units. 89 while(unchangedLength >= MAX_UNCHANGED_LENGTH) { 90 append(MAX_UNCHANGED); 91 unchangedLength -= MAX_UNCHANGED_LENGTH; 92 } 93 // Write a small (remaining) length. 94 if(unchangedLength > 0) { 95 append(unchangedLength - 1); 96 } 97 } 98 99 /** 100 * Adds a record for a text replacement/insertion/deletion. 101 * Normally called from inside ICU string transformation functions, not user code. 102 * @draft ICU 59 103 * @provisional This API might change or be removed in a future release. 104 */ 105 public void addReplace(int oldLength, int newLength) { 106 if(oldLength < 0 || newLength < 0) { 107 throw new IllegalArgumentException( 108 "addReplace(" + oldLength + ", " + newLength + 109 "): both lengths must be non-negative"); 110 } 111 if (oldLength == 0 && newLength == 0) { 112 return; 113 } 114 ++numChanges; 115 int newDelta = newLength - oldLength; 116 if (newDelta != 0) { 117 if ((newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - delta)) || 118 (newDelta < 0 && delta < 0 && newDelta < (Integer.MIN_VALUE - delta))) { 119 // Integer overflow or underflow. 120 throw new IndexOutOfBoundsException(); 121 } 122 delta += newDelta; 123 } 124 125 if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH && 126 newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) { 127 // Merge into previous same-lengths short-replacement record, if any. 128 int u = (oldLength << 12) | (newLength << 9); 129 int last = lastUnit(); 130 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && 131 (last & ~SHORT_CHANGE_NUM_MASK) == u && 132 (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) { 133 setLastUnit(last + 1); 134 return; 135 } 136 append(u); 137 return; 138 } 139 140 int head = 0x7000; 141 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { 142 head |= oldLength << 6; 143 head |= newLength; 144 append(head); 145 } else if ((array.length - length) >= 5 || growArray()) { 146 int limit = length + 1; 147 if(oldLength < LENGTH_IN_1TRAIL) { 148 head |= oldLength << 6; 149 } else if(oldLength <= 0x7fff) { 150 head |= LENGTH_IN_1TRAIL << 6; 151 array[limit++] = (char)(0x8000 | oldLength); 152 } else { 153 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; 154 array[limit++] = (char)(0x8000 | (oldLength >> 15)); 155 array[limit++] = (char)(0x8000 | oldLength); 156 } 157 if(newLength < LENGTH_IN_1TRAIL) { 158 head |= newLength; 159 } else if(newLength <= 0x7fff) { 160 head |= LENGTH_IN_1TRAIL; 161 array[limit++] = (char)(0x8000 | newLength); 162 } else { 163 head |= LENGTH_IN_2TRAIL + (newLength >> 30); 164 array[limit++] = (char)(0x8000 | (newLength >> 15)); 165 array[limit++] = (char)(0x8000 | newLength); 166 } 167 array[length] = (char)head; 168 length = limit; 169 } 170 } 171 172 private void append(int r) { 173 if(length < array.length || growArray()) { 174 array[length++] = (char)r; 175 } 176 } 177 178 private boolean growArray() { 179 int newCapacity; 180 if (array.length == STACK_CAPACITY) { 181 newCapacity = 2000; 182 } else if (array.length == Integer.MAX_VALUE) { 183 throw new BufferOverflowException(); 184 } else if (array.length >= (Integer.MAX_VALUE / 2)) { 185 newCapacity = Integer.MAX_VALUE; 186 } else { 187 newCapacity = 2 * array.length; 188 } 189 // Grow by at least 5 units so that a maximal change record will fit. 190 if ((newCapacity - array.length) < 5) { 191 throw new BufferOverflowException(); 192 } 193 array = Arrays.copyOf(array, newCapacity); 194 return true; 195 } 196 197 /** 198 * How much longer is the new text compared with the old text? 199 * @return new length minus old length 200 * @draft ICU 59 201 * @provisional This API might change or be removed in a future release. 202 */ 203 public int lengthDelta() { return delta; } 204 /** 205 * @return true if there are any change edits 206 * @draft ICU 59 207 * @provisional This API might change or be removed in a future release. 208 */ 209 public boolean hasChanges() { return numChanges != 0; } 210 211 /** 212 * @return the number of change edits 213 * @draft ICU 60 214 * @provisional This API might change or be removed in a future release. 215 */ 216 public int numberOfChanges() { return numChanges; } 217 218 /** 219 * Access to the list of edits. 220 * @see #getCoarseIterator 221 * @see #getFineIterator 222 * @draft ICU 59 223 * @provisional This API might change or be removed in a future release. 224 */ 225 public static final class Iterator { 226 private final char[] array; 227 private int index; 228 private final int length; 229 /** 230 * 0 if we are not within compressed equal-length changes. 231 * Otherwise the number of remaining changes, including the current one. 232 */ 233 private int remaining; 234 private final boolean onlyChanges_, coarse; 235 236 private int dir; // iteration direction: back(<0), initial(0), forward(>0) 237 private boolean changed; 238 private int oldLength_, newLength_; 239 private int srcIndex, replIndex, destIndex; 240 241 private Iterator(char[] a, int len, boolean oc, boolean crs) { 242 array = a; 243 length = len; 244 onlyChanges_ = oc; 245 coarse = crs; 246 } 247 248 private int readLength(int head) { 249 if (head < LENGTH_IN_1TRAIL) { 250 return head; 251 } else if (head < LENGTH_IN_2TRAIL) { 252 assert(index < length); 253 assert(array[index] >= 0x8000); 254 return array[index++] & 0x7fff; 255 } else { 256 assert((index + 2) <= length); 257 assert(array[index] >= 0x8000); 258 assert(array[index + 1] >= 0x8000); 259 int len = ((head & 1) << 30) | 260 ((array[index] & 0x7fff) << 15) | 261 (array[index + 1] & 0x7fff); 262 index += 2; 263 return len; 264 } 265 } 266 267 private void updateNextIndexes() { 268 srcIndex += oldLength_; 269 if (changed) { 270 replIndex += newLength_; 271 } 272 destIndex += newLength_; 273 } 274 275 private void updatePreviousIndexes() { 276 srcIndex -= oldLength_; 277 if (changed) { 278 replIndex -= newLength_; 279 } 280 destIndex -= newLength_; 281 } 282 283 private boolean noNext() { 284 // No change before or beyond the string. 285 dir = 0; 286 changed = false; 287 oldLength_ = newLength_ = 0; 288 return false; 289 } 290 291 /** 292 * Advances to the next edit. 293 * @return true if there is another edit 294 * @draft ICU 59 295 * @provisional This API might change or be removed in a future release. 296 */ 297 public boolean next() { 298 return next(onlyChanges_); 299 } 300 301 private boolean next(boolean onlyChanges) { 302 // Forward iteration: Update the string indexes to the limit of the current span, 303 // and post-increment-read array units to assemble a new span. 304 // Leaves the array index one after the last unit of that span. 305 if (dir > 0) { 306 updateNextIndexes(); 307 } else { 308 if (dir < 0) { 309 // Turn around from previous() to next(). 310 // Post-increment-read the same span again. 311 if (remaining > 0) { 312 // Fine-grained iterator: 313 // Stay on the current one of a sequence of compressed changes. 314 ++index; // next() rests on the index after the sequence unit. 315 dir = 1; 316 return true; 317 } 318 } 319 dir = 1; 320 } 321 if (remaining >= 1) { 322 // Fine-grained iterator: Continue a sequence of compressed changes. 323 if (remaining > 1) { 324 --remaining; 325 return true; 326 } 327 remaining = 0; 328 } 329 if (index >= length) { 330 return noNext(); 331 } 332 int u = array[index++]; 333 if (u <= MAX_UNCHANGED) { 334 // Combine adjacent unchanged ranges. 335 changed = false; 336 oldLength_ = u + 1; 337 while (index < length && (u = array[index]) <= MAX_UNCHANGED) { 338 ++index; 339 oldLength_ += u + 1; 340 } 341 newLength_ = oldLength_; 342 if (onlyChanges) { 343 updateNextIndexes(); 344 if (index >= length) { 345 return noNext(); 346 } 347 // already fetched u > MAX_UNCHANGED at index 348 ++index; 349 } else { 350 return true; 351 } 352 } 353 changed = true; 354 if (u <= MAX_SHORT_CHANGE) { 355 int oldLen = u >> 12; 356 int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 357 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 358 if (coarse) { 359 oldLength_ = num * oldLen; 360 newLength_ = num * newLen; 361 } else { 362 // Split a sequence of changes that was compressed into one unit. 363 oldLength_ = oldLen; 364 newLength_ = newLen; 365 if (num > 1) { 366 remaining = num; // This is the first of two or more changes. 367 } 368 return true; 369 } 370 } else { 371 assert(u <= 0x7fff); 372 oldLength_ = readLength((u >> 6) & 0x3f); 373 newLength_ = readLength(u & 0x3f); 374 if (!coarse) { 375 return true; 376 } 377 } 378 // Combine adjacent changes. 379 while (index < length && (u = array[index]) > MAX_UNCHANGED) { 380 ++index; 381 if (u <= MAX_SHORT_CHANGE) { 382 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 383 oldLength_ += (u >> 12) * num; 384 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 385 } else { 386 assert(u <= 0x7fff); 387 oldLength_ += readLength((u >> 6) & 0x3f); 388 newLength_ += readLength(u & 0x3f); 389 } 390 } 391 return true; 392 } 393 394 private boolean previous() { 395 // Backward iteration: Pre-decrement-read array units to assemble a new span, 396 // then update the string indexes to the start of that span. 397 // Leaves the array index on the head unit of that span. 398 if (dir >= 0) { 399 if (dir > 0) { 400 // Turn around from next() to previous(). 401 // Set the string indexes to the span limit and 402 // pre-decrement-read the same span again. 403 if (remaining > 0) { 404 // Fine-grained iterator: 405 // Stay on the current one of a sequence of compressed changes. 406 --index; // previous() rests on the sequence unit. 407 dir = -1; 408 return true; 409 } 410 updateNextIndexes(); 411 } 412 dir = -1; 413 } 414 if (remaining > 0) { 415 // Fine-grained iterator: Continue a sequence of compressed changes. 416 int u = array[index]; 417 assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 418 if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) { 419 ++remaining; 420 updatePreviousIndexes(); 421 return true; 422 } 423 remaining = 0; 424 } 425 if (index <= 0) { 426 return noNext(); 427 } 428 int u = array[--index]; 429 if (u <= MAX_UNCHANGED) { 430 // Combine adjacent unchanged ranges. 431 changed = false; 432 oldLength_ = u + 1; 433 while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) { 434 --index; 435 oldLength_ += u + 1; 436 } 437 newLength_ = oldLength_; 438 // No need to handle onlyChanges as long as previous() is called only from findIndex(). 439 updatePreviousIndexes(); 440 return true; 441 } 442 changed = true; 443 if (u <= MAX_SHORT_CHANGE) { 444 int oldLen = u >> 12; 445 int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 446 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 447 if (coarse) { 448 oldLength_ = num * oldLen; 449 newLength_ = num * newLen; 450 } else { 451 // Split a sequence of changes that was compressed into one unit. 452 oldLength_ = oldLen; 453 newLength_ = newLen; 454 if (num > 1) { 455 remaining = 1; // This is the last of two or more changes. 456 } 457 updatePreviousIndexes(); 458 return true; 459 } 460 } else { 461 if (u <= 0x7fff) { 462 // The change is encoded in u alone. 463 oldLength_ = readLength((u >> 6) & 0x3f); 464 newLength_ = readLength(u & 0x3f); 465 } else { 466 // Back up to the head of the change, read the lengths, 467 // and reset the index to the head again. 468 assert(index > 0); 469 while ((u = array[--index]) > 0x7fff) {} 470 assert(u > MAX_SHORT_CHANGE); 471 int headIndex = index++; 472 oldLength_ = readLength((u >> 6) & 0x3f); 473 newLength_ = readLength(u & 0x3f); 474 index = headIndex; 475 } 476 if (!coarse) { 477 updatePreviousIndexes(); 478 return true; 479 } 480 } 481 // Combine adjacent changes. 482 while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) { 483 --index; 484 if (u <= MAX_SHORT_CHANGE) { 485 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 486 oldLength_ += (u >> 12) * num; 487 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 488 } else if (u <= 0x7fff) { 489 // Read the lengths, and reset the index to the head again. 490 int headIndex = index++; 491 oldLength_ += readLength((u >> 6) & 0x3f); 492 newLength_ += readLength(u & 0x3f); 493 index = headIndex; 494 } 495 } 496 updatePreviousIndexes(); 497 return true; 498 } 499 500 /** 501 * Finds the edit that contains the source index. 502 * The source index may be found in a non-change 503 * even if normal iteration would skip non-changes. 504 * Normal iteration can continue from a found edit. 505 * 506 * <p>The iterator state before this search logically does not matter. 507 * (It may affect the performance of the search.) 508 * 509 * <p>The iterator state after this search is undefined 510 * if the source index is out of bounds for the source string. 511 * 512 * @param i source index 513 * @return true if the edit for the source index was found 514 * @draft ICU 59 515 * @provisional This API might change or be removed in a future release. 516 */ 517 public boolean findSourceIndex(int i) { 518 return findIndex(i, true) == 0; 519 } 520 521 /** 522 * Finds the edit that contains the destination index. 523 * The destination index may be found in a non-change 524 * even if normal iteration would skip non-changes. 525 * Normal iteration can continue from a found edit. 526 * 527 * <p>The iterator state before this search logically does not matter. 528 * (It may affect the performance of the search.) 529 * 530 * <p>The iterator state after this search is undefined 531 * if the source index is out of bounds for the source string. 532 * 533 * @param i destination index 534 * @return true if the edit for the destination index was found 535 * @draft ICU 60 536 * @provisional This API might change or be removed in a future release. 537 */ 538 public boolean findDestinationIndex(int i) { 539 return findIndex(i, false) == 0; 540 } 541 542 /** @return -1: error or i<0; 0: found; 1: i>=string length */ 543 private int findIndex(int i, boolean findSource) { 544 if (i < 0) { return -1; } 545 int spanStart, spanLength; 546 if (findSource) { // find source index 547 spanStart = srcIndex; 548 spanLength = oldLength_; 549 } else { // find destination index 550 spanStart = destIndex; 551 spanLength = newLength_; 552 } 553 if (i < spanStart) { 554 if (i >= (spanStart / 2)) { 555 // Search backwards. 556 for (;;) { 557 boolean hasPrevious = previous(); 558 assert(hasPrevious); // because i>=0 and the first span starts at 0 559 spanStart = findSource ? srcIndex : destIndex; 560 if (i >= spanStart) { 561 // The index is in the current span. 562 return 0; 563 } 564 if (remaining > 0) { 565 // Is the index in one of the remaining compressed edits? 566 // spanStart is the start of the current span, first of the remaining ones. 567 spanLength = findSource ? oldLength_ : newLength_; 568 int u = array[index]; 569 assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 570 int num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining; 571 int len = num * spanLength; 572 if (i >= (spanStart - len)) { 573 int n = ((spanStart - i - 1) / spanLength) + 1; 574 // 1 <= n <= num 575 srcIndex -= n * oldLength_; 576 replIndex -= n * newLength_; 577 destIndex -= n * newLength_; 578 remaining += n; 579 return 0; 580 } 581 // Skip all of these edits at once. 582 srcIndex -= num * oldLength_; 583 replIndex -= num * newLength_; 584 destIndex -= num * newLength_; 585 remaining = 0; 586 } 587 } 588 } 589 // Reset the iterator to the start. 590 dir = 0; 591 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; 592 } else if (i < (spanStart + spanLength)) { 593 // The index is in the current span. 594 return 0; 595 } 596 while (next(false)) { 597 if (findSource) { 598 spanStart = srcIndex; 599 spanLength = oldLength_; 600 } else { 601 spanStart = destIndex; 602 spanLength = newLength_; 603 } 604 if (i < (spanStart + spanLength)) { 605 // The index is in the current span. 606 return 0; 607 } 608 if (remaining > 1) { 609 // Is the index in one of the remaining compressed edits? 610 // spanStart is the start of the current span, first of the remaining ones. 611 int len = remaining * spanLength; 612 if (i < (spanStart + len)) { 613 int n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1 614 srcIndex += n * oldLength_; 615 replIndex += n * newLength_; 616 destIndex += n * newLength_; 617 remaining -= n; 618 return 0; 619 } 620 // Make next() skip all of these edits at once. 621 oldLength_ *= remaining; 622 newLength_ *= remaining; 623 remaining = 0; 624 } 625 } 626 return 1; 627 } 628 629 /** 630 * Returns the destination index corresponding to the given source index. 631 * If the source index is inside a change edit (not at its start), 632 * then the destination index at the end of that edit is returned, 633 * since there is no information about index mapping inside a change edit. 634 * 635 * <p>(This means that indexes to the start and middle of an edit, 636 * for example around a grapheme cluster, are mapped to indexes 637 * encompassing the entire edit. 638 * The alternative, mapping an interior index to the start, 639 * would map such an interval to an empty one.) 640 * 641 * <p>This operation will usually but not always modify this object. 642 * The iterator state after this search is undefined. 643 * 644 * @param i source index 645 * @return destination index; undefined if i is not 0..string length 646 * @draft ICU 60 647 * @provisional This API might change or be removed in a future release. 648 */ 649 public int destinationIndexFromSourceIndex(int i) { 650 int where = findIndex(i, true); 651 if (where < 0) { 652 // Error or before the string. 653 return 0; 654 } 655 if (where > 0 || i == srcIndex) { 656 // At or after string length, or at start of the found span. 657 return destIndex; 658 } 659 if (changed) { 660 // In a change span, map to its end. 661 return destIndex + newLength_; 662 } else { 663 // In an unchanged span, offset 1:1 within it. 664 return destIndex + (i - srcIndex); 665 } 666 } 667 668 /** 669 * Returns the source index corresponding to the given destination index. 670 * If the destination index is inside a change edit (not at its start), 671 * then the source index at the end of that edit is returned, 672 * since there is no information about index mapping inside a change edit. 673 * 674 * <p>(This means that indexes to the start and middle of an edit, 675 * for example around a grapheme cluster, are mapped to indexes 676 * encompassing the entire edit. 677 * The alternative, mapping an interior index to the start, 678 * would map such an interval to an empty one.) 679 * 680 * <p>This operation will usually but not always modify this object. 681 * The iterator state after this search is undefined. 682 * 683 * @param i destination index 684 * @return source index; undefined if i is not 0..string length 685 * @draft ICU 60 686 * @provisional This API might change or be removed in a future release. 687 */ 688 public int sourceIndexFromDestinationIndex(int i) { 689 int where = findIndex(i, false); 690 if (where < 0) { 691 // Error or before the string. 692 return 0; 693 } 694 if (where > 0 || i == destIndex) { 695 // At or after string length, or at start of the found span. 696 return srcIndex; 697 } 698 if (changed) { 699 // In a change span, map to its end. 700 return srcIndex + oldLength_; 701 } else { 702 // In an unchanged span, offset within it. 703 return srcIndex + (i - destIndex); 704 } 705 } 706 707 /** 708 * @return true if this edit replaces oldLength() units with newLength() different ones. 709 * false if oldLength units remain unchanged. 710 * @draft ICU 59 711 * @provisional This API might change or be removed in a future release. 712 */ 713 public boolean hasChange() { return changed; } 714 /** 715 * @return the number of units in the original string which are replaced or remain unchanged. 716 * @draft ICU 59 717 * @provisional This API might change or be removed in a future release. 718 */ 719 public int oldLength() { return oldLength_; } 720 /** 721 * @return the number of units in the modified string, if hasChange() is true. 722 * Same as oldLength if hasChange() is false. 723 * @draft ICU 59 724 * @provisional This API might change or be removed in a future release. 725 */ 726 public int newLength() { return newLength_; } 727 728 /** 729 * @return the current index into the source string 730 * @draft ICU 59 731 * @provisional This API might change or be removed in a future release. 732 */ 733 public int sourceIndex() { return srcIndex; } 734 /** 735 * @return the current index into the replacement-characters-only string, 736 * not counting unchanged spans 737 * @draft ICU 59 738 * @provisional This API might change or be removed in a future release. 739 */ 740 public int replacementIndex() { return replIndex; } 741 /** 742 * @return the current index into the full destination string 743 * @draft ICU 59 744 * @provisional This API might change or be removed in a future release. 745 */ 746 public int destinationIndex() { return destIndex; } 747 }; 748 749 /** 750 * Returns an Iterator for coarse-grained changes for simple string updates. 751 * Skips non-changes. 752 * @return an Iterator that merges adjacent changes. 753 * @draft ICU 59 754 * @provisional This API might change or be removed in a future release. 755 */ 756 public Iterator getCoarseChangesIterator() { 757 return new Iterator(array, length, true, true); 758 } 759 760 /** 761 * Returns an Iterator for coarse-grained changes and non-changes for simple string updates. 762 * @return an Iterator that merges adjacent changes. 763 * @draft ICU 59 764 * @provisional This API might change or be removed in a future release. 765 */ 766 public Iterator getCoarseIterator() { 767 return new Iterator(array, length, false, true); 768 } 769 770 /** 771 * Returns an Iterator for fine-grained changes for modifying styled text. 772 * Skips non-changes. 773 * @return an Iterator that separates adjacent changes. 774 * @draft ICU 59 775 * @provisional This API might change or be removed in a future release. 776 */ 777 public Iterator getFineChangesIterator() { 778 return new Iterator(array, length, true, false); 779 } 780 781 /** 782 * Returns an Iterator for fine-grained changes and non-changes for modifying styled text. 783 * @return an Iterator that separates adjacent changes. 784 * @draft ICU 59 785 * @provisional This API might change or be removed in a future release. 786 */ 787 public Iterator getFineIterator() { 788 return new Iterator(array, length, false, false); 789 } 790 791 /** 792 * Merges the two input Edits and appends the result to this object. 793 * 794 * <p>Consider two string transformations (for example, normalization and case mapping) 795 * where each records Edits in addition to writing an output string.<br> 796 * Edits ab reflect how substrings of input string a 797 * map to substrings of intermediate string b.<br> 798 * Edits bc reflect how substrings of intermediate string b 799 * map to substrings of output string c.<br> 800 * This function merges ab and bc such that the additional edits 801 * recorded in this object reflect how substrings of input string a 802 * map to substrings of output string c. 803 * 804 * <p>If unrelated Edits are passed in where the output string of the first 805 * has a different length than the input string of the second, 806 * then an IllegalArgumentException is thrown. 807 * 808 * @param ab reflects how substrings of input string a 809 * map to substrings of intermediate string b. 810 * @param bc reflects how substrings of intermediate string b 811 * map to substrings of output string c. 812 * @return this, with the merged edits appended 813 * @draft ICU 60 814 * @provisional This API might change or be removed in a future release. 815 */ 816 public Edits mergeAndAppend(Edits ab, Edits bc) { 817 // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c. 818 // Parallel iteration over both Edits. 819 Iterator abIter = ab.getFineIterator(); 820 Iterator bcIter = bc.getFineIterator(); 821 boolean abHasNext = true, bcHasNext = true; 822 // Copy iterator state into local variables, so that we can modify and subdivide spans. 823 // ab old & new length, bc old & new length 824 int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0; 825 // When we have different-intermediate-length changes, we accumulate a larger change. 826 int pending_aLength = 0, pending_cLength = 0; 827 for (;;) { 828 // At this point, for each of the two iterators: 829 // Either we are done with the locally cached current edit, 830 // and its intermediate-string length has been reset, 831 // or we will continue to work with a truncated remainder of this edit. 832 // 833 // If the current edit is done, and the iterator has not yet reached the end, 834 // then we fetch the next edit. This is true for at least one of the iterators. 835 // 836 // Normally it does not matter whether we fetch from ab and then bc or vice versa. 837 // However, the result is observably different when 838 // ab deletions meet bc insertions at the same intermediate-string index. 839 // Some users expect the bc insertions to come first, so we fetch from bc first. 840 if (bc_bLength == 0) { 841 if (bcHasNext && (bcHasNext = bcIter.next())) { 842 bc_bLength = bcIter.oldLength(); 843 cLength = bcIter.newLength(); 844 if (bc_bLength == 0) { 845 // insertion 846 if (ab_bLength == 0 || !abIter.hasChange()) { 847 addReplace(pending_aLength, pending_cLength + cLength); 848 pending_aLength = pending_cLength = 0; 849 } else { 850 pending_cLength += cLength; 851 } 852 continue; 853 } 854 } 855 // else see if the other iterator is done, too. 856 } 857 if (ab_bLength == 0) { 858 if (abHasNext && (abHasNext = abIter.next())) { 859 aLength = abIter.oldLength(); 860 ab_bLength = abIter.newLength(); 861 if (ab_bLength == 0) { 862 // deletion 863 if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) { 864 addReplace(pending_aLength + aLength, pending_cLength); 865 pending_aLength = pending_cLength = 0; 866 } else { 867 pending_aLength += aLength; 868 } 869 continue; 870 } 871 } else if (bc_bLength == 0) { 872 // Both iterators are done at the same time: 873 // The intermediate-string lengths match. 874 break; 875 } else { 876 throw new IllegalArgumentException( 877 "The ab output string is shorter than the bc input string."); 878 } 879 } 880 if (bc_bLength == 0) { 881 throw new IllegalArgumentException( 882 "The bc input string is shorter than the ab output string."); 883 } 884 // Done fetching: ab_bLength > 0 && bc_bLength > 0 885 886 // The current state has two parts: 887 // - Past: We accumulate a longer ac edit in the "pending" variables. 888 // - Current: We have copies of the current ab/bc edits in local variables. 889 // At least one side is newly fetched. 890 // One side might be a truncated remainder of an edit we fetched earlier. 891 892 if (!abIter.hasChange() && !bcIter.hasChange()) { 893 // An unchanged span all the way from string a to string c. 894 if (pending_aLength != 0 || pending_cLength != 0) { 895 addReplace(pending_aLength, pending_cLength); 896 pending_aLength = pending_cLength = 0; 897 } 898 int unchangedLength = aLength <= cLength ? aLength : cLength; 899 addUnchanged(unchangedLength); 900 ab_bLength = aLength -= unchangedLength; 901 bc_bLength = cLength -= unchangedLength; 902 // At least one of the unchanged spans is now empty. 903 continue; 904 } 905 if (!abIter.hasChange() && bcIter.hasChange()) { 906 // Unchanged a->b but changed b->c. 907 if (ab_bLength >= bc_bLength) { 908 // Split the longer unchanged span into change + remainder. 909 addReplace(pending_aLength + bc_bLength, pending_cLength + cLength); 910 pending_aLength = pending_cLength = 0; 911 aLength = ab_bLength -= bc_bLength; 912 bc_bLength = 0; 913 continue; 914 } 915 // Handle the shorter unchanged span below like a change. 916 } else if (abIter.hasChange() && !bcIter.hasChange()) { 917 // Changed a->b and then unchanged b->c. 918 if (ab_bLength <= bc_bLength) { 919 // Split the longer unchanged span into change + remainder. 920 addReplace(pending_aLength + aLength, pending_cLength + ab_bLength); 921 pending_aLength = pending_cLength = 0; 922 cLength = bc_bLength -= ab_bLength; 923 ab_bLength = 0; 924 continue; 925 } 926 // Handle the shorter unchanged span below like a change. 927 } else { // both abIter.hasChange() && bcIter.hasChange() 928 if (ab_bLength == bc_bLength) { 929 // Changes on both sides up to the same position. Emit & reset. 930 addReplace(pending_aLength + aLength, pending_cLength + cLength); 931 pending_aLength = pending_cLength = 0; 932 ab_bLength = bc_bLength = 0; 933 continue; 934 } 935 } 936 // Accumulate the a->c change, reset the shorter side, 937 // keep a remainder of the longer one. 938 pending_aLength += aLength; 939 pending_cLength += cLength; 940 if (ab_bLength < bc_bLength) { 941 bc_bLength -= ab_bLength; 942 cLength = ab_bLength = 0; 943 } else { // ab_bLength > bc_bLength 944 ab_bLength -= bc_bLength; 945 aLength = bc_bLength = 0; 946 } 947 } 948 if (pending_aLength != 0 || pending_cLength != 0) { 949 addReplace(pending_aLength, pending_cLength); 950 } 951 return this; 952 } 953} 954