1/* 2******************************************************************************* 3* 4* Copyright (C) 2005-2015, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: utext.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2005apr12 14* created by: Markus W. Scherer 15*/ 16 17#include "unicode/utypes.h" 18#include "unicode/ustring.h" 19#include "unicode/unistr.h" 20#include "unicode/chariter.h" 21#include "unicode/utext.h" 22#include "unicode/utf.h" 23#include "unicode/utf8.h" 24#include "unicode/utf16.h" 25#include "ustr_imp.h" 26#include "cmemory.h" 27#include "cstring.h" 28#include "uassert.h" 29#include "putilimp.h" 30 31U_NAMESPACE_USE 32 33#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex)) 34 35 36static UBool 37utext_access(UText *ut, int64_t index, UBool forward) { 38 return ut->pFuncs->access(ut, index, forward); 39} 40 41 42 43U_CAPI UBool U_EXPORT2 44utext_moveIndex32(UText *ut, int32_t delta) { 45 UChar32 c; 46 if (delta > 0) { 47 do { 48 if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) { 49 return FALSE; 50 } 51 c = ut->chunkContents[ut->chunkOffset]; 52 if (U16_IS_SURROGATE(c)) { 53 c = utext_next32(ut); 54 if (c == U_SENTINEL) { 55 return FALSE; 56 } 57 } else { 58 ut->chunkOffset++; 59 } 60 } while(--delta>0); 61 62 } else if (delta<0) { 63 do { 64 if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) { 65 return FALSE; 66 } 67 c = ut->chunkContents[ut->chunkOffset-1]; 68 if (U16_IS_SURROGATE(c)) { 69 c = utext_previous32(ut); 70 if (c == U_SENTINEL) { 71 return FALSE; 72 } 73 } else { 74 ut->chunkOffset--; 75 } 76 } while(++delta<0); 77 } 78 79 return TRUE; 80} 81 82 83U_CAPI int64_t U_EXPORT2 84utext_nativeLength(UText *ut) { 85 return ut->pFuncs->nativeLength(ut); 86} 87 88 89U_CAPI UBool U_EXPORT2 90utext_isLengthExpensive(const UText *ut) { 91 UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0; 92 return r; 93} 94 95 96U_CAPI int64_t U_EXPORT2 97utext_getNativeIndex(const UText *ut) { 98 if(ut->chunkOffset <= ut->nativeIndexingLimit) { 99 return ut->chunkNativeStart+ut->chunkOffset; 100 } else { 101 return ut->pFuncs->mapOffsetToNative(ut); 102 } 103} 104 105 106U_CAPI void U_EXPORT2 107utext_setNativeIndex(UText *ut, int64_t index) { 108 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) { 109 // The desired position is outside of the current chunk. 110 // Access the new position. Assume a forward iteration from here, 111 // which will also be optimimum for a single random access. 112 // Reverse iterations may suffer slightly. 113 ut->pFuncs->access(ut, index, TRUE); 114 } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) { 115 // utf-16 indexing. 116 ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart); 117 } else { 118 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index); 119 } 120 // The convention is that the index must always be on a code point boundary. 121 // Adjust the index position if it is in the middle of a surrogate pair. 122 if (ut->chunkOffset<ut->chunkLength) { 123 UChar c= ut->chunkContents[ut->chunkOffset]; 124 if (U16_IS_TRAIL(c)) { 125 if (ut->chunkOffset==0) { 126 ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE); 127 } 128 if (ut->chunkOffset>0) { 129 UChar lead = ut->chunkContents[ut->chunkOffset-1]; 130 if (U16_IS_LEAD(lead)) { 131 ut->chunkOffset--; 132 } 133 } 134 } 135 } 136} 137 138 139 140U_CAPI int64_t U_EXPORT2 141utext_getPreviousNativeIndex(UText *ut) { 142 // 143 // Fast-path the common case. 144 // Common means current position is not at the beginning of a chunk 145 // and the preceding character is not supplementary. 146 // 147 int32_t i = ut->chunkOffset - 1; 148 int64_t result; 149 if (i >= 0) { 150 UChar c = ut->chunkContents[i]; 151 if (U16_IS_TRAIL(c) == FALSE) { 152 if (i <= ut->nativeIndexingLimit) { 153 result = ut->chunkNativeStart + i; 154 } else { 155 ut->chunkOffset = i; 156 result = ut->pFuncs->mapOffsetToNative(ut); 157 ut->chunkOffset++; 158 } 159 return result; 160 } 161 } 162 163 // If at the start of text, simply return 0. 164 if (ut->chunkOffset==0 && ut->chunkNativeStart==0) { 165 return 0; 166 } 167 168 // Harder, less common cases. We are at a chunk boundary, or on a surrogate. 169 // Keep it simple, use other functions to handle the edges. 170 // 171 utext_previous32(ut); 172 result = UTEXT_GETNATIVEINDEX(ut); 173 utext_next32(ut); 174 return result; 175} 176 177 178// 179// utext_current32. Get the UChar32 at the current position. 180// UText iteration position is always on a code point boundary, 181// never on the trail half of a surrogate pair. 182// 183U_CAPI UChar32 U_EXPORT2 184utext_current32(UText *ut) { 185 UChar32 c; 186 if (ut->chunkOffset==ut->chunkLength) { 187 // Current position is just off the end of the chunk. 188 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { 189 // Off the end of the text. 190 return U_SENTINEL; 191 } 192 } 193 194 c = ut->chunkContents[ut->chunkOffset]; 195 if (U16_IS_LEAD(c) == FALSE) { 196 // Normal, non-supplementary case. 197 return c; 198 } 199 200 // 201 // Possible supplementary char. 202 // 203 UChar32 trail = 0; 204 UChar32 supplementaryC = c; 205 if ((ut->chunkOffset+1) < ut->chunkLength) { 206 // The trail surrogate is in the same chunk. 207 trail = ut->chunkContents[ut->chunkOffset+1]; 208 } else { 209 // The trail surrogate is in a different chunk. 210 // Because we must maintain the iteration position, we need to switch forward 211 // into the new chunk, get the trail surrogate, then revert the chunk back to the 212 // original one. 213 // An edge case to be careful of: the entire text may end with an unpaired 214 // leading surrogate. The attempt to access the trail will fail, but 215 // the original position before the unpaired lead still needs to be restored. 216 int64_t nativePosition = ut->chunkNativeLimit; 217 int32_t originalOffset = ut->chunkOffset; 218 if (ut->pFuncs->access(ut, nativePosition, TRUE)) { 219 trail = ut->chunkContents[ut->chunkOffset]; 220 } 221 UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk 222 U_ASSERT(r==TRUE); 223 ut->chunkOffset = originalOffset; 224 if(!r) { 225 return U_SENTINEL; 226 } 227 } 228 229 if (U16_IS_TRAIL(trail)) { 230 supplementaryC = U16_GET_SUPPLEMENTARY(c, trail); 231 } 232 return supplementaryC; 233 234} 235 236 237U_CAPI UChar32 U_EXPORT2 238utext_char32At(UText *ut, int64_t nativeIndex) { 239 UChar32 c = U_SENTINEL; 240 241 // Fast path the common case. 242 if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) { 243 ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart); 244 c = ut->chunkContents[ut->chunkOffset]; 245 if (U16_IS_SURROGATE(c) == FALSE) { 246 return c; 247 } 248 } 249 250 251 utext_setNativeIndex(ut, nativeIndex); 252 if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) { 253 c = ut->chunkContents[ut->chunkOffset]; 254 if (U16_IS_SURROGATE(c)) { 255 // For surrogates, let current32() deal with the complications 256 // of supplementaries that may span chunk boundaries. 257 c = utext_current32(ut); 258 } 259 } 260 return c; 261} 262 263 264U_CAPI UChar32 U_EXPORT2 265utext_next32(UText *ut) { 266 UChar32 c; 267 268 if (ut->chunkOffset >= ut->chunkLength) { 269 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { 270 return U_SENTINEL; 271 } 272 } 273 274 c = ut->chunkContents[ut->chunkOffset++]; 275 if (U16_IS_LEAD(c) == FALSE) { 276 // Normal case, not supplementary. 277 // (A trail surrogate seen here is just returned as is, as a surrogate value. 278 // It cannot be part of a pair.) 279 return c; 280 } 281 282 if (ut->chunkOffset >= ut->chunkLength) { 283 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { 284 // c is an unpaired lead surrogate at the end of the text. 285 // return it as it is. 286 return c; 287 } 288 } 289 UChar32 trail = ut->chunkContents[ut->chunkOffset]; 290 if (U16_IS_TRAIL(trail) == FALSE) { 291 // c was an unpaired lead surrogate, not at the end of the text. 292 // return it as it is (unpaired). Iteration position is on the 293 // following character, possibly in the next chunk, where the 294 // trail surrogate would have been if it had existed. 295 return c; 296 } 297 298 UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail); 299 ut->chunkOffset++; // move iteration position over the trail surrogate. 300 return supplementary; 301 } 302 303 304U_CAPI UChar32 U_EXPORT2 305utext_previous32(UText *ut) { 306 UChar32 c; 307 308 if (ut->chunkOffset <= 0) { 309 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) { 310 return U_SENTINEL; 311 } 312 } 313 ut->chunkOffset--; 314 c = ut->chunkContents[ut->chunkOffset]; 315 if (U16_IS_TRAIL(c) == FALSE) { 316 // Normal case, not supplementary. 317 // (A lead surrogate seen here is just returned as is, as a surrogate value. 318 // It cannot be part of a pair.) 319 return c; 320 } 321 322 if (ut->chunkOffset <= 0) { 323 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) { 324 // c is an unpaired trail surrogate at the start of the text. 325 // return it as it is. 326 return c; 327 } 328 } 329 330 UChar32 lead = ut->chunkContents[ut->chunkOffset-1]; 331 if (U16_IS_LEAD(lead) == FALSE) { 332 // c was an unpaired trail surrogate, not at the end of the text. 333 // return it as it is (unpaired). Iteration position is at c 334 return c; 335 } 336 337 UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c); 338 ut->chunkOffset--; // move iteration position over the lead surrogate. 339 return supplementary; 340} 341 342 343 344U_CAPI UChar32 U_EXPORT2 345utext_next32From(UText *ut, int64_t index) { 346 UChar32 c = U_SENTINEL; 347 348 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) { 349 // Desired position is outside of the current chunk. 350 if(!ut->pFuncs->access(ut, index, TRUE)) { 351 // no chunk available here 352 return U_SENTINEL; 353 } 354 } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) { 355 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing 356 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); 357 } else { 358 // Desired position is in chunk, with non-UTF16 indexing. 359 ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index); 360 } 361 362 c = ut->chunkContents[ut->chunkOffset++]; 363 if (U16_IS_SURROGATE(c)) { 364 // Surrogates. Many edge cases. Use other functions that already 365 // deal with the problems. 366 utext_setNativeIndex(ut, index); 367 c = utext_next32(ut); 368 } 369 return c; 370} 371 372 373U_CAPI UChar32 U_EXPORT2 374utext_previous32From(UText *ut, int64_t index) { 375 // 376 // Return the character preceding the specified index. 377 // Leave the iteration position at the start of the character that was returned. 378 // 379 UChar32 cPrev; // The character preceding cCurr, which is what we will return. 380 381 // Address the chunk containg the position preceding the incoming index 382 // A tricky edge case: 383 // We try to test the requested native index against the chunkNativeStart to determine 384 // whether the character preceding the one at the index is in the current chunk. 385 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the 386 // requested index is on something other than the first position of the first char. 387 // 388 if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) { 389 // Requested native index is outside of the current chunk. 390 if(!ut->pFuncs->access(ut, index, FALSE)) { 391 // no chunk available here 392 return U_SENTINEL; 393 } 394 } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) { 395 // Direct UTF-16 indexing. 396 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); 397 } else { 398 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index); 399 if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) { 400 // no chunk available here 401 return U_SENTINEL; 402 } 403 } 404 405 // 406 // Simple case with no surrogates. 407 // 408 ut->chunkOffset--; 409 cPrev = ut->chunkContents[ut->chunkOffset]; 410 411 if (U16_IS_SURROGATE(cPrev)) { 412 // Possible supplementary. Many edge cases. 413 // Let other functions do the heavy lifting. 414 utext_setNativeIndex(ut, index); 415 cPrev = utext_previous32(ut); 416 } 417 return cPrev; 418} 419 420 421U_CAPI int32_t U_EXPORT2 422utext_extract(UText *ut, 423 int64_t start, int64_t limit, 424 UChar *dest, int32_t destCapacity, 425 UErrorCode *status) { 426 return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status); 427 } 428 429 430 431U_CAPI UBool U_EXPORT2 432utext_equals(const UText *a, const UText *b) { 433 if (a==NULL || b==NULL || 434 a->magic != UTEXT_MAGIC || 435 b->magic != UTEXT_MAGIC) { 436 // Null or invalid arguments don't compare equal to anything. 437 return FALSE; 438 } 439 440 if (a->pFuncs != b->pFuncs) { 441 // Different types of text providers. 442 return FALSE; 443 } 444 445 if (a->context != b->context) { 446 // Different sources (different strings) 447 return FALSE; 448 } 449 if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) { 450 // Different current position in the string. 451 return FALSE; 452 } 453 454 return TRUE; 455} 456 457U_CAPI UBool U_EXPORT2 458utext_isWritable(const UText *ut) 459{ 460 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0; 461 return b; 462} 463 464 465U_CAPI void U_EXPORT2 466utext_freeze(UText *ut) { 467 // Zero out the WRITABLE flag. 468 ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE)); 469} 470 471 472U_CAPI UBool U_EXPORT2 473utext_hasMetaData(const UText *ut) 474{ 475 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0; 476 return b; 477} 478 479 480 481U_CAPI int32_t U_EXPORT2 482utext_replace(UText *ut, 483 int64_t nativeStart, int64_t nativeLimit, 484 const UChar *replacementText, int32_t replacementLength, 485 UErrorCode *status) 486{ 487 if (U_FAILURE(*status)) { 488 return 0; 489 } 490 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { 491 *status = U_NO_WRITE_PERMISSION; 492 return 0; 493 } 494 int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status); 495 return i; 496} 497 498U_CAPI void U_EXPORT2 499utext_copy(UText *ut, 500 int64_t nativeStart, int64_t nativeLimit, 501 int64_t destIndex, 502 UBool move, 503 UErrorCode *status) 504{ 505 if (U_FAILURE(*status)) { 506 return; 507 } 508 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { 509 *status = U_NO_WRITE_PERMISSION; 510 return; 511 } 512 ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status); 513} 514 515 516 517U_CAPI UText * U_EXPORT2 518utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) { 519 if (U_FAILURE(*status)) { 520 return dest; 521 } 522 UText *result = src->pFuncs->clone(dest, src, deep, status); 523 if (U_FAILURE(*status)) { 524 return result; 525 } 526 if (result == NULL) { 527 *status = U_MEMORY_ALLOCATION_ERROR; 528 return result; 529 } 530 if (readOnly) { 531 utext_freeze(result); 532 } 533 return result; 534} 535 536 537 538//------------------------------------------------------------------------------ 539// 540// UText common functions implementation 541// 542//------------------------------------------------------------------------------ 543 544// 545// UText.flags bit definitions 546// 547enum { 548 UTEXT_HEAP_ALLOCATED = 1, // 1 if ICU has allocated this UText struct on the heap. 549 // 0 if caller provided storage for the UText. 550 551 UTEXT_EXTRA_HEAP_ALLOCATED = 2, // 1 if ICU has allocated extra storage as a separate 552 // heap block. 553 // 0 if there is no separate allocation. Either no extra 554 // storage was requested, or it is appended to the end 555 // of the main UText storage. 556 557 UTEXT_OPEN = 4 // 1 if this UText is currently open 558 // 0 if this UText is not open. 559}; 560 561 562// 563// Extended form of a UText. The purpose is to aid in computing the total size required 564// when a provider asks for a UText to be allocated with extra storage. 565 566struct ExtendedUText { 567 UText ut; 568 UAlignedMemory extension; 569}; 570 571static const UText emptyText = UTEXT_INITIALIZER; 572 573U_CAPI UText * U_EXPORT2 574utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { 575 if (U_FAILURE(*status)) { 576 return ut; 577 } 578 579 if (ut == NULL) { 580 // We need to heap-allocate storage for the new UText 581 int32_t spaceRequired = sizeof(UText); 582 if (extraSpace > 0) { 583 spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory); 584 } 585 ut = (UText *)uprv_malloc(spaceRequired); 586 if (ut == NULL) { 587 *status = U_MEMORY_ALLOCATION_ERROR; 588 return NULL; 589 } else { 590 *ut = emptyText; 591 ut->flags |= UTEXT_HEAP_ALLOCATED; 592 if (spaceRequired>0) { 593 ut->extraSize = extraSpace; 594 ut->pExtra = &((ExtendedUText *)ut)->extension; 595 } 596 } 597 } else { 598 // We have been supplied with an already existing UText. 599 // Verify that it really appears to be a UText. 600 if (ut->magic != UTEXT_MAGIC) { 601 *status = U_ILLEGAL_ARGUMENT_ERROR; 602 return ut; 603 } 604 // If the ut is already open and there's a provider supplied close 605 // function, call it. 606 if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) { 607 ut->pFuncs->close(ut); 608 } 609 ut->flags &= ~UTEXT_OPEN; 610 611 // If extra space was requested by our caller, check whether 612 // sufficient already exists, and allocate new if needed. 613 if (extraSpace > ut->extraSize) { 614 // Need more space. If there is existing separately allocated space, 615 // delete it first, then allocate new space. 616 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { 617 uprv_free(ut->pExtra); 618 ut->extraSize = 0; 619 } 620 ut->pExtra = uprv_malloc(extraSpace); 621 if (ut->pExtra == NULL) { 622 *status = U_MEMORY_ALLOCATION_ERROR; 623 } else { 624 ut->extraSize = extraSpace; 625 ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED; 626 } 627 } 628 } 629 if (U_SUCCESS(*status)) { 630 ut->flags |= UTEXT_OPEN; 631 632 // Initialize all remaining fields of the UText. 633 // 634 ut->context = NULL; 635 ut->chunkContents = NULL; 636 ut->p = NULL; 637 ut->q = NULL; 638 ut->r = NULL; 639 ut->a = 0; 640 ut->b = 0; 641 ut->c = 0; 642 ut->chunkOffset = 0; 643 ut->chunkLength = 0; 644 ut->chunkNativeStart = 0; 645 ut->chunkNativeLimit = 0; 646 ut->nativeIndexingLimit = 0; 647 ut->providerProperties = 0; 648 ut->privA = 0; 649 ut->privB = 0; 650 ut->privC = 0; 651 ut->privP = NULL; 652 if (ut->pExtra!=NULL && ut->extraSize>0) 653 uprv_memset(ut->pExtra, 0, ut->extraSize); 654 655 } 656 return ut; 657} 658 659 660U_CAPI UText * U_EXPORT2 661utext_close(UText *ut) { 662 if (ut==NULL || 663 ut->magic != UTEXT_MAGIC || 664 (ut->flags & UTEXT_OPEN) == 0) 665 { 666 // The supplied ut is not an open UText. 667 // Do nothing. 668 return ut; 669 } 670 671 // If the provider gave us a close function, call it now. 672 // This will clean up anything allocated specifically by the provider. 673 if (ut->pFuncs->close != NULL) { 674 ut->pFuncs->close(ut); 675 } 676 ut->flags &= ~UTEXT_OPEN; 677 678 // If we (the framework) allocated the UText or subsidiary storage, 679 // delete it. 680 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { 681 uprv_free(ut->pExtra); 682 ut->pExtra = NULL; 683 ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED; 684 ut->extraSize = 0; 685 } 686 687 // Zero out function table of the closed UText. This is a defensive move, 688 // inteded to cause applications that inadvertantly use a closed 689 // utext to crash with null pointer errors. 690 ut->pFuncs = NULL; 691 692 if (ut->flags & UTEXT_HEAP_ALLOCATED) { 693 // This UText was allocated by UText setup. We need to free it. 694 // Clear magic, so we can detect if the user messes up and immediately 695 // tries to reopen another UText using the deleted storage. 696 ut->magic = 0; 697 uprv_free(ut); 698 ut = NULL; 699 } 700 return ut; 701} 702 703 704 705 706// 707// invalidateChunk Reset a chunk to have no contents, so that the next call 708// to access will cause new data to load. 709// This is needed when copy/move/replace operate directly on the 710// backing text, potentially putting it out of sync with the 711// contents in the chunk. 712// 713static void 714invalidateChunk(UText *ut) { 715 ut->chunkLength = 0; 716 ut->chunkNativeLimit = 0; 717 ut->chunkNativeStart = 0; 718 ut->chunkOffset = 0; 719 ut->nativeIndexingLimit = 0; 720} 721 722// 723// pinIndex Do range pinning on a native index parameter. 724// 64 bit pinning is done in place. 725// 32 bit truncated result is returned as a convenience for 726// use in providers that don't need 64 bits. 727static int32_t 728pinIndex(int64_t &index, int64_t limit) { 729 if (index<0) { 730 index = 0; 731 } else if (index > limit) { 732 index = limit; 733 } 734 return (int32_t)index; 735} 736 737 738U_CDECL_BEGIN 739 740// 741// Pointer relocation function, 742// a utility used by shallow clone. 743// Adjust a pointer that refers to something within one UText (the source) 744// to refer to the same relative offset within a another UText (the target) 745// 746static void adjustPointer(UText *dest, const void **destPtr, const UText *src) { 747 // convert all pointers to (char *) so that byte address arithmetic will work. 748 char *dptr = (char *)*destPtr; 749 char *dUText = (char *)dest; 750 char *sUText = (char *)src; 751 752 if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) { 753 // target ptr was to something within the src UText's pExtra storage. 754 // relocate it into the target UText's pExtra region. 755 *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra); 756 } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) { 757 // target ptr was pointing to somewhere within the source UText itself. 758 // Move it to the same offset within the target UText. 759 *destPtr = dUText + (dptr-sUText); 760 } 761} 762 763 764// 765// Clone. This is a generic copy-the-utext-by-value clone function that can be 766// used as-is with some utext types, and as a helper by other clones. 767// 768static UText * U_CALLCONV 769shallowTextClone(UText * dest, const UText * src, UErrorCode * status) { 770 if (U_FAILURE(*status)) { 771 return NULL; 772 } 773 int32_t srcExtraSize = src->extraSize; 774 775 // 776 // Use the generic text_setup to allocate storage if required. 777 // 778 dest = utext_setup(dest, srcExtraSize, status); 779 if (U_FAILURE(*status)) { 780 return dest; 781 } 782 783 // 784 // flags (how the UText was allocated) and the pointer to the 785 // extra storage must retain the values in the cloned utext that 786 // were set up by utext_setup. Save them separately before 787 // copying the whole struct. 788 // 789 void *destExtra = dest->pExtra; 790 int32_t flags = dest->flags; 791 792 793 // 794 // Copy the whole UText struct by value. 795 // Any "Extra" storage is copied also. 796 // 797 int sizeToCopy = src->sizeOfStruct; 798 if (sizeToCopy > dest->sizeOfStruct) { 799 sizeToCopy = dest->sizeOfStruct; 800 } 801 uprv_memcpy(dest, src, sizeToCopy); 802 dest->pExtra = destExtra; 803 dest->flags = flags; 804 if (srcExtraSize > 0) { 805 uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize); 806 } 807 808 // 809 // Relocate any pointers in the target that refer to the UText itself 810 // to point to the cloned copy rather than the original source. 811 // 812 adjustPointer(dest, &dest->context, src); 813 adjustPointer(dest, &dest->p, src); 814 adjustPointer(dest, &dest->q, src); 815 adjustPointer(dest, &dest->r, src); 816 adjustPointer(dest, (const void **)&dest->chunkContents, src); 817 818 // The newly shallow-cloned UText does _not_ own the underlying storage for the text. 819 // (The source for the clone may or may not have owned the text.) 820 821 dest->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 822 823 return dest; 824} 825 826 827U_CDECL_END 828 829 830 831//------------------------------------------------------------------------------ 832// 833// UText implementation for UTF-8 char * strings (read-only) 834// Limitation: string length must be <= 0x7fffffff in length. 835// (length must for in an int32_t variable) 836// 837// Use of UText data members: 838// context pointer to UTF-8 string 839// utext.b is the input string length (bytes). 840// utext.c Length scanned so far in string 841// (for optimizing finding length of zero terminated strings.) 842// utext.p pointer to the current buffer 843// utext.q pointer to the other buffer. 844// 845//------------------------------------------------------------------------------ 846 847// Chunk size. 848// Must be less than 85, because of byte mapping from UChar indexes to native indexes. 849// Worst case is three native bytes to one UChar. (Supplemenaries are 4 native bytes 850// to two UChars.) 851// 852enum { UTF8_TEXT_CHUNK_SIZE=32 }; 853 854// 855// UTF8Buf Two of these structs will be set up in the UText's extra allocated space. 856// Each contains the UChar chunk buffer, the to and from native maps, and 857// header info. 858// 859// because backwards iteration fills the buffers starting at the end and 860// working towards the front, the filled part of the buffers may not begin 861// at the start of the available storage for the buffers. 862// 863// Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for 864// the last character added being a supplementary, and thus requiring a surrogate 865// pair. Doing this is simpler than checking for the edge case. 866// 867 868struct UTF8Buf { 869 int32_t bufNativeStart; // Native index of first char in UChar buf 870 int32_t bufNativeLimit; // Native index following last char in buf. 871 int32_t bufStartIdx; // First filled position in buf. 872 int32_t bufLimitIdx; // Limit of filled range in buf. 873 int32_t bufNILimit; // Limit of native indexing part of buf 874 int32_t toUCharsMapStart; // Native index corresponding to 875 // mapToUChars[0]. 876 // Set to bufNativeStart when filling forwards. 877 // Set to computed value when filling backwards. 878 879 UChar buf[UTF8_TEXT_CHUNK_SIZE+4]; // The UChar buffer. Requires one extra position beyond the 880 // the chunk size, to allow for surrogate at the end. 881 // Length must be identical to mapToNative array, below, 882 // because of the way indexing works when the array is 883 // filled backwards during a reverse iteration. Thus, 884 // the additional extra size. 885 uint8_t mapToNative[UTF8_TEXT_CHUNK_SIZE+4]; // map UChar index in buf to 886 // native offset from bufNativeStart. 887 // Requires two extra slots, 888 // one for a supplementary starting in the last normal position, 889 // and one for an entry for the buffer limit position. 890 uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to 891 // correspoding offset in filled part of buf. 892 int32_t align; 893}; 894 895U_CDECL_BEGIN 896 897// 898// utf8TextLength 899// 900// Get the length of the string. If we don't already know it, 901// we'll need to scan for the trailing nul. 902// 903static int64_t U_CALLCONV 904utf8TextLength(UText *ut) { 905 if (ut->b < 0) { 906 // Zero terminated string, and we haven't scanned to the end yet. 907 // Scan it now. 908 const char *r = (const char *)ut->context + ut->c; 909 while (*r != 0) { 910 r++; 911 } 912 if ((r - (const char *)ut->context) < 0x7fffffff) { 913 ut->b = (int32_t)(r - (const char *)ut->context); 914 } else { 915 // Actual string was bigger (more than 2 gig) than we 916 // can handle. Clip it to 2 GB. 917 ut->b = 0x7fffffff; 918 } 919 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 920 } 921 return ut->b; 922} 923 924 925 926 927 928 929static UBool U_CALLCONV 930utf8TextAccess(UText *ut, int64_t index, UBool forward) { 931 // 932 // Apologies to those who are allergic to goto statements. 933 // Consider each goto to a labelled block to be the equivalent of 934 // call the named block as if it were a function(); 935 // return; 936 // 937 const uint8_t *s8=(const uint8_t *)ut->context; 938 UTF8Buf *u8b = NULL; 939 int32_t length = ut->b; // Length of original utf-8 940 int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits. 941 int32_t mapIndex = 0; 942 if (index<0) { 943 ix=0; 944 } else if (index > 0x7fffffff) { 945 // Strings with 64 bit lengths not supported by this UTF-8 provider. 946 ix = 0x7fffffff; 947 } 948 949 // Pin requested index to the string length. 950 if (ix>length) { 951 if (length>=0) { 952 ix=length; 953 } else if (ix>=ut->c) { 954 // Zero terminated string, and requested index is beyond 955 // the region that has already been scanned. 956 // Scan up to either the end of the string or to the 957 // requested position, whichever comes first. 958 while (ut->c<ix && s8[ut->c]!=0) { 959 ut->c++; 960 } 961 // TODO: support for null terminated string length > 32 bits. 962 if (s8[ut->c] == 0) { 963 // We just found the actual length of the string. 964 // Trim the requested index back to that. 965 ix = ut->c; 966 ut->b = ut->c; 967 length = ut->c; 968 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 969 } 970 } 971 } 972 973 // 974 // Dispatch to the appropriate action for a forward iteration request. 975 // 976 if (forward) { 977 if (ix==ut->chunkNativeLimit) { 978 // Check for normal sequential iteration cases first. 979 if (ix==length) { 980 // Just reached end of string 981 // Don't swap buffers, but do set the 982 // current buffer position. 983 ut->chunkOffset = ut->chunkLength; 984 return FALSE; 985 } else { 986 // End of current buffer. 987 // check whether other buffer already has what we need. 988 UTF8Buf *altB = (UTF8Buf *)ut->q; 989 if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) { 990 goto swapBuffers; 991 } 992 } 993 } 994 995 // A random access. Desired index could be in either or niether buf. 996 // For optimizing the order of testing, first check for the index 997 // being in the other buffer. This will be the case for uses that 998 // move back and forth over a fairly limited range 999 { 1000 u8b = (UTF8Buf *)ut->q; // the alternate buffer 1001 if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) { 1002 // Requested index is in the other buffer. 1003 goto swapBuffers; 1004 } 1005 if (ix == length) { 1006 // Requested index is end-of-string. 1007 // (this is the case of randomly seeking to the end. 1008 // The case of iterating off the end is handled earlier.) 1009 if (ix == ut->chunkNativeLimit) { 1010 // Current buffer extends up to the end of the string. 1011 // Leave it as the current buffer. 1012 ut->chunkOffset = ut->chunkLength; 1013 return FALSE; 1014 } 1015 if (ix == u8b->bufNativeLimit) { 1016 // Alternate buffer extends to the end of string. 1017 // Swap it in as the current buffer. 1018 goto swapBuffersAndFail; 1019 } 1020 1021 // Neither existing buffer extends to the end of the string. 1022 goto makeStubBuffer; 1023 } 1024 1025 if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) { 1026 // Requested index is in neither buffer. 1027 goto fillForward; 1028 } 1029 1030 // Requested index is in this buffer. 1031 u8b = (UTF8Buf *)ut->p; // the current buffer 1032 mapIndex = ix - u8b->toUCharsMapStart; 1033 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1034 return TRUE; 1035 1036 } 1037 } 1038 1039 1040 // 1041 // Dispatch to the appropriate action for a 1042 // Backwards Diretion iteration request. 1043 // 1044 if (ix==ut->chunkNativeStart) { 1045 // Check for normal sequential iteration cases first. 1046 if (ix==0) { 1047 // Just reached the start of string 1048 // Don't swap buffers, but do set the 1049 // current buffer position. 1050 ut->chunkOffset = 0; 1051 return FALSE; 1052 } else { 1053 // Start of current buffer. 1054 // check whether other buffer already has what we need. 1055 UTF8Buf *altB = (UTF8Buf *)ut->q; 1056 if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) { 1057 goto swapBuffers; 1058 } 1059 } 1060 } 1061 1062 // A random access. Desired index could be in either or niether buf. 1063 // For optimizing the order of testing, 1064 // Most likely case: in the other buffer. 1065 // Second most likely: in neither buffer. 1066 // Unlikely, but must work: in the current buffer. 1067 u8b = (UTF8Buf *)ut->q; // the alternate buffer 1068 if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) { 1069 // Requested index is in the other buffer. 1070 goto swapBuffers; 1071 } 1072 // Requested index is start-of-string. 1073 // (this is the case of randomly seeking to the start. 1074 // The case of iterating off the start is handled earlier.) 1075 if (ix==0) { 1076 if (u8b->bufNativeStart==0) { 1077 // Alternate buffer contains the data for the start string. 1078 // Make it be the current buffer. 1079 goto swapBuffersAndFail; 1080 } else { 1081 // Request for data before the start of string, 1082 // neither buffer is usable. 1083 // set up a zero-length buffer. 1084 goto makeStubBuffer; 1085 } 1086 } 1087 1088 if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) { 1089 // Requested index is in neither buffer. 1090 goto fillReverse; 1091 } 1092 1093 // Requested index is in this buffer. 1094 // Set the utf16 buffer index. 1095 u8b = (UTF8Buf *)ut->p; 1096 mapIndex = ix - u8b->toUCharsMapStart; 1097 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1098 if (ut->chunkOffset==0) { 1099 // This occurs when the first character in the text is 1100 // a multi-byte UTF-8 char, and the requested index is to 1101 // one of the trailing bytes. Because there is no preceding , 1102 // character, this access fails. We can't pick up on the 1103 // situation sooner because the requested index is not zero. 1104 return FALSE; 1105 } else { 1106 return TRUE; 1107 } 1108 1109 1110 1111swapBuffers: 1112 // The alternate buffer (ut->q) has the string data that was requested. 1113 // Swap the primary and alternate buffers, and set the 1114 // chunk index into the new primary buffer. 1115 { 1116 u8b = (UTF8Buf *)ut->q; 1117 ut->q = ut->p; 1118 ut->p = u8b; 1119 ut->chunkContents = &u8b->buf[u8b->bufStartIdx]; 1120 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; 1121 ut->chunkNativeStart = u8b->bufNativeStart; 1122 ut->chunkNativeLimit = u8b->bufNativeLimit; 1123 ut->nativeIndexingLimit = u8b->bufNILimit; 1124 1125 // Index into the (now current) chunk 1126 // Use the map to set the chunk index. It's more trouble than it's worth 1127 // to check whether native indexing can be used. 1128 U_ASSERT(ix>=u8b->bufNativeStart); 1129 U_ASSERT(ix<=u8b->bufNativeLimit); 1130 mapIndex = ix - u8b->toUCharsMapStart; 1131 U_ASSERT(mapIndex>=0); 1132 U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars)); 1133 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1134 1135 return TRUE; 1136 } 1137 1138 1139 swapBuffersAndFail: 1140 // We got a request for either the start or end of the string, 1141 // with iteration continuing in the out-of-bounds direction. 1142 // The alternate buffer already contains the data up to the 1143 // start/end. 1144 // Swap the buffers, then return failure, indicating that we couldn't 1145 // make things correct for continuing the iteration in the requested 1146 // direction. The position & buffer are correct should the 1147 // user decide to iterate in the opposite direction. 1148 u8b = (UTF8Buf *)ut->q; 1149 ut->q = ut->p; 1150 ut->p = u8b; 1151 ut->chunkContents = &u8b->buf[u8b->bufStartIdx]; 1152 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; 1153 ut->chunkNativeStart = u8b->bufNativeStart; 1154 ut->chunkNativeLimit = u8b->bufNativeLimit; 1155 ut->nativeIndexingLimit = u8b->bufNILimit; 1156 1157 // Index into the (now current) chunk 1158 // For this function (swapBuffersAndFail), the requested index 1159 // will always be at either the start or end of the chunk. 1160 if (ix==u8b->bufNativeLimit) { 1161 ut->chunkOffset = ut->chunkLength; 1162 } else { 1163 ut->chunkOffset = 0; 1164 U_ASSERT(ix == u8b->bufNativeStart); 1165 } 1166 return FALSE; 1167 1168makeStubBuffer: 1169 // The user has done a seek/access past the start or end 1170 // of the string. Rather than loading data that is likely 1171 // to never be used, just set up a zero-length buffer at 1172 // the position. 1173 u8b = (UTF8Buf *)ut->q; 1174 u8b->bufNativeStart = ix; 1175 u8b->bufNativeLimit = ix; 1176 u8b->bufStartIdx = 0; 1177 u8b->bufLimitIdx = 0; 1178 u8b->bufNILimit = 0; 1179 u8b->toUCharsMapStart = ix; 1180 u8b->mapToNative[0] = 0; 1181 u8b->mapToUChars[0] = 0; 1182 goto swapBuffersAndFail; 1183 1184 1185 1186fillForward: 1187 { 1188 // Move the incoming index to a code point boundary. 1189 U8_SET_CP_START(s8, 0, ix); 1190 1191 // Swap the UText buffers. 1192 // We want to fill what was previously the alternate buffer, 1193 // and make what was the current buffer be the new alternate. 1194 UTF8Buf *u8b = (UTF8Buf *)ut->q; 1195 ut->q = ut->p; 1196 ut->p = u8b; 1197 1198 int32_t strLen = ut->b; 1199 UBool nulTerminated = FALSE; 1200 if (strLen < 0) { 1201 strLen = 0x7fffffff; 1202 nulTerminated = TRUE; 1203 } 1204 1205 UChar *buf = u8b->buf; 1206 uint8_t *mapToNative = u8b->mapToNative; 1207 uint8_t *mapToUChars = u8b->mapToUChars; 1208 int32_t destIx = 0; 1209 int32_t srcIx = ix; 1210 UBool seenNonAscii = FALSE; 1211 UChar32 c = 0; 1212 1213 // Fill the chunk buffer and mapping arrays. 1214 while (destIx<UTF8_TEXT_CHUNK_SIZE) { 1215 c = s8[srcIx]; 1216 if (c>0 && c<0x80) { 1217 // Special case ASCII range for speed. 1218 // zero is excluded to simplify bounds checking. 1219 buf[destIx] = (UChar)c; 1220 mapToNative[destIx] = (uint8_t)(srcIx - ix); 1221 mapToUChars[srcIx-ix] = (uint8_t)destIx; 1222 srcIx++; 1223 destIx++; 1224 } else { 1225 // General case, handle everything. 1226 if (seenNonAscii == FALSE) { 1227 seenNonAscii = TRUE; 1228 u8b->bufNILimit = destIx; 1229 } 1230 1231 int32_t cIx = srcIx; 1232 int32_t dIx = destIx; 1233 int32_t dIxSaved = destIx; 1234 U8_NEXT_OR_FFFD(s8, srcIx, strLen, c); 1235 if (c==0 && nulTerminated) { 1236 srcIx--; 1237 break; 1238 } 1239 1240 U16_APPEND_UNSAFE(buf, destIx, c); 1241 do { 1242 mapToNative[dIx++] = (uint8_t)(cIx - ix); 1243 } while (dIx < destIx); 1244 1245 do { 1246 mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved; 1247 } while (cIx < srcIx); 1248 } 1249 if (srcIx>=strLen) { 1250 break; 1251 } 1252 1253 } 1254 1255 // store Native <--> Chunk Map entries for the end of the buffer. 1256 // There is no actual character here, but the index position is valid. 1257 mapToNative[destIx] = (uint8_t)(srcIx - ix); 1258 mapToUChars[srcIx - ix] = (uint8_t)destIx; 1259 1260 // fill in Buffer descriptor 1261 u8b->bufNativeStart = ix; 1262 u8b->bufNativeLimit = srcIx; 1263 u8b->bufStartIdx = 0; 1264 u8b->bufLimitIdx = destIx; 1265 if (seenNonAscii == FALSE) { 1266 u8b->bufNILimit = destIx; 1267 } 1268 u8b->toUCharsMapStart = u8b->bufNativeStart; 1269 1270 // Set UText chunk to refer to this buffer. 1271 ut->chunkContents = buf; 1272 ut->chunkOffset = 0; 1273 ut->chunkLength = u8b->bufLimitIdx; 1274 ut->chunkNativeStart = u8b->bufNativeStart; 1275 ut->chunkNativeLimit = u8b->bufNativeLimit; 1276 ut->nativeIndexingLimit = u8b->bufNILimit; 1277 1278 // For zero terminated strings, keep track of the maximum point 1279 // scanned so far. 1280 if (nulTerminated && srcIx>ut->c) { 1281 ut->c = srcIx; 1282 if (c==0) { 1283 // We scanned to the end. 1284 // Remember the actual length. 1285 ut->b = srcIx; 1286 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1287 } 1288 } 1289 return TRUE; 1290 } 1291 1292 1293fillReverse: 1294 { 1295 // Move the incoming index to a code point boundary. 1296 // Can only do this if the incoming index is somewhere in the interior of the string. 1297 // If index is at the end, there is no character there to look at. 1298 if (ix != ut->b) { 1299 U8_SET_CP_START(s8, 0, ix); 1300 } 1301 1302 // Swap the UText buffers. 1303 // We want to fill what was previously the alternate buffer, 1304 // and make what was the current buffer be the new alternate. 1305 UTF8Buf *u8b = (UTF8Buf *)ut->q; 1306 ut->q = ut->p; 1307 ut->p = u8b; 1308 1309 UChar *buf = u8b->buf; 1310 uint8_t *mapToNative = u8b->mapToNative; 1311 uint8_t *mapToUChars = u8b->mapToUChars; 1312 int32_t toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1); 1313 int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region 1314 // at end of buffer to leave room 1315 // for a surrogate pair at the 1316 // buffer start. 1317 int32_t srcIx = ix; 1318 int32_t bufNILimit = destIx; 1319 UChar32 c; 1320 1321 // Map to/from Native Indexes, fill in for the position at the end of 1322 // the buffer. 1323 // 1324 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1325 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx; 1326 1327 // Fill the chunk buffer 1328 // Work backwards, filling from the end of the buffer towards the front. 1329 // 1330 while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) { 1331 srcIx--; 1332 destIx--; 1333 1334 // Get last byte of the UTF-8 character 1335 c = s8[srcIx]; 1336 if (c<0x80) { 1337 // Special case ASCII range for speed. 1338 buf[destIx] = (UChar)c; 1339 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx; 1340 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1341 } else { 1342 // General case, handle everything non-ASCII. 1343 1344 int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char 1345 1346 // Get the full character from the UTF8 string. 1347 // use code derived from tbe macros in utf8.h 1348 // Leaves srcIx pointing at the first byte of the UTF-8 char. 1349 // 1350 c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3); 1351 // leaves srcIx at first byte of the multi-byte char. 1352 1353 // Store the character in UTF-16 buffer. 1354 if (c<0x10000) { 1355 buf[destIx] = (UChar)c; 1356 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1357 } else { 1358 buf[destIx] = U16_TRAIL(c); 1359 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1360 buf[--destIx] = U16_LEAD(c); 1361 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1362 } 1363 1364 // Fill in the map from native indexes to UChars buf index. 1365 do { 1366 mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx; 1367 } while (sIx >= srcIx); 1368 1369 // Set native indexing limit to be the current position. 1370 // We are processing a non-ascii, non-native-indexing char now; 1371 // the limit will be here if the rest of the chars to be 1372 // added to this buffer are ascii. 1373 bufNILimit = destIx; 1374 } 1375 } 1376 u8b->bufNativeStart = srcIx; 1377 u8b->bufNativeLimit = ix; 1378 u8b->bufStartIdx = destIx; 1379 u8b->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2; 1380 u8b->bufNILimit = bufNILimit - u8b->bufStartIdx; 1381 u8b->toUCharsMapStart = toUCharsMapStart; 1382 1383 ut->chunkContents = &buf[u8b->bufStartIdx]; 1384 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; 1385 ut->chunkOffset = ut->chunkLength; 1386 ut->chunkNativeStart = u8b->bufNativeStart; 1387 ut->chunkNativeLimit = u8b->bufNativeLimit; 1388 ut->nativeIndexingLimit = u8b->bufNILimit; 1389 return TRUE; 1390 } 1391 1392} 1393 1394 1395 1396// 1397// This is a slightly modified copy of u_strFromUTF8, 1398// Inserts a Replacement Char rather than failing on invalid UTF-8 1399// Removes unnecessary features. 1400// 1401static UChar* 1402utext_strFromUTF8(UChar *dest, 1403 int32_t destCapacity, 1404 int32_t *pDestLength, 1405 const char* src, 1406 int32_t srcLength, // required. NUL terminated not supported. 1407 UErrorCode *pErrorCode 1408 ) 1409{ 1410 1411 UChar *pDest = dest; 1412 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; 1413 UChar32 ch=0; 1414 int32_t index = 0; 1415 int32_t reqLength = 0; 1416 uint8_t* pSrc = (uint8_t*) src; 1417 1418 1419 while((index < srcLength)&&(pDest<pDestLimit)){ 1420 ch = pSrc[index++]; 1421 if(ch <=0x7f){ 1422 *pDest++=(UChar)ch; 1423 }else{ 1424 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); 1425 if(U_IS_BMP(ch)){ 1426 *(pDest++)=(UChar)ch; 1427 }else{ 1428 *(pDest++)=U16_LEAD(ch); 1429 if(pDest<pDestLimit){ 1430 *(pDest++)=U16_TRAIL(ch); 1431 }else{ 1432 reqLength++; 1433 break; 1434 } 1435 } 1436 } 1437 } 1438 /* donot fill the dest buffer just count the UChars needed */ 1439 while(index < srcLength){ 1440 ch = pSrc[index++]; 1441 if(ch <= 0x7f){ 1442 reqLength++; 1443 }else{ 1444 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); 1445 reqLength+=U16_LENGTH(ch); 1446 } 1447 } 1448 1449 reqLength+=(int32_t)(pDest - dest); 1450 1451 if(pDestLength){ 1452 *pDestLength = reqLength; 1453 } 1454 1455 /* Terminate the buffer */ 1456 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 1457 1458 return dest; 1459} 1460 1461 1462 1463static int32_t U_CALLCONV 1464utf8TextExtract(UText *ut, 1465 int64_t start, int64_t limit, 1466 UChar *dest, int32_t destCapacity, 1467 UErrorCode *pErrorCode) { 1468 if(U_FAILURE(*pErrorCode)) { 1469 return 0; 1470 } 1471 if(destCapacity<0 || (dest==NULL && destCapacity>0)) { 1472 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1473 return 0; 1474 } 1475 int32_t length = ut->b; 1476 int32_t start32 = pinIndex(start, length); 1477 int32_t limit32 = pinIndex(limit, length); 1478 1479 if(start32>limit32) { 1480 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1481 return 0; 1482 } 1483 1484 1485 // adjust the incoming indexes to land on code point boundaries if needed. 1486 // adjust by no more than three, because that is the largest number of trail bytes 1487 // in a well formed UTF8 character. 1488 const uint8_t *buf = (const uint8_t *)ut->context; 1489 int i; 1490 if (start32 < ut->chunkNativeLimit) { 1491 for (i=0; i<3; i++) { 1492 if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) { 1493 break; 1494 } 1495 start32--; 1496 } 1497 } 1498 1499 if (limit32 < ut->chunkNativeLimit) { 1500 for (i=0; i<3; i++) { 1501 if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) { 1502 break; 1503 } 1504 limit32--; 1505 } 1506 } 1507 1508 // Do the actual extract. 1509 int32_t destLength=0; 1510 utext_strFromUTF8(dest, destCapacity, &destLength, 1511 (const char *)ut->context+start32, limit32-start32, 1512 pErrorCode); 1513 utf8TextAccess(ut, limit32, TRUE); 1514 return destLength; 1515} 1516 1517// 1518// utf8TextMapOffsetToNative 1519// 1520// Map a chunk (UTF-16) offset to a native index. 1521static int64_t U_CALLCONV 1522utf8TextMapOffsetToNative(const UText *ut) { 1523 // 1524 UTF8Buf *u8b = (UTF8Buf *)ut->p; 1525 U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength); 1526 int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart; 1527 U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit); 1528 return nativeOffset; 1529} 1530 1531// 1532// Map a native index to the corrsponding chunk offset 1533// 1534static int32_t U_CALLCONV 1535utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) { 1536 U_ASSERT(index64 <= 0x7fffffff); 1537 int32_t index = (int32_t)index64; 1538 UTF8Buf *u8b = (UTF8Buf *)ut->p; 1539 U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit); 1540 U_ASSERT(index<=ut->chunkNativeLimit); 1541 int32_t mapIndex = index - u8b->toUCharsMapStart; 1542 int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1543 U_ASSERT(offset>=0 && offset<=ut->chunkLength); 1544 return offset; 1545} 1546 1547static UText * U_CALLCONV 1548utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) 1549{ 1550 // First do a generic shallow clone. Does everything needed for the UText struct itself. 1551 dest = shallowTextClone(dest, src, status); 1552 1553 // For deep clones, make a copy of the string. 1554 // The copied storage is owned by the newly created clone. 1555 // 1556 // TODO: There is an isssue with using utext_nativeLength(). 1557 // That function is non-const in cases where the input was NUL terminated 1558 // and the length has not yet been determined. 1559 // This function (clone()) is const. 1560 // There potentially a thread safety issue lurking here. 1561 // 1562 if (deep && U_SUCCESS(*status)) { 1563 int32_t len = (int32_t)utext_nativeLength((UText *)src); 1564 char *copyStr = (char *)uprv_malloc(len+1); 1565 if (copyStr == NULL) { 1566 *status = U_MEMORY_ALLOCATION_ERROR; 1567 } else { 1568 uprv_memcpy(copyStr, src->context, len+1); 1569 dest->context = copyStr; 1570 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1571 } 1572 } 1573 return dest; 1574} 1575 1576 1577static void U_CALLCONV 1578utf8TextClose(UText *ut) { 1579 // Most of the work of close is done by the generic UText framework close. 1580 // All that needs to be done here is to delete the UTF8 string if the UText 1581 // owns it. This occurs if the UText was created by cloning. 1582 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1583 char *s = (char *)ut->context; 1584 uprv_free(s); 1585 ut->context = NULL; 1586 } 1587} 1588 1589U_CDECL_END 1590 1591 1592static const struct UTextFuncs utf8Funcs = 1593{ 1594 sizeof(UTextFuncs), 1595 0, 0, 0, // Reserved alignment padding 1596 utf8TextClone, 1597 utf8TextLength, 1598 utf8TextAccess, 1599 utf8TextExtract, 1600 NULL, /* replace*/ 1601 NULL, /* copy */ 1602 utf8TextMapOffsetToNative, 1603 utf8TextMapIndexToUTF16, 1604 utf8TextClose, 1605 NULL, // spare 1 1606 NULL, // spare 2 1607 NULL // spare 3 1608}; 1609 1610 1611static const char gEmptyString[] = {0}; 1612 1613U_CAPI UText * U_EXPORT2 1614utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) { 1615 if(U_FAILURE(*status)) { 1616 return NULL; 1617 } 1618 if(s==NULL && length==0) { 1619 s = gEmptyString; 1620 } 1621 1622 if(s==NULL || length<-1 || length>INT32_MAX) { 1623 *status=U_ILLEGAL_ARGUMENT_ERROR; 1624 return NULL; 1625 } 1626 1627 ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status); 1628 if (U_FAILURE(*status)) { 1629 return ut; 1630 } 1631 1632 ut->pFuncs = &utf8Funcs; 1633 ut->context = s; 1634 ut->b = (int32_t)length; 1635 ut->c = (int32_t)length; 1636 if (ut->c < 0) { 1637 ut->c = 0; 1638 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1639 } 1640 ut->p = ut->pExtra; 1641 ut->q = (char *)ut->pExtra + sizeof(UTF8Buf); 1642 return ut; 1643 1644} 1645 1646 1647 1648 1649 1650 1651 1652 1653//------------------------------------------------------------------------------ 1654// 1655// UText implementation wrapper for Replaceable (read/write) 1656// 1657// Use of UText data members: 1658// context pointer to Replaceable. 1659// p pointer to Replaceable if it is owned by the UText. 1660// 1661//------------------------------------------------------------------------------ 1662 1663 1664 1665// minimum chunk size for this implementation: 3 1666// to allow for possible trimming for code point boundaries 1667enum { REP_TEXT_CHUNK_SIZE=10 }; 1668 1669struct ReplExtra { 1670 /* 1671 * Chunk UChars. 1672 * +1 to simplify filling with surrogate pair at the end. 1673 */ 1674 UChar s[REP_TEXT_CHUNK_SIZE+1]; 1675}; 1676 1677 1678U_CDECL_BEGIN 1679 1680static UText * U_CALLCONV 1681repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { 1682 // First do a generic shallow clone. Does everything needed for the UText struct itself. 1683 dest = shallowTextClone(dest, src, status); 1684 1685 // For deep clones, make a copy of the Replaceable. 1686 // The copied Replaceable storage is owned by the newly created UText clone. 1687 // A non-NULL pointer in UText.p is the signal to the close() function to delete 1688 // it. 1689 // 1690 if (deep && U_SUCCESS(*status)) { 1691 const Replaceable *replSrc = (const Replaceable *)src->context; 1692 dest->context = replSrc->clone(); 1693 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1694 1695 // with deep clone, the copy is writable, even when the source is not. 1696 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 1697 } 1698 return dest; 1699} 1700 1701 1702static void U_CALLCONV 1703repTextClose(UText *ut) { 1704 // Most of the work of close is done by the generic UText framework close. 1705 // All that needs to be done here is delete the Replaceable if the UText 1706 // owns it. This occurs if the UText was created by cloning. 1707 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1708 Replaceable *rep = (Replaceable *)ut->context; 1709 delete rep; 1710 ut->context = NULL; 1711 } 1712} 1713 1714 1715static int64_t U_CALLCONV 1716repTextLength(UText *ut) { 1717 const Replaceable *replSrc = (const Replaceable *)ut->context; 1718 int32_t len = replSrc->length(); 1719 return len; 1720} 1721 1722 1723static UBool U_CALLCONV 1724repTextAccess(UText *ut, int64_t index, UBool forward) { 1725 const Replaceable *rep=(const Replaceable *)ut->context; 1726 int32_t length=rep->length(); // Full length of the input text (bigger than a chunk) 1727 1728 // clip the requested index to the limits of the text. 1729 int32_t index32 = pinIndex(index, length); 1730 U_ASSERT(index<=INT32_MAX); 1731 1732 1733 /* 1734 * Compute start/limit boundaries around index, for a segment of text 1735 * to be extracted. 1736 * To allow for the possibility that our user gave an index to the trailing 1737 * half of a surrogate pair, we must request one extra preceding UChar when 1738 * going in the forward direction. This will ensure that the buffer has the 1739 * entire code point at the specified index. 1740 */ 1741 if(forward) { 1742 1743 if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) { 1744 // Buffer already contains the requested position. 1745 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); 1746 return TRUE; 1747 } 1748 if (index32>=length && ut->chunkNativeLimit==length) { 1749 // Request for end of string, and buffer already extends up to it. 1750 // Can't get the data, but don't change the buffer. 1751 ut->chunkOffset = length - (int32_t)ut->chunkNativeStart; 1752 return FALSE; 1753 } 1754 1755 ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1; 1756 // Going forward, so we want to have the buffer with stuff at and beyond 1757 // the requested index. The -1 gets us one code point before the 1758 // requested index also, to handle the case of the index being on 1759 // a trail surrogate of a surrogate pair. 1760 if(ut->chunkNativeLimit > length) { 1761 ut->chunkNativeLimit = length; 1762 } 1763 // unless buffer ran off end, start is index-1. 1764 ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE; 1765 if(ut->chunkNativeStart < 0) { 1766 ut->chunkNativeStart = 0; 1767 } 1768 } else { 1769 // Reverse iteration. Fill buffer with data preceding the requested index. 1770 if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) { 1771 // Requested position already in buffer. 1772 ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart; 1773 return TRUE; 1774 } 1775 if (index32==0 && ut->chunkNativeStart==0) { 1776 // Request for start, buffer already begins at start. 1777 // No data, but keep the buffer as is. 1778 ut->chunkOffset = 0; 1779 return FALSE; 1780 } 1781 1782 // Figure out the bounds of the chunk to extract for reverse iteration. 1783 // Need to worry about chunk not splitting surrogate pairs, and while still 1784 // containing the data we need. 1785 // Fix by requesting a chunk that includes an extra UChar at the end. 1786 // If this turns out to be a lead surrogate, we can lop it off and still have 1787 // the data we wanted. 1788 ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE; 1789 if (ut->chunkNativeStart < 0) { 1790 ut->chunkNativeStart = 0; 1791 } 1792 1793 ut->chunkNativeLimit = index32 + 1; 1794 if (ut->chunkNativeLimit > length) { 1795 ut->chunkNativeLimit = length; 1796 } 1797 } 1798 1799 // Extract the new chunk of text from the Replaceable source. 1800 ReplExtra *ex = (ReplExtra *)ut->pExtra; 1801 // UnicodeString with its buffer a writable alias to the chunk buffer 1802 UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/); 1803 rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer); 1804 1805 ut->chunkContents = ex->s; 1806 ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart); 1807 ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart); 1808 1809 // Surrogate pairs from the input text must not span chunk boundaries. 1810 // If end of chunk could be the start of a surrogate, trim it off. 1811 if (ut->chunkNativeLimit < length && 1812 U16_IS_LEAD(ex->s[ut->chunkLength-1])) { 1813 ut->chunkLength--; 1814 ut->chunkNativeLimit--; 1815 if (ut->chunkOffset > ut->chunkLength) { 1816 ut->chunkOffset = ut->chunkLength; 1817 } 1818 } 1819 1820 // if the first UChar in the chunk could be the trailing half of a surrogate pair, 1821 // trim it off. 1822 if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) { 1823 ++(ut->chunkContents); 1824 ++(ut->chunkNativeStart); 1825 --(ut->chunkLength); 1826 --(ut->chunkOffset); 1827 } 1828 1829 // adjust the index/chunkOffset to a code point boundary 1830 U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset); 1831 1832 // Use fast indexing for get/setNativeIndex() 1833 ut->nativeIndexingLimit = ut->chunkLength; 1834 1835 return TRUE; 1836} 1837 1838 1839 1840static int32_t U_CALLCONV 1841repTextExtract(UText *ut, 1842 int64_t start, int64_t limit, 1843 UChar *dest, int32_t destCapacity, 1844 UErrorCode *status) { 1845 const Replaceable *rep=(const Replaceable *)ut->context; 1846 int32_t length=rep->length(); 1847 1848 if(U_FAILURE(*status)) { 1849 return 0; 1850 } 1851 if(destCapacity<0 || (dest==NULL && destCapacity>0)) { 1852 *status=U_ILLEGAL_ARGUMENT_ERROR; 1853 } 1854 if(start>limit) { 1855 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1856 return 0; 1857 } 1858 1859 int32_t start32 = pinIndex(start, length); 1860 int32_t limit32 = pinIndex(limit, length); 1861 1862 // adjust start, limit if they point to trail half of surrogates 1863 if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) && 1864 U_IS_SUPPLEMENTARY(rep->char32At(start32))){ 1865 start32--; 1866 } 1867 if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) && 1868 U_IS_SUPPLEMENTARY(rep->char32At(limit32))){ 1869 limit32--; 1870 } 1871 1872 length=limit32-start32; 1873 if(length>destCapacity) { 1874 limit32 = start32 + destCapacity; 1875 } 1876 UnicodeString buffer(dest, 0, destCapacity); // writable alias 1877 rep->extractBetween(start32, limit32, buffer); 1878 repTextAccess(ut, limit32, TRUE); 1879 1880 return u_terminateUChars(dest, destCapacity, length, status); 1881} 1882 1883static int32_t U_CALLCONV 1884repTextReplace(UText *ut, 1885 int64_t start, int64_t limit, 1886 const UChar *src, int32_t length, 1887 UErrorCode *status) { 1888 Replaceable *rep=(Replaceable *)ut->context; 1889 int32_t oldLength; 1890 1891 if(U_FAILURE(*status)) { 1892 return 0; 1893 } 1894 if(src==NULL && length!=0) { 1895 *status=U_ILLEGAL_ARGUMENT_ERROR; 1896 return 0; 1897 } 1898 oldLength=rep->length(); // will subtract from new length 1899 if(start>limit ) { 1900 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1901 return 0; 1902 } 1903 1904 int32_t start32 = pinIndex(start, oldLength); 1905 int32_t limit32 = pinIndex(limit, oldLength); 1906 1907 // Snap start & limit to code point boundaries. 1908 if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) && 1909 start32>0 && U16_IS_LEAD(rep->charAt(start32-1))) 1910 { 1911 start32--; 1912 } 1913 if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) && 1914 U16_IS_TRAIL(rep->charAt(limit32))) 1915 { 1916 limit32++; 1917 } 1918 1919 // Do the actual replace operation using methods of the Replaceable class 1920 UnicodeString replStr((UBool)(length<0), src, length); // read-only alias 1921 rep->handleReplaceBetween(start32, limit32, replStr); 1922 int32_t newLength = rep->length(); 1923 int32_t lengthDelta = newLength - oldLength; 1924 1925 // Is the UText chunk buffer OK? 1926 if (ut->chunkNativeLimit > start32) { 1927 // this replace operation may have impacted the current chunk. 1928 // invalidate it, which will force a reload on the next access. 1929 invalidateChunk(ut); 1930 } 1931 1932 // set the iteration position to the end of the newly inserted replacement text. 1933 int32_t newIndexPos = limit32 + lengthDelta; 1934 repTextAccess(ut, newIndexPos, TRUE); 1935 1936 return lengthDelta; 1937} 1938 1939 1940static void U_CALLCONV 1941repTextCopy(UText *ut, 1942 int64_t start, int64_t limit, 1943 int64_t destIndex, 1944 UBool move, 1945 UErrorCode *status) 1946{ 1947 Replaceable *rep=(Replaceable *)ut->context; 1948 int32_t length=rep->length(); 1949 1950 if(U_FAILURE(*status)) { 1951 return; 1952 } 1953 if (start>limit || (start<destIndex && destIndex<limit)) 1954 { 1955 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1956 return; 1957 } 1958 1959 int32_t start32 = pinIndex(start, length); 1960 int32_t limit32 = pinIndex(limit, length); 1961 int32_t destIndex32 = pinIndex(destIndex, length); 1962 1963 // TODO: snap input parameters to code point boundaries. 1964 1965 if(move) { 1966 // move: copy to destIndex, then replace original with nothing 1967 int32_t segLength=limit32-start32; 1968 rep->copy(start32, limit32, destIndex32); 1969 if(destIndex32<start32) { 1970 start32+=segLength; 1971 limit32+=segLength; 1972 } 1973 rep->handleReplaceBetween(start32, limit32, UnicodeString()); 1974 } else { 1975 // copy 1976 rep->copy(start32, limit32, destIndex32); 1977 } 1978 1979 // If the change to the text touched the region in the chunk buffer, 1980 // invalidate the buffer. 1981 int32_t firstAffectedIndex = destIndex32; 1982 if (move && start32<firstAffectedIndex) { 1983 firstAffectedIndex = start32; 1984 } 1985 if (firstAffectedIndex < ut->chunkNativeLimit) { 1986 // changes may have affected range covered by the chunk 1987 invalidateChunk(ut); 1988 } 1989 1990 // Put iteration position at the newly inserted (moved) block, 1991 int32_t nativeIterIndex = destIndex32 + limit32 - start32; 1992 if (move && destIndex32>start32) { 1993 // moved a block of text towards the end of the string. 1994 nativeIterIndex = destIndex32; 1995 } 1996 1997 // Set position, reload chunk if needed. 1998 repTextAccess(ut, nativeIterIndex, TRUE); 1999} 2000 2001static const struct UTextFuncs repFuncs = 2002{ 2003 sizeof(UTextFuncs), 2004 0, 0, 0, // Reserved alignment padding 2005 repTextClone, 2006 repTextLength, 2007 repTextAccess, 2008 repTextExtract, 2009 repTextReplace, 2010 repTextCopy, 2011 NULL, // MapOffsetToNative, 2012 NULL, // MapIndexToUTF16, 2013 repTextClose, 2014 NULL, // spare 1 2015 NULL, // spare 2 2016 NULL // spare 3 2017}; 2018 2019 2020U_CAPI UText * U_EXPORT2 2021utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status) 2022{ 2023 if(U_FAILURE(*status)) { 2024 return NULL; 2025 } 2026 if(rep==NULL) { 2027 *status=U_ILLEGAL_ARGUMENT_ERROR; 2028 return NULL; 2029 } 2030 ut = utext_setup(ut, sizeof(ReplExtra), status); 2031 if(U_FAILURE(*status)) { 2032 return ut; 2033 } 2034 2035 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE); 2036 if(rep->hasMetaData()) { 2037 ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA); 2038 } 2039 2040 ut->pFuncs = &repFuncs; 2041 ut->context = rep; 2042 return ut; 2043} 2044 2045U_CDECL_END 2046 2047 2048 2049 2050 2051 2052 2053 2054//------------------------------------------------------------------------------ 2055// 2056// UText implementation for UnicodeString (read/write) and 2057// for const UnicodeString (read only) 2058// (same implementation, only the flags are different) 2059// 2060// Use of UText data members: 2061// context pointer to UnicodeString 2062// p pointer to UnicodeString IF this UText owns the string 2063// and it must be deleted on close(). NULL otherwise. 2064// 2065//------------------------------------------------------------------------------ 2066 2067U_CDECL_BEGIN 2068 2069 2070static UText * U_CALLCONV 2071unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { 2072 // First do a generic shallow clone. Does everything needed for the UText struct itself. 2073 dest = shallowTextClone(dest, src, status); 2074 2075 // For deep clones, make a copy of the UnicodeSring. 2076 // The copied UnicodeString storage is owned by the newly created UText clone. 2077 // A non-NULL pointer in UText.p is the signal to the close() function to delete 2078 // the UText. 2079 // 2080 if (deep && U_SUCCESS(*status)) { 2081 const UnicodeString *srcString = (const UnicodeString *)src->context; 2082 dest->context = new UnicodeString(*srcString); 2083 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 2084 2085 // with deep clone, the copy is writable, even when the source is not. 2086 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 2087 } 2088 return dest; 2089} 2090 2091static void U_CALLCONV 2092unistrTextClose(UText *ut) { 2093 // Most of the work of close is done by the generic UText framework close. 2094 // All that needs to be done here is delete the UnicodeString if the UText 2095 // owns it. This occurs if the UText was created by cloning. 2096 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 2097 UnicodeString *str = (UnicodeString *)ut->context; 2098 delete str; 2099 ut->context = NULL; 2100 } 2101} 2102 2103 2104static int64_t U_CALLCONV 2105unistrTextLength(UText *t) { 2106 return ((const UnicodeString *)t->context)->length(); 2107} 2108 2109 2110static UBool U_CALLCONV 2111unistrTextAccess(UText *ut, int64_t index, UBool forward) { 2112 int32_t length = ut->chunkLength; 2113 ut->chunkOffset = pinIndex(index, length); 2114 2115 // Check whether request is at the start or end 2116 UBool retVal = (forward && index<length) || (!forward && index>0); 2117 return retVal; 2118} 2119 2120 2121 2122static int32_t U_CALLCONV 2123unistrTextExtract(UText *t, 2124 int64_t start, int64_t limit, 2125 UChar *dest, int32_t destCapacity, 2126 UErrorCode *pErrorCode) { 2127 const UnicodeString *us=(const UnicodeString *)t->context; 2128 int32_t length=us->length(); 2129 2130 if(U_FAILURE(*pErrorCode)) { 2131 return 0; 2132 } 2133 if(destCapacity<0 || (dest==NULL && destCapacity>0)) { 2134 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 2135 } 2136 if(start<0 || start>limit) { 2137 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2138 return 0; 2139 } 2140 2141 int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length; 2142 int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length; 2143 2144 length=limit32-start32; 2145 if (destCapacity>0 && dest!=NULL) { 2146 int32_t trimmedLength = length; 2147 if(trimmedLength>destCapacity) { 2148 trimmedLength=destCapacity; 2149 } 2150 us->extract(start32, trimmedLength, dest); 2151 t->chunkOffset = start32+trimmedLength; 2152 } else { 2153 t->chunkOffset = start32; 2154 } 2155 u_terminateUChars(dest, destCapacity, length, pErrorCode); 2156 return length; 2157} 2158 2159static int32_t U_CALLCONV 2160unistrTextReplace(UText *ut, 2161 int64_t start, int64_t limit, 2162 const UChar *src, int32_t length, 2163 UErrorCode *pErrorCode) { 2164 UnicodeString *us=(UnicodeString *)ut->context; 2165 int32_t oldLength; 2166 2167 if(U_FAILURE(*pErrorCode)) { 2168 return 0; 2169 } 2170 if(src==NULL && length!=0) { 2171 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 2172 } 2173 if(start>limit) { 2174 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2175 return 0; 2176 } 2177 oldLength=us->length(); 2178 int32_t start32 = pinIndex(start, oldLength); 2179 int32_t limit32 = pinIndex(limit, oldLength); 2180 if (start32 < oldLength) { 2181 start32 = us->getChar32Start(start32); 2182 } 2183 if (limit32 < oldLength) { 2184 limit32 = us->getChar32Start(limit32); 2185 } 2186 2187 // replace 2188 us->replace(start32, limit32-start32, src, length); 2189 int32_t newLength = us->length(); 2190 2191 // Update the chunk description. 2192 ut->chunkContents = us->getBuffer(); 2193 ut->chunkLength = newLength; 2194 ut->chunkNativeLimit = newLength; 2195 ut->nativeIndexingLimit = newLength; 2196 2197 // Set iteration position to the point just following the newly inserted text. 2198 int32_t lengthDelta = newLength - oldLength; 2199 ut->chunkOffset = limit32 + lengthDelta; 2200 2201 return lengthDelta; 2202} 2203 2204static void U_CALLCONV 2205unistrTextCopy(UText *ut, 2206 int64_t start, int64_t limit, 2207 int64_t destIndex, 2208 UBool move, 2209 UErrorCode *pErrorCode) { 2210 UnicodeString *us=(UnicodeString *)ut->context; 2211 int32_t length=us->length(); 2212 2213 if(U_FAILURE(*pErrorCode)) { 2214 return; 2215 } 2216 int32_t start32 = pinIndex(start, length); 2217 int32_t limit32 = pinIndex(limit, length); 2218 int32_t destIndex32 = pinIndex(destIndex, length); 2219 2220 if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) { 2221 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2222 return; 2223 } 2224 2225 if(move) { 2226 // move: copy to destIndex, then replace original with nothing 2227 int32_t segLength=limit32-start32; 2228 us->copy(start32, limit32, destIndex32); 2229 if(destIndex32<start32) { 2230 start32+=segLength; 2231 } 2232 us->replace(start32, segLength, NULL, 0); 2233 } else { 2234 // copy 2235 us->copy(start32, limit32, destIndex32); 2236 } 2237 2238 // update chunk description, set iteration position. 2239 ut->chunkContents = us->getBuffer(); 2240 if (move==FALSE) { 2241 // copy operation, string length grows 2242 ut->chunkLength += limit32-start32; 2243 ut->chunkNativeLimit = ut->chunkLength; 2244 ut->nativeIndexingLimit = ut->chunkLength; 2245 } 2246 2247 // Iteration position to end of the newly inserted text. 2248 ut->chunkOffset = destIndex32+limit32-start32; 2249 if (move && destIndex32>start32) { 2250 ut->chunkOffset = destIndex32; 2251 } 2252 2253} 2254 2255static const struct UTextFuncs unistrFuncs = 2256{ 2257 sizeof(UTextFuncs), 2258 0, 0, 0, // Reserved alignment padding 2259 unistrTextClone, 2260 unistrTextLength, 2261 unistrTextAccess, 2262 unistrTextExtract, 2263 unistrTextReplace, 2264 unistrTextCopy, 2265 NULL, // MapOffsetToNative, 2266 NULL, // MapIndexToUTF16, 2267 unistrTextClose, 2268 NULL, // spare 1 2269 NULL, // spare 2 2270 NULL // spare 3 2271}; 2272 2273 2274 2275U_CDECL_END 2276 2277 2278U_CAPI UText * U_EXPORT2 2279utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) { 2280 ut = utext_openConstUnicodeString(ut, s, status); 2281 if (U_SUCCESS(*status)) { 2282 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 2283 } 2284 return ut; 2285} 2286 2287 2288 2289U_CAPI UText * U_EXPORT2 2290utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) { 2291 if (U_SUCCESS(*status) && s->isBogus()) { 2292 // The UnicodeString is bogus, but we still need to detach the UText 2293 // from whatever it was hooked to before, if anything. 2294 utext_openUChars(ut, NULL, 0, status); 2295 *status = U_ILLEGAL_ARGUMENT_ERROR; 2296 return ut; 2297 } 2298 ut = utext_setup(ut, 0, status); 2299 // note: use the standard (writable) function table for UnicodeString. 2300 // The flag settings disable writing, so having the functions in 2301 // the table is harmless. 2302 if (U_SUCCESS(*status)) { 2303 ut->pFuncs = &unistrFuncs; 2304 ut->context = s; 2305 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); 2306 ut->chunkContents = s->getBuffer(); 2307 ut->chunkLength = s->length(); 2308 ut->chunkNativeStart = 0; 2309 ut->chunkNativeLimit = ut->chunkLength; 2310 ut->nativeIndexingLimit = ut->chunkLength; 2311 } 2312 return ut; 2313} 2314 2315//------------------------------------------------------------------------------ 2316// 2317// UText implementation for const UChar * strings 2318// 2319// Use of UText data members: 2320// context pointer to UnicodeString 2321// a length. -1 if not yet known. 2322// 2323// TODO: support 64 bit lengths. 2324// 2325//------------------------------------------------------------------------------ 2326 2327U_CDECL_BEGIN 2328 2329 2330static UText * U_CALLCONV 2331ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) { 2332 // First do a generic shallow clone. 2333 dest = shallowTextClone(dest, src, status); 2334 2335 // For deep clones, make a copy of the string. 2336 // The copied storage is owned by the newly created clone. 2337 // A non-NULL pointer in UText.p is the signal to the close() function to delete 2338 // it. 2339 // 2340 if (deep && U_SUCCESS(*status)) { 2341 U_ASSERT(utext_nativeLength(dest) < INT32_MAX); 2342 int32_t len = (int32_t)utext_nativeLength(dest); 2343 2344 // The cloned string IS going to be NUL terminated, whether or not the original was. 2345 const UChar *srcStr = (const UChar *)src->context; 2346 UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar)); 2347 if (copyStr == NULL) { 2348 *status = U_MEMORY_ALLOCATION_ERROR; 2349 } else { 2350 int64_t i; 2351 for (i=0; i<len; i++) { 2352 copyStr[i] = srcStr[i]; 2353 } 2354 copyStr[len] = 0; 2355 dest->context = copyStr; 2356 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 2357 } 2358 } 2359 return dest; 2360} 2361 2362 2363static void U_CALLCONV 2364ucstrTextClose(UText *ut) { 2365 // Most of the work of close is done by the generic UText framework close. 2366 // All that needs to be done here is delete the string if the UText 2367 // owns it. This occurs if the UText was created by cloning. 2368 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 2369 UChar *s = (UChar *)ut->context; 2370 uprv_free(s); 2371 ut->context = NULL; 2372 } 2373} 2374 2375 2376 2377static int64_t U_CALLCONV 2378ucstrTextLength(UText *ut) { 2379 if (ut->a < 0) { 2380 // null terminated, we don't yet know the length. Scan for it. 2381 // Access is not convenient for doing this 2382 // because the current interation postion can't be changed. 2383 const UChar *str = (const UChar *)ut->context; 2384 for (;;) { 2385 if (str[ut->chunkNativeLimit] == 0) { 2386 break; 2387 } 2388 ut->chunkNativeLimit++; 2389 } 2390 ut->a = ut->chunkNativeLimit; 2391 ut->chunkLength = (int32_t)ut->chunkNativeLimit; 2392 ut->nativeIndexingLimit = ut->chunkLength; 2393 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2394 } 2395 return ut->a; 2396} 2397 2398 2399static UBool U_CALLCONV 2400ucstrTextAccess(UText *ut, int64_t index, UBool forward) { 2401 const UChar *str = (const UChar *)ut->context; 2402 2403 // pin the requested index to the bounds of the string, 2404 // and set current iteration position. 2405 if (index<0) { 2406 index = 0; 2407 } else if (index < ut->chunkNativeLimit) { 2408 // The request data is within the chunk as it is known so far. 2409 // Put index on a code point boundary. 2410 U16_SET_CP_START(str, 0, index); 2411 } else if (ut->a >= 0) { 2412 // We know the length of this string, and the user is requesting something 2413 // at or beyond the length. Pin the requested index to the length. 2414 index = ut->a; 2415 } else { 2416 // Null terminated string, length not yet known, and the requested index 2417 // is beyond where we have scanned so far. 2418 // Scan to 32 UChars beyond the requested index. The strategy here is 2419 // to avoid fully scanning a long string when the caller only wants to 2420 // see a few characters at its beginning. 2421 int32_t scanLimit = (int32_t)index + 32; 2422 if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression 2423 scanLimit = INT32_MAX; 2424 } 2425 2426 int32_t chunkLimit = (int32_t)ut->chunkNativeLimit; 2427 for (; chunkLimit<scanLimit; chunkLimit++) { 2428 if (str[chunkLimit] == 0) { 2429 // We found the end of the string. Remember it, pin the requested index to it, 2430 // and bail out of here. 2431 ut->a = chunkLimit; 2432 ut->chunkLength = chunkLimit; 2433 ut->nativeIndexingLimit = chunkLimit; 2434 if (index >= chunkLimit) { 2435 index = chunkLimit; 2436 } else { 2437 U16_SET_CP_START(str, 0, index); 2438 } 2439 2440 ut->chunkNativeLimit = chunkLimit; 2441 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2442 goto breakout; 2443 } 2444 } 2445 // We scanned through the next batch of UChars without finding the end. 2446 U16_SET_CP_START(str, 0, index); 2447 if (chunkLimit == INT32_MAX) { 2448 // Scanned to the limit of a 32 bit length. 2449 // Forceably trim the overlength string back so length fits in int32 2450 // TODO: add support for 64 bit strings. 2451 ut->a = chunkLimit; 2452 ut->chunkLength = chunkLimit; 2453 ut->nativeIndexingLimit = chunkLimit; 2454 if (index > chunkLimit) { 2455 index = chunkLimit; 2456 } 2457 ut->chunkNativeLimit = chunkLimit; 2458 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2459 } else { 2460 // The endpoint of a chunk must not be left in the middle of a surrogate pair. 2461 // If the current end is on a lead surrogate, back the end up by one. 2462 // It doesn't matter if the end char happens to be an unpaired surrogate, 2463 // and it's simpler not to worry about it. 2464 if (U16_IS_LEAD(str[chunkLimit-1])) { 2465 --chunkLimit; 2466 } 2467 // Null-terminated chunk with end still unknown. 2468 // Update the chunk length to reflect what has been scanned thus far. 2469 // That the full length is still unknown is (still) flagged by 2470 // ut->a being < 0. 2471 ut->chunkNativeLimit = chunkLimit; 2472 ut->nativeIndexingLimit = chunkLimit; 2473 ut->chunkLength = chunkLimit; 2474 } 2475 2476 } 2477breakout: 2478 U_ASSERT(index<=INT32_MAX); 2479 ut->chunkOffset = (int32_t)index; 2480 2481 // Check whether request is at the start or end 2482 UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0); 2483 return retVal; 2484} 2485 2486 2487 2488static int32_t U_CALLCONV 2489ucstrTextExtract(UText *ut, 2490 int64_t start, int64_t limit, 2491 UChar *dest, int32_t destCapacity, 2492 UErrorCode *pErrorCode) 2493{ 2494 if(U_FAILURE(*pErrorCode)) { 2495 return 0; 2496 } 2497 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) { 2498 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 2499 return 0; 2500 } 2501 2502 //const UChar *s=(const UChar *)ut->context; 2503 int32_t si, di; 2504 2505 int32_t start32; 2506 int32_t limit32; 2507 2508 // Access the start. Does two things we need: 2509 // Pins 'start' to the length of the string, if it came in out-of-bounds. 2510 // Snaps 'start' to the beginning of a code point. 2511 ucstrTextAccess(ut, start, TRUE); 2512 const UChar *s=ut->chunkContents; 2513 start32 = ut->chunkOffset; 2514 2515 int32_t strLength=(int32_t)ut->a; 2516 if (strLength >= 0) { 2517 limit32 = pinIndex(limit, strLength); 2518 } else { 2519 limit32 = pinIndex(limit, INT32_MAX); 2520 } 2521 di = 0; 2522 for (si=start32; si<limit32; si++) { 2523 if (strLength<0 && s[si]==0) { 2524 // Just hit the end of a null-terminated string. 2525 ut->a = si; // set string length for this UText 2526 ut->chunkNativeLimit = si; 2527 ut->chunkLength = si; 2528 ut->nativeIndexingLimit = si; 2529 strLength = si; 2530 break; 2531 } 2532 U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */ 2533 if (di<destCapacity) { 2534 // only store if there is space. 2535 dest[di] = s[si]; 2536 } else { 2537 if (strLength>=0) { 2538 // We have filled the destination buffer, and the string length is known. 2539 // Cut the loop short. There is no need to scan string termination. 2540 di = limit32 - start32; 2541 si = limit32; 2542 break; 2543 } 2544 } 2545 di++; 2546 } 2547 2548 // If the limit index points to a lead surrogate of a pair, 2549 // add the corresponding trail surrogate to the destination. 2550 if (si>0 && U16_IS_LEAD(s[si-1]) && 2551 ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si]))) 2552 { 2553 if (di<destCapacity) { 2554 // store only if there is space in the output buffer. 2555 dest[di++] = s[si++]; 2556 } 2557 } 2558 2559 // Put iteration position at the point just following the extracted text 2560 ut->chunkOffset = uprv_min(strLength, start32 + destCapacity); 2561 2562 // Add a terminating NUL if space in the buffer permits, 2563 // and set the error status as required. 2564 u_terminateUChars(dest, destCapacity, di, pErrorCode); 2565 return di; 2566} 2567 2568static const struct UTextFuncs ucstrFuncs = 2569{ 2570 sizeof(UTextFuncs), 2571 0, 0, 0, // Reserved alignment padding 2572 ucstrTextClone, 2573 ucstrTextLength, 2574 ucstrTextAccess, 2575 ucstrTextExtract, 2576 NULL, // Replace 2577 NULL, // Copy 2578 NULL, // MapOffsetToNative, 2579 NULL, // MapIndexToUTF16, 2580 ucstrTextClose, 2581 NULL, // spare 1 2582 NULL, // spare 2 2583 NULL, // spare 3 2584}; 2585 2586U_CDECL_END 2587 2588static const UChar gEmptyUString[] = {0}; 2589 2590U_CAPI UText * U_EXPORT2 2591utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) { 2592 if (U_FAILURE(*status)) { 2593 return NULL; 2594 } 2595 if(s==NULL && length==0) { 2596 s = gEmptyUString; 2597 } 2598 if (s==NULL || length < -1 || length>INT32_MAX) { 2599 *status = U_ILLEGAL_ARGUMENT_ERROR; 2600 return NULL; 2601 } 2602 ut = utext_setup(ut, 0, status); 2603 if (U_SUCCESS(*status)) { 2604 ut->pFuncs = &ucstrFuncs; 2605 ut->context = s; 2606 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); 2607 if (length==-1) { 2608 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2609 } 2610 ut->a = length; 2611 ut->chunkContents = s; 2612 ut->chunkNativeStart = 0; 2613 ut->chunkNativeLimit = length>=0? length : 0; 2614 ut->chunkLength = (int32_t)ut->chunkNativeLimit; 2615 ut->chunkOffset = 0; 2616 ut->nativeIndexingLimit = ut->chunkLength; 2617 } 2618 return ut; 2619} 2620 2621 2622//------------------------------------------------------------------------------ 2623// 2624// UText implementation for text from ICU CharacterIterators 2625// 2626// Use of UText data members: 2627// context pointer to the CharacterIterator 2628// a length of the full text. 2629// p pointer to buffer 1 2630// b start index of local buffer 1 contents 2631// q pointer to buffer 2 2632// c start index of local buffer 2 contents 2633// r pointer to the character iterator if the UText owns it. 2634// Null otherwise. 2635// 2636//------------------------------------------------------------------------------ 2637#define CIBufSize 16 2638 2639U_CDECL_BEGIN 2640static void U_CALLCONV 2641charIterTextClose(UText *ut) { 2642 // Most of the work of close is done by the generic UText framework close. 2643 // All that needs to be done here is delete the CharacterIterator if the UText 2644 // owns it. This occurs if the UText was created by cloning. 2645 CharacterIterator *ci = (CharacterIterator *)ut->r; 2646 delete ci; 2647 ut->r = NULL; 2648} 2649 2650static int64_t U_CALLCONV 2651charIterTextLength(UText *ut) { 2652 return (int32_t)ut->a; 2653} 2654 2655static UBool U_CALLCONV 2656charIterTextAccess(UText *ut, int64_t index, UBool forward) { 2657 CharacterIterator *ci = (CharacterIterator *)ut->context; 2658 2659 int32_t clippedIndex = (int32_t)index; 2660 if (clippedIndex<0) { 2661 clippedIndex=0; 2662 } else if (clippedIndex>=ut->a) { 2663 clippedIndex=(int32_t)ut->a; 2664 } 2665 int32_t neededIndex = clippedIndex; 2666 if (!forward && neededIndex>0) { 2667 // reverse iteration, want the position just before what was asked for. 2668 neededIndex--; 2669 } else if (forward && neededIndex==ut->a && neededIndex>0) { 2670 // Forward iteration, don't ask for something past the end of the text. 2671 neededIndex--; 2672 } 2673 2674 // Find the native index of the start of the buffer containing what we want. 2675 neededIndex -= neededIndex % CIBufSize; 2676 2677 UChar *buf = NULL; 2678 UBool needChunkSetup = TRUE; 2679 int i; 2680 if (ut->chunkNativeStart == neededIndex) { 2681 // The buffer we want is already the current chunk. 2682 needChunkSetup = FALSE; 2683 } else if (ut->b == neededIndex) { 2684 // The first buffer (buffer p) has what we need. 2685 buf = (UChar *)ut->p; 2686 } else if (ut->c == neededIndex) { 2687 // The second buffer (buffer q) has what we need. 2688 buf = (UChar *)ut->q; 2689 } else { 2690 // Neither buffer already has what we need. 2691 // Load new data from the character iterator. 2692 // Use the buf that is not the current buffer. 2693 buf = (UChar *)ut->p; 2694 if (ut->p == ut->chunkContents) { 2695 buf = (UChar *)ut->q; 2696 } 2697 ci->setIndex(neededIndex); 2698 for (i=0; i<CIBufSize; i++) { 2699 buf[i] = ci->nextPostInc(); 2700 if (i+neededIndex > ut->a) { 2701 break; 2702 } 2703 } 2704 } 2705 2706 // We have a buffer with the data we need. 2707 // Set it up as the current chunk, if it wasn't already. 2708 if (needChunkSetup) { 2709 ut->chunkContents = buf; 2710 ut->chunkLength = CIBufSize; 2711 ut->chunkNativeStart = neededIndex; 2712 ut->chunkNativeLimit = neededIndex + CIBufSize; 2713 if (ut->chunkNativeLimit > ut->a) { 2714 ut->chunkNativeLimit = ut->a; 2715 ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart); 2716 } 2717 ut->nativeIndexingLimit = ut->chunkLength; 2718 U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize); 2719 } 2720 ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart; 2721 UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0); 2722 return success; 2723} 2724 2725static UText * U_CALLCONV 2726charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) { 2727 if (U_FAILURE(*status)) { 2728 return NULL; 2729 } 2730 2731 if (deep) { 2732 // There is no CharacterIterator API for cloning the underlying text storage. 2733 *status = U_UNSUPPORTED_ERROR; 2734 return NULL; 2735 } else { 2736 CharacterIterator *srcCI =(CharacterIterator *)src->context; 2737 srcCI = srcCI->clone(); 2738 dest = utext_openCharacterIterator(dest, srcCI, status); 2739 if (U_FAILURE(*status)) { 2740 return dest; 2741 } 2742 // cast off const on getNativeIndex. 2743 // For CharacterIterator based UTexts, this is safe, the operation is const. 2744 int64_t ix = utext_getNativeIndex((UText *)src); 2745 utext_setNativeIndex(dest, ix); 2746 dest->r = srcCI; // flags that this UText owns the CharacterIterator 2747 } 2748 return dest; 2749} 2750 2751static int32_t U_CALLCONV 2752charIterTextExtract(UText *ut, 2753 int64_t start, int64_t limit, 2754 UChar *dest, int32_t destCapacity, 2755 UErrorCode *status) 2756{ 2757 if(U_FAILURE(*status)) { 2758 return 0; 2759 } 2760 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) { 2761 *status=U_ILLEGAL_ARGUMENT_ERROR; 2762 return 0; 2763 } 2764 int32_t length = (int32_t)ut->a; 2765 int32_t start32 = pinIndex(start, length); 2766 int32_t limit32 = pinIndex(limit, length); 2767 int32_t desti = 0; 2768 int32_t srci; 2769 int32_t copyLimit; 2770 2771 CharacterIterator *ci = (CharacterIterator *)ut->context; 2772 ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed. 2773 srci = ci->getIndex(); 2774 copyLimit = srci; 2775 while (srci<limit32) { 2776 UChar32 c = ci->next32PostInc(); 2777 int32_t len = U16_LENGTH(c); 2778 U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */ 2779 if (desti+len <= destCapacity) { 2780 U16_APPEND_UNSAFE(dest, desti, c); 2781 copyLimit = srci+len; 2782 } else { 2783 desti += len; 2784 *status = U_BUFFER_OVERFLOW_ERROR; 2785 } 2786 srci += len; 2787 } 2788 2789 charIterTextAccess(ut, copyLimit, TRUE); 2790 2791 u_terminateUChars(dest, destCapacity, desti, status); 2792 return desti; 2793} 2794 2795static const struct UTextFuncs charIterFuncs = 2796{ 2797 sizeof(UTextFuncs), 2798 0, 0, 0, // Reserved alignment padding 2799 charIterTextClone, 2800 charIterTextLength, 2801 charIterTextAccess, 2802 charIterTextExtract, 2803 NULL, // Replace 2804 NULL, // Copy 2805 NULL, // MapOffsetToNative, 2806 NULL, // MapIndexToUTF16, 2807 charIterTextClose, 2808 NULL, // spare 1 2809 NULL, // spare 2 2810 NULL // spare 3 2811}; 2812U_CDECL_END 2813 2814 2815U_CAPI UText * U_EXPORT2 2816utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) { 2817 if (U_FAILURE(*status)) { 2818 return NULL; 2819 } 2820 2821 if (ci->startIndex() > 0) { 2822 // No support for CharacterIterators that do not start indexing from zero. 2823 *status = U_UNSUPPORTED_ERROR; 2824 return NULL; 2825 } 2826 2827 // Extra space in UText for 2 buffers of CIBufSize UChars each. 2828 int32_t extraSpace = 2 * CIBufSize * sizeof(UChar); 2829 ut = utext_setup(ut, extraSpace, status); 2830 if (U_SUCCESS(*status)) { 2831 ut->pFuncs = &charIterFuncs; 2832 ut->context = ci; 2833 ut->providerProperties = 0; 2834 ut->a = ci->endIndex(); // Length of text 2835 ut->p = ut->pExtra; // First buffer 2836 ut->b = -1; // Native index of first buffer contents 2837 ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer 2838 ut->c = -1; // Native index of second buffer contents 2839 2840 // Initialize current chunk contents to be empty. 2841 // First access will fault something in. 2842 // Note: The initial nativeStart and chunkOffset must sum to zero 2843 // so that getNativeIndex() will correctly compute to zero 2844 // if no call to Access() has ever been made. They can't be both 2845 // zero without Access() thinking that the chunk is valid. 2846 ut->chunkContents = (UChar *)ut->p; 2847 ut->chunkNativeStart = -1; 2848 ut->chunkOffset = 1; 2849 ut->chunkNativeLimit = 0; 2850 ut->chunkLength = 0; 2851 ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing 2852 } 2853 return ut; 2854} 2855