1/* 2******************************************************************************* 3* 4* Copyright (C) 2002-2011, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uiter.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2002jan18 14* created by: Markus W. Scherer 15*/ 16 17#include "unicode/utypes.h" 18#include "unicode/ustring.h" 19#include "unicode/chariter.h" 20#include "unicode/rep.h" 21#include "unicode/uiter.h" 22#include "unicode/utf.h" 23#include "unicode/utf8.h" 24#include "unicode/utf16.h" 25#include "cstring.h" 26 27U_NAMESPACE_USE 28 29#define IS_EVEN(n) (((n)&1)==0) 30#define IS_POINTER_EVEN(p) IS_EVEN((size_t)p) 31 32U_CDECL_BEGIN 33 34/* No-Op UCharIterator implementation for illegal input --------------------- */ 35 36static int32_t U_CALLCONV 37noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) { 38 return 0; 39} 40 41static int32_t U_CALLCONV 42noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) { 43 return 0; 44} 45 46static UBool U_CALLCONV 47noopHasNext(UCharIterator * /*iter*/) { 48 return FALSE; 49} 50 51static UChar32 U_CALLCONV 52noopCurrent(UCharIterator * /*iter*/) { 53 return U_SENTINEL; 54} 55 56static uint32_t U_CALLCONV 57noopGetState(const UCharIterator * /*iter*/) { 58 return UITER_NO_STATE; 59} 60 61static void U_CALLCONV 62noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) { 63 *pErrorCode=U_UNSUPPORTED_ERROR; 64} 65 66static const UCharIterator noopIterator={ 67 0, 0, 0, 0, 0, 0, 68 noopGetIndex, 69 noopMove, 70 noopHasNext, 71 noopHasNext, 72 noopCurrent, 73 noopCurrent, 74 noopCurrent, 75 NULL, 76 noopGetState, 77 noopSetState 78}; 79 80/* UCharIterator implementation for simple strings -------------------------- */ 81 82/* 83 * This is an implementation of a code unit (UChar) iterator 84 * for UChar * strings. 85 * 86 * The UCharIterator.context field holds a pointer to the string. 87 */ 88 89static int32_t U_CALLCONV 90stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 91 switch(origin) { 92 case UITER_ZERO: 93 return 0; 94 case UITER_START: 95 return iter->start; 96 case UITER_CURRENT: 97 return iter->index; 98 case UITER_LIMIT: 99 return iter->limit; 100 case UITER_LENGTH: 101 return iter->length; 102 default: 103 /* not a valid origin */ 104 /* Should never get here! */ 105 return -1; 106 } 107} 108 109static int32_t U_CALLCONV 110stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 111 int32_t pos; 112 113 switch(origin) { 114 case UITER_ZERO: 115 pos=delta; 116 break; 117 case UITER_START: 118 pos=iter->start+delta; 119 break; 120 case UITER_CURRENT: 121 pos=iter->index+delta; 122 break; 123 case UITER_LIMIT: 124 pos=iter->limit+delta; 125 break; 126 case UITER_LENGTH: 127 pos=iter->length+delta; 128 break; 129 default: 130 return -1; /* Error */ 131 } 132 133 if(pos<iter->start) { 134 pos=iter->start; 135 } else if(pos>iter->limit) { 136 pos=iter->limit; 137 } 138 139 return iter->index=pos; 140} 141 142static UBool U_CALLCONV 143stringIteratorHasNext(UCharIterator *iter) { 144 return iter->index<iter->limit; 145} 146 147static UBool U_CALLCONV 148stringIteratorHasPrevious(UCharIterator *iter) { 149 return iter->index>iter->start; 150} 151 152static UChar32 U_CALLCONV 153stringIteratorCurrent(UCharIterator *iter) { 154 if(iter->index<iter->limit) { 155 return ((const UChar *)(iter->context))[iter->index]; 156 } else { 157 return U_SENTINEL; 158 } 159} 160 161static UChar32 U_CALLCONV 162stringIteratorNext(UCharIterator *iter) { 163 if(iter->index<iter->limit) { 164 return ((const UChar *)(iter->context))[iter->index++]; 165 } else { 166 return U_SENTINEL; 167 } 168} 169 170static UChar32 U_CALLCONV 171stringIteratorPrevious(UCharIterator *iter) { 172 if(iter->index>iter->start) { 173 return ((const UChar *)(iter->context))[--iter->index]; 174 } else { 175 return U_SENTINEL; 176 } 177} 178 179static uint32_t U_CALLCONV 180stringIteratorGetState(const UCharIterator *iter) { 181 return (uint32_t)iter->index; 182} 183 184static void U_CALLCONV 185stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 186 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 187 /* do nothing */ 188 } else if(iter==NULL) { 189 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 190 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) { 191 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 192 } else { 193 iter->index=(int32_t)state; 194 } 195} 196 197static const UCharIterator stringIterator={ 198 0, 0, 0, 0, 0, 0, 199 stringIteratorGetIndex, 200 stringIteratorMove, 201 stringIteratorHasNext, 202 stringIteratorHasPrevious, 203 stringIteratorCurrent, 204 stringIteratorNext, 205 stringIteratorPrevious, 206 NULL, 207 stringIteratorGetState, 208 stringIteratorSetState 209}; 210 211U_CAPI void U_EXPORT2 212uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { 213 if(iter!=0) { 214 if(s!=0 && length>=-1) { 215 *iter=stringIterator; 216 iter->context=s; 217 if(length>=0) { 218 iter->length=length; 219 } else { 220 iter->length=u_strlen(s); 221 } 222 iter->limit=iter->length; 223 } else { 224 *iter=noopIterator; 225 } 226 } 227} 228 229/* UCharIterator implementation for UTF-16BE strings ------------------------ */ 230 231/* 232 * This is an implementation of a code unit (UChar) iterator 233 * for UTF-16BE strings, i.e., strings in byte-vectors where 234 * each UChar is stored as a big-endian pair of bytes. 235 * 236 * The UCharIterator.context field holds a pointer to the string. 237 * Everything works just like with a normal UChar iterator (uiter_setString), 238 * except that UChars are assembled from byte pairs. 239 */ 240 241/* internal helper function */ 242static inline UChar32 243utf16BEIteratorGet(UCharIterator *iter, int32_t index) { 244 const uint8_t *p=(const uint8_t *)iter->context; 245 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1]; 246} 247 248static UChar32 U_CALLCONV 249utf16BEIteratorCurrent(UCharIterator *iter) { 250 int32_t index; 251 252 if((index=iter->index)<iter->limit) { 253 return utf16BEIteratorGet(iter, index); 254 } else { 255 return U_SENTINEL; 256 } 257} 258 259static UChar32 U_CALLCONV 260utf16BEIteratorNext(UCharIterator *iter) { 261 int32_t index; 262 263 if((index=iter->index)<iter->limit) { 264 iter->index=index+1; 265 return utf16BEIteratorGet(iter, index); 266 } else { 267 return U_SENTINEL; 268 } 269} 270 271static UChar32 U_CALLCONV 272utf16BEIteratorPrevious(UCharIterator *iter) { 273 int32_t index; 274 275 if((index=iter->index)>iter->start) { 276 iter->index=--index; 277 return utf16BEIteratorGet(iter, index); 278 } else { 279 return U_SENTINEL; 280 } 281} 282 283static const UCharIterator utf16BEIterator={ 284 0, 0, 0, 0, 0, 0, 285 stringIteratorGetIndex, 286 stringIteratorMove, 287 stringIteratorHasNext, 288 stringIteratorHasPrevious, 289 utf16BEIteratorCurrent, 290 utf16BEIteratorNext, 291 utf16BEIteratorPrevious, 292 NULL, 293 stringIteratorGetState, 294 stringIteratorSetState 295}; 296 297/* 298 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL, 299 * i.e., before a pair of 0 bytes where the first 0 byte is at an even 300 * offset from s. 301 */ 302static int32_t 303utf16BE_strlen(const char *s) { 304 if(IS_POINTER_EVEN(s)) { 305 /* 306 * even-aligned, call u_strlen(s) 307 * we are probably on a little-endian machine, but searching for UChar NUL 308 * does not care about endianness 309 */ 310 return u_strlen((const UChar *)s); 311 } else { 312 /* odd-aligned, search for pair of 0 bytes */ 313 const char *p=s; 314 315 while(!(*p==0 && p[1]==0)) { 316 p+=2; 317 } 318 return (int32_t)((p-s)/2); 319 } 320} 321 322U_CAPI void U_EXPORT2 323uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) { 324 if(iter!=NULL) { 325 /* allow only even-length strings (the input length counts bytes) */ 326 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) { 327 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */ 328 length>>=1; 329 330 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) { 331 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */ 332 uiter_setString(iter, (const UChar *)s, length); 333 return; 334 } 335 336 *iter=utf16BEIterator; 337 iter->context=s; 338 if(length>=0) { 339 iter->length=length; 340 } else { 341 iter->length=utf16BE_strlen(s); 342 } 343 iter->limit=iter->length; 344 } else { 345 *iter=noopIterator; 346 } 347 } 348} 349 350/* UCharIterator wrapper around CharacterIterator --------------------------- */ 351 352/* 353 * This is wrapper code around a C++ CharacterIterator to 354 * look like a C UCharIterator. 355 * 356 * The UCharIterator.context field holds a pointer to the CharacterIterator. 357 */ 358 359static int32_t U_CALLCONV 360characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 361 switch(origin) { 362 case UITER_ZERO: 363 return 0; 364 case UITER_START: 365 return ((CharacterIterator *)(iter->context))->startIndex(); 366 case UITER_CURRENT: 367 return ((CharacterIterator *)(iter->context))->getIndex(); 368 case UITER_LIMIT: 369 return ((CharacterIterator *)(iter->context))->endIndex(); 370 case UITER_LENGTH: 371 return ((CharacterIterator *)(iter->context))->getLength(); 372 default: 373 /* not a valid origin */ 374 /* Should never get here! */ 375 return -1; 376 } 377} 378 379static int32_t U_CALLCONV 380characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 381 switch(origin) { 382 case UITER_ZERO: 383 ((CharacterIterator *)(iter->context))->setIndex(delta); 384 return ((CharacterIterator *)(iter->context))->getIndex(); 385 case UITER_START: 386 case UITER_CURRENT: 387 case UITER_LIMIT: 388 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin); 389 case UITER_LENGTH: 390 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta); 391 return ((CharacterIterator *)(iter->context))->getIndex(); 392 default: 393 /* not a valid origin */ 394 /* Should never get here! */ 395 return -1; 396 } 397} 398 399static UBool U_CALLCONV 400characterIteratorHasNext(UCharIterator *iter) { 401 return ((CharacterIterator *)(iter->context))->hasNext(); 402} 403 404static UBool U_CALLCONV 405characterIteratorHasPrevious(UCharIterator *iter) { 406 return ((CharacterIterator *)(iter->context))->hasPrevious(); 407} 408 409static UChar32 U_CALLCONV 410characterIteratorCurrent(UCharIterator *iter) { 411 UChar32 c; 412 413 c=((CharacterIterator *)(iter->context))->current(); 414 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) { 415 return c; 416 } else { 417 return U_SENTINEL; 418 } 419} 420 421static UChar32 U_CALLCONV 422characterIteratorNext(UCharIterator *iter) { 423 if(((CharacterIterator *)(iter->context))->hasNext()) { 424 return ((CharacterIterator *)(iter->context))->nextPostInc(); 425 } else { 426 return U_SENTINEL; 427 } 428} 429 430static UChar32 U_CALLCONV 431characterIteratorPrevious(UCharIterator *iter) { 432 if(((CharacterIterator *)(iter->context))->hasPrevious()) { 433 return ((CharacterIterator *)(iter->context))->previous(); 434 } else { 435 return U_SENTINEL; 436 } 437} 438 439static uint32_t U_CALLCONV 440characterIteratorGetState(const UCharIterator *iter) { 441 return ((CharacterIterator *)(iter->context))->getIndex(); 442} 443 444static void U_CALLCONV 445characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 446 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 447 /* do nothing */ 448 } else if(iter==NULL || iter->context==NULL) { 449 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 450 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) { 451 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 452 } else { 453 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state); 454 } 455} 456 457static const UCharIterator characterIteratorWrapper={ 458 0, 0, 0, 0, 0, 0, 459 characterIteratorGetIndex, 460 characterIteratorMove, 461 characterIteratorHasNext, 462 characterIteratorHasPrevious, 463 characterIteratorCurrent, 464 characterIteratorNext, 465 characterIteratorPrevious, 466 NULL, 467 characterIteratorGetState, 468 characterIteratorSetState 469}; 470 471U_CAPI void U_EXPORT2 472uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) { 473 if(iter!=0) { 474 if(charIter!=0) { 475 *iter=characterIteratorWrapper; 476 iter->context=charIter; 477 } else { 478 *iter=noopIterator; 479 } 480 } 481} 482 483/* UCharIterator wrapper around Replaceable --------------------------------- */ 484 485/* 486 * This is an implementation of a code unit (UChar) iterator 487 * based on a Replaceable object. 488 * 489 * The UCharIterator.context field holds a pointer to the Replaceable. 490 * UCharIterator.length and UCharIterator.index hold Replaceable.length() 491 * and the iteration index. 492 */ 493 494static UChar32 U_CALLCONV 495replaceableIteratorCurrent(UCharIterator *iter) { 496 if(iter->index<iter->limit) { 497 return ((Replaceable *)(iter->context))->charAt(iter->index); 498 } else { 499 return U_SENTINEL; 500 } 501} 502 503static UChar32 U_CALLCONV 504replaceableIteratorNext(UCharIterator *iter) { 505 if(iter->index<iter->limit) { 506 return ((Replaceable *)(iter->context))->charAt(iter->index++); 507 } else { 508 return U_SENTINEL; 509 } 510} 511 512static UChar32 U_CALLCONV 513replaceableIteratorPrevious(UCharIterator *iter) { 514 if(iter->index>iter->start) { 515 return ((Replaceable *)(iter->context))->charAt(--iter->index); 516 } else { 517 return U_SENTINEL; 518 } 519} 520 521static const UCharIterator replaceableIterator={ 522 0, 0, 0, 0, 0, 0, 523 stringIteratorGetIndex, 524 stringIteratorMove, 525 stringIteratorHasNext, 526 stringIteratorHasPrevious, 527 replaceableIteratorCurrent, 528 replaceableIteratorNext, 529 replaceableIteratorPrevious, 530 NULL, 531 stringIteratorGetState, 532 stringIteratorSetState 533}; 534 535U_CAPI void U_EXPORT2 536uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) { 537 if(iter!=0) { 538 if(rep!=0) { 539 *iter=replaceableIterator; 540 iter->context=rep; 541 iter->limit=iter->length=rep->length(); 542 } else { 543 *iter=noopIterator; 544 } 545 } 546} 547 548/* UCharIterator implementation for UTF-8 strings --------------------------- */ 549 550/* 551 * Possible, probably necessary only for an implementation for arbitrary 552 * converters: 553 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text. 554 * This would require to turn reservedFn into a close function and 555 * to introduce a uiter_close(iter). 556 */ 557 558#define UITER_CNV_CAPACITY 16 559 560/* 561 * Minimal implementation: 562 * Maintain a single-UChar buffer for an additional surrogate. 563 * The caller must not modify start and limit because they are used internally. 564 * 565 * Use UCharIterator fields as follows: 566 * context pointer to UTF-8 string 567 * length UTF-16 length of the string; -1 until lazy evaluation 568 * start current UTF-8 index 569 * index current UTF-16 index; may be -1="unknown" after setState() 570 * limit UTF-8 length of the string 571 * reservedField supplementary code point 572 * 573 * Since UCharIterator delivers 16-bit code units, the iteration can be 574 * currently in the middle of the byte sequence for a supplementary code point. 575 * In this case, reservedField will contain that code point and start will 576 * point to after the corresponding byte sequence. The UTF-16 index will be 577 * one less than what it would otherwise be corresponding to the UTF-8 index. 578 * Otherwise, reservedField will be 0. 579 */ 580 581/* 582 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: 583 * Add implementations that do not call strlen() for iteration but check for NUL. 584 */ 585 586static int32_t U_CALLCONV 587utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 588 switch(origin) { 589 case UITER_ZERO: 590 case UITER_START: 591 return 0; 592 case UITER_CURRENT: 593 if(iter->index<0) { 594 /* the current UTF-16 index is unknown after setState(), count from the beginning */ 595 const uint8_t *s; 596 UChar32 c; 597 int32_t i, limit, index; 598 599 s=(const uint8_t *)iter->context; 600 i=index=0; 601 limit=iter->start; /* count up to the UTF-8 index */ 602 while(i<limit) { 603 U8_NEXT(s, i, limit, c); 604 if(c<=0xffff) { 605 ++index; 606 } else { 607 index+=2; 608 } 609 } 610 611 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 612 if(i==iter->limit) { 613 iter->length=index; /* in case it was <0 or wrong */ 614 } 615 if(iter->reservedField!=0) { 616 --index; /* we are in the middle of a supplementary code point */ 617 } 618 iter->index=index; 619 } 620 return iter->index; 621 case UITER_LIMIT: 622 case UITER_LENGTH: 623 if(iter->length<0) { 624 const uint8_t *s; 625 UChar32 c; 626 int32_t i, limit, length; 627 628 s=(const uint8_t *)iter->context; 629 if(iter->index<0) { 630 /* 631 * the current UTF-16 index is unknown after setState(), 632 * we must first count from the beginning to here 633 */ 634 i=length=0; 635 limit=iter->start; 636 637 /* count from the beginning to the current index */ 638 while(i<limit) { 639 U8_NEXT(s, i, limit, c); 640 if(c<=0xffff) { 641 ++length; 642 } else { 643 length+=2; 644 } 645 } 646 647 /* assume i==limit==iter->start, set the UTF-16 index */ 648 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 649 iter->index= iter->reservedField!=0 ? length-1 : length; 650 } else { 651 i=iter->start; 652 length=iter->index; 653 if(iter->reservedField!=0) { 654 ++length; 655 } 656 } 657 658 /* count from the current index to the end */ 659 limit=iter->limit; 660 while(i<limit) { 661 U8_NEXT(s, i, limit, c); 662 if(c<=0xffff) { 663 ++length; 664 } else { 665 length+=2; 666 } 667 } 668 iter->length=length; 669 } 670 return iter->length; 671 default: 672 /* not a valid origin */ 673 /* Should never get here! */ 674 return -1; 675 } 676} 677 678static int32_t U_CALLCONV 679utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 680 const uint8_t *s; 681 UChar32 c; 682 int32_t pos; /* requested UTF-16 index */ 683 int32_t i; /* UTF-8 index */ 684 UBool havePos; 685 686 /* calculate the requested UTF-16 index */ 687 switch(origin) { 688 case UITER_ZERO: 689 case UITER_START: 690 pos=delta; 691 havePos=TRUE; 692 /* iter->index<0 (unknown) is possible */ 693 break; 694 case UITER_CURRENT: 695 if(iter->index>=0) { 696 pos=iter->index+delta; 697 havePos=TRUE; 698 } else { 699 /* the current UTF-16 index is unknown after setState(), use only delta */ 700 pos=0; 701 havePos=FALSE; 702 } 703 break; 704 case UITER_LIMIT: 705 case UITER_LENGTH: 706 if(iter->length>=0) { 707 pos=iter->length+delta; 708 havePos=TRUE; 709 } else { 710 /* pin to the end, avoid counting the length */ 711 iter->index=-1; 712 iter->start=iter->limit; 713 iter->reservedField=0; 714 if(delta>=0) { 715 return UITER_UNKNOWN_INDEX; 716 } else { 717 /* the current UTF-16 index is unknown, use only delta */ 718 pos=0; 719 havePos=FALSE; 720 } 721 } 722 break; 723 default: 724 return -1; /* Error */ 725 } 726 727 if(havePos) { 728 /* shortcuts: pinning to the edges of the string */ 729 if(pos<=0) { 730 iter->index=iter->start=iter->reservedField=0; 731 return 0; 732 } else if(iter->length>=0 && pos>=iter->length) { 733 iter->index=iter->length; 734 iter->start=iter->limit; 735 iter->reservedField=0; 736 return iter->index; 737 } 738 739 /* minimize the number of U8_NEXT/PREV operations */ 740 if(iter->index<0 || pos<iter->index/2) { 741 /* go forward from the start instead of backward from the current index */ 742 iter->index=iter->start=iter->reservedField=0; 743 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { 744 /* 745 * if we have the UTF-16 index and length and the new position is 746 * closer to the end than the current index, 747 * then go backward from the end instead of forward from the current index 748 */ 749 iter->index=iter->length; 750 iter->start=iter->limit; 751 iter->reservedField=0; 752 } 753 754 delta=pos-iter->index; 755 if(delta==0) { 756 return iter->index; /* nothing to do */ 757 } 758 } else { 759 /* move relative to unknown UTF-16 index */ 760 if(delta==0) { 761 return UITER_UNKNOWN_INDEX; /* nothing to do */ 762 } else if(-delta>=iter->start) { 763 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ 764 iter->index=iter->start=iter->reservedField=0; 765 return 0; 766 } else if(delta>=(iter->limit-iter->start)) { 767 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ 768 iter->index=iter->length; /* may or may not be <0 (unknown) */ 769 iter->start=iter->limit; 770 iter->reservedField=0; 771 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX; 772 } 773 } 774 775 /* delta!=0 */ 776 777 /* move towards the requested position, pin to the edges of the string */ 778 s=(const uint8_t *)iter->context; 779 pos=iter->index; /* could be <0 (unknown) */ 780 i=iter->start; 781 if(delta>0) { 782 /* go forward */ 783 int32_t limit=iter->limit; 784 if(iter->reservedField!=0) { 785 iter->reservedField=0; 786 ++pos; 787 --delta; 788 } 789 while(delta>0 && i<limit) { 790 U8_NEXT(s, i, limit, c); 791 if(c<0xffff) { 792 ++pos; 793 --delta; 794 } else if(delta>=2) { 795 pos+=2; 796 delta-=2; 797 } else /* delta==1 */ { 798 /* stop in the middle of a supplementary code point */ 799 iter->reservedField=c; 800 ++pos; 801 break; /* delta=0; */ 802 } 803 } 804 if(i==limit) { 805 if(iter->length<0 && iter->index>=0) { 806 iter->length= iter->reservedField==0 ? pos : pos+1; 807 } else if(iter->index<0 && iter->length>=0) { 808 iter->index= iter->reservedField==0 ? iter->length : iter->length-1; 809 } 810 } 811 } else /* delta<0 */ { 812 /* go backward */ 813 if(iter->reservedField!=0) { 814 iter->reservedField=0; 815 i-=4; /* we stayed behind the supplementary code point; go before it now */ 816 --pos; 817 ++delta; 818 } 819 while(delta<0 && i>0) { 820 U8_PREV(s, 0, i, c); 821 if(c<0xffff) { 822 --pos; 823 ++delta; 824 } else if(delta<=-2) { 825 pos-=2; 826 delta+=2; 827 } else /* delta==-1 */ { 828 /* stop in the middle of a supplementary code point */ 829 i+=4; /* back to behind this supplementary code point for consistent state */ 830 iter->reservedField=c; 831 --pos; 832 break; /* delta=0; */ 833 } 834 } 835 } 836 837 iter->start=i; 838 if(iter->index>=0) { 839 return iter->index=pos; 840 } else { 841 /* we started with index<0 (unknown) so pos is bogus */ 842 if(i<=1) { 843 return iter->index=i; /* reached the beginning */ 844 } else { 845 /* we still don't know the UTF-16 index */ 846 return UITER_UNKNOWN_INDEX; 847 } 848 } 849} 850 851static UBool U_CALLCONV 852utf8IteratorHasNext(UCharIterator *iter) { 853 return iter->start<iter->limit || iter->reservedField!=0; 854} 855 856static UBool U_CALLCONV 857utf8IteratorHasPrevious(UCharIterator *iter) { 858 return iter->start>0; 859} 860 861static UChar32 U_CALLCONV 862utf8IteratorCurrent(UCharIterator *iter) { 863 if(iter->reservedField!=0) { 864 return U16_TRAIL(iter->reservedField); 865 } else if(iter->start<iter->limit) { 866 const uint8_t *s=(const uint8_t *)iter->context; 867 UChar32 c; 868 int32_t i=iter->start; 869 870 U8_NEXT(s, i, iter->limit, c); 871 if(c<0) { 872 return 0xfffd; 873 } else if(c<=0xffff) { 874 return c; 875 } else { 876 return U16_LEAD(c); 877 } 878 } else { 879 return U_SENTINEL; 880 } 881} 882 883static UChar32 U_CALLCONV 884utf8IteratorNext(UCharIterator *iter) { 885 int32_t index; 886 887 if(iter->reservedField!=0) { 888 UChar trail=U16_TRAIL(iter->reservedField); 889 iter->reservedField=0; 890 if((index=iter->index)>=0) { 891 iter->index=index+1; 892 } 893 return trail; 894 } else if(iter->start<iter->limit) { 895 const uint8_t *s=(const uint8_t *)iter->context; 896 UChar32 c; 897 898 U8_NEXT(s, iter->start, iter->limit, c); 899 if((index=iter->index)>=0) { 900 iter->index=++index; 901 if(iter->length<0 && iter->start==iter->limit) { 902 iter->length= c<=0xffff ? index : index+1; 903 } 904 } else if(iter->start==iter->limit && iter->length>=0) { 905 iter->index= c<=0xffff ? iter->length : iter->length-1; 906 } 907 if(c<0) { 908 return 0xfffd; 909 } else if(c<=0xffff) { 910 return c; 911 } else { 912 iter->reservedField=c; 913 return U16_LEAD(c); 914 } 915 } else { 916 return U_SENTINEL; 917 } 918} 919 920static UChar32 U_CALLCONV 921utf8IteratorPrevious(UCharIterator *iter) { 922 int32_t index; 923 924 if(iter->reservedField!=0) { 925 UChar lead=U16_LEAD(iter->reservedField); 926 iter->reservedField=0; 927 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ 928 if((index=iter->index)>0) { 929 iter->index=index-1; 930 } 931 return lead; 932 } else if(iter->start>0) { 933 const uint8_t *s=(const uint8_t *)iter->context; 934 UChar32 c; 935 936 U8_PREV(s, 0, iter->start, c); 937 if((index=iter->index)>0) { 938 iter->index=index-1; 939 } else if(iter->start<=1) { 940 iter->index= c<=0xffff ? iter->start : iter->start+1; 941 } 942 if(c<0) { 943 return 0xfffd; 944 } else if(c<=0xffff) { 945 return c; 946 } else { 947 iter->start+=4; /* back to behind this supplementary code point for consistent state */ 948 iter->reservedField=c; 949 return U16_TRAIL(c); 950 } 951 } else { 952 return U_SENTINEL; 953 } 954} 955 956static uint32_t U_CALLCONV 957utf8IteratorGetState(const UCharIterator *iter) { 958 uint32_t state=(uint32_t)(iter->start<<1); 959 if(iter->reservedField!=0) { 960 state|=1; 961 } 962 return state; 963} 964 965static void U_CALLCONV 966utf8IteratorSetState(UCharIterator *iter, 967 uint32_t state, 968 UErrorCode *pErrorCode) 969{ 970 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 971 /* do nothing */ 972 } else if(iter==NULL) { 973 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 974 } else if(state==utf8IteratorGetState(iter)) { 975 /* setting to the current state: no-op */ 976 } else { 977 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ 978 state&=1; /* 1 if in surrogate pair, must be index>=4 */ 979 980 if((state==0 ? index<0 : index<4) || iter->limit<index) { 981 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 982 } else { 983 iter->start=index; /* restore UTF-8 byte index */ 984 if(index<=1) { 985 iter->index=index; 986 } else { 987 iter->index=-1; /* unknown UTF-16 index */ 988 } 989 if(state==0) { 990 iter->reservedField=0; 991 } else { 992 /* verified index>=4 above */ 993 UChar32 c; 994 U8_PREV((const uint8_t *)iter->context, 0, index, c); 995 if(c<=0xffff) { 996 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 997 } else { 998 iter->reservedField=c; 999 } 1000 } 1001 } 1002 } 1003} 1004 1005static const UCharIterator utf8Iterator={ 1006 0, 0, 0, 0, 0, 0, 1007 utf8IteratorGetIndex, 1008 utf8IteratorMove, 1009 utf8IteratorHasNext, 1010 utf8IteratorHasPrevious, 1011 utf8IteratorCurrent, 1012 utf8IteratorNext, 1013 utf8IteratorPrevious, 1014 NULL, 1015 utf8IteratorGetState, 1016 utf8IteratorSetState 1017}; 1018 1019U_CAPI void U_EXPORT2 1020uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { 1021 if(iter!=0) { 1022 if(s!=0 && length>=-1) { 1023 *iter=utf8Iterator; 1024 iter->context=s; 1025 if(length>=0) { 1026 iter->limit=length; 1027 } else { 1028 iter->limit=(int32_t)uprv_strlen(s); 1029 } 1030 iter->length= iter->limit<=1 ? iter->limit : -1; 1031 } else { 1032 *iter=noopIterator; 1033 } 1034 } 1035} 1036 1037/* Helper functions --------------------------------------------------------- */ 1038 1039U_CAPI UChar32 U_EXPORT2 1040uiter_current32(UCharIterator *iter) { 1041 UChar32 c, c2; 1042 1043 c=iter->current(iter); 1044 if(U16_IS_SURROGATE(c)) { 1045 if(U16_IS_SURROGATE_LEAD(c)) { 1046 /* 1047 * go to the next code unit 1048 * we know that we are not at the limit because c!=U_SENTINEL 1049 */ 1050 iter->move(iter, 1, UITER_CURRENT); 1051 if(U16_IS_TRAIL(c2=iter->current(iter))) { 1052 c=U16_GET_SUPPLEMENTARY(c, c2); 1053 } 1054 1055 /* undo index movement */ 1056 iter->move(iter, -1, UITER_CURRENT); 1057 } else { 1058 if(U16_IS_LEAD(c2=iter->previous(iter))) { 1059 c=U16_GET_SUPPLEMENTARY(c2, c); 1060 } 1061 if(c2>=0) { 1062 /* undo index movement */ 1063 iter->move(iter, 1, UITER_CURRENT); 1064 } 1065 } 1066 } 1067 return c; 1068} 1069 1070U_CAPI UChar32 U_EXPORT2 1071uiter_next32(UCharIterator *iter) { 1072 UChar32 c, c2; 1073 1074 c=iter->next(iter); 1075 if(U16_IS_LEAD(c)) { 1076 if(U16_IS_TRAIL(c2=iter->next(iter))) { 1077 c=U16_GET_SUPPLEMENTARY(c, c2); 1078 } else if(c2>=0) { 1079 /* unmatched first surrogate, undo index movement */ 1080 iter->move(iter, -1, UITER_CURRENT); 1081 } 1082 } 1083 return c; 1084} 1085 1086U_CAPI UChar32 U_EXPORT2 1087uiter_previous32(UCharIterator *iter) { 1088 UChar32 c, c2; 1089 1090 c=iter->previous(iter); 1091 if(U16_IS_TRAIL(c)) { 1092 if(U16_IS_LEAD(c2=iter->previous(iter))) { 1093 c=U16_GET_SUPPLEMENTARY(c2, c); 1094 } else if(c2>=0) { 1095 /* unmatched second surrogate, undo index movement */ 1096 iter->move(iter, 1, UITER_CURRENT); 1097 } 1098 } 1099 return c; 1100} 1101 1102U_CAPI uint32_t U_EXPORT2 1103uiter_getState(const UCharIterator *iter) { 1104 if(iter==NULL || iter->getState==NULL) { 1105 return UITER_NO_STATE; 1106 } else { 1107 return iter->getState(iter); 1108 } 1109} 1110 1111U_CAPI void U_EXPORT2 1112uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 1113 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1114 /* do nothing */ 1115 } else if(iter==NULL) { 1116 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1117 } else if(iter->setState==NULL) { 1118 *pErrorCode=U_UNSUPPORTED_ERROR; 1119 } else { 1120 iter->setState(iter, state, pErrorCode); 1121 } 1122} 1123 1124U_CDECL_END 1125