1/*
2*******************************************************************************
3*
4*   Copyright (C) 2002-2006, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  uiter.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2002jan18
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/ustring.h"
19#include "unicode/chariter.h"
20#include "unicode/rep.h"
21#include "unicode/uiter.h"
22#include "cstring.h"
23
24U_NAMESPACE_USE
25
/* Nonzero if the integer n has its lowest bit clear. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * Nonzero if the address held in p is a multiple of 2.
 * The argument is parenthesized so that an expression such as (ptr+1)
 * is converted to size_t as a whole rather than only its first operand.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
28
29U_CDECL_BEGIN
30
31/* No-Op UCharIterator implementation for illegal input --------------------- */
32
33static int32_t U_CALLCONV
34noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
35    return 0;
36}
37
38static int32_t U_CALLCONV
39noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
40    return 0;
41}
42
43static UBool U_CALLCONV
44noopHasNext(UCharIterator * /*iter*/) {
45    return FALSE;
46}
47
48static UChar32 U_CALLCONV
49noopCurrent(UCharIterator * /*iter*/) {
50    return U_SENTINEL;
51}
52
53static uint32_t U_CALLCONV
54noopGetState(const UCharIterator * /*iter*/) {
55    return UITER_NO_STATE;
56}
57
58static void U_CALLCONV
59noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
60    *pErrorCode=U_UNSUPPORTED_ERROR;
61}
62
63static const UCharIterator noopIterator={
64    0, 0, 0, 0, 0, 0,
65    noopGetIndex,
66    noopMove,
67    noopHasNext,
68    noopHasNext,
69    noopCurrent,
70    noopCurrent,
71    noopCurrent,
72    NULL,
73    noopGetState,
74    noopSetState
75};
76
77/* UCharIterator implementation for simple strings -------------------------- */
78
79/*
80 * This is an implementation of a code unit (UChar) iterator
81 * for UChar * strings.
82 *
83 * The UCharIterator.context field holds a pointer to the string.
84 */
85
86static int32_t U_CALLCONV
87stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
88    switch(origin) {
89    case UITER_ZERO:
90        return 0;
91    case UITER_START:
92        return iter->start;
93    case UITER_CURRENT:
94        return iter->index;
95    case UITER_LIMIT:
96        return iter->limit;
97    case UITER_LENGTH:
98        return iter->length;
99    default:
100        /* not a valid origin */
101        /* Should never get here! */
102        return -1;
103    }
104}
105
106static int32_t U_CALLCONV
107stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
108    int32_t pos;
109
110    switch(origin) {
111    case UITER_ZERO:
112        pos=delta;
113        break;
114    case UITER_START:
115        pos=iter->start+delta;
116        break;
117    case UITER_CURRENT:
118        pos=iter->index+delta;
119        break;
120    case UITER_LIMIT:
121        pos=iter->limit+delta;
122        break;
123    case UITER_LENGTH:
124        pos=iter->length+delta;
125        break;
126    default:
127        return -1;  /* Error */
128    }
129
130    if(pos<iter->start) {
131        pos=iter->start;
132    } else if(pos>iter->limit) {
133        pos=iter->limit;
134    }
135
136    return iter->index=pos;
137}
138
139static UBool U_CALLCONV
140stringIteratorHasNext(UCharIterator *iter) {
141    return iter->index<iter->limit;
142}
143
144static UBool U_CALLCONV
145stringIteratorHasPrevious(UCharIterator *iter) {
146    return iter->index>iter->start;
147}
148
149static UChar32 U_CALLCONV
150stringIteratorCurrent(UCharIterator *iter) {
151    if(iter->index<iter->limit) {
152        return ((const UChar *)(iter->context))[iter->index];
153    } else {
154        return U_SENTINEL;
155    }
156}
157
158static UChar32 U_CALLCONV
159stringIteratorNext(UCharIterator *iter) {
160    if(iter->index<iter->limit) {
161        return ((const UChar *)(iter->context))[iter->index++];
162    } else {
163        return U_SENTINEL;
164    }
165}
166
167static UChar32 U_CALLCONV
168stringIteratorPrevious(UCharIterator *iter) {
169    if(iter->index>iter->start) {
170        return ((const UChar *)(iter->context))[--iter->index];
171    } else {
172        return U_SENTINEL;
173    }
174}
175
176static uint32_t U_CALLCONV
177stringIteratorGetState(const UCharIterator *iter) {
178    return (uint32_t)iter->index;
179}
180
181static void U_CALLCONV
182stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
183    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
184        /* do nothing */
185    } else if(iter==NULL) {
186        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
187    } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
188        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
189    } else {
190        iter->index=(int32_t)state;
191    }
192}
193
194static const UCharIterator stringIterator={
195    0, 0, 0, 0, 0, 0,
196    stringIteratorGetIndex,
197    stringIteratorMove,
198    stringIteratorHasNext,
199    stringIteratorHasPrevious,
200    stringIteratorCurrent,
201    stringIteratorNext,
202    stringIteratorPrevious,
203    NULL,
204    stringIteratorGetState,
205    stringIteratorSetState
206};
207
208U_CAPI void U_EXPORT2
209uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
210    if(iter!=0) {
211        if(s!=0 && length>=-1) {
212            *iter=stringIterator;
213            iter->context=s;
214            if(length>=0) {
215                iter->length=length;
216            } else {
217                iter->length=u_strlen(s);
218            }
219            iter->limit=iter->length;
220        } else {
221            *iter=noopIterator;
222        }
223    }
224}
225
226/* UCharIterator implementation for UTF-16BE strings ------------------------ */
227
228/*
229 * This is an implementation of a code unit (UChar) iterator
230 * for UTF-16BE strings, i.e., strings in byte-vectors where
231 * each UChar is stored as a big-endian pair of bytes.
232 *
233 * The UCharIterator.context field holds a pointer to the string.
234 * Everything works just like with a normal UChar iterator (uiter_setString),
235 * except that UChars are assembled from byte pairs.
236 */
237
238/* internal helper function */
239static inline UChar32
240utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
241    const uint8_t *p=(const uint8_t *)iter->context;
242    return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
243}
244
245static UChar32 U_CALLCONV
246utf16BEIteratorCurrent(UCharIterator *iter) {
247    int32_t index;
248
249    if((index=iter->index)<iter->limit) {
250        return utf16BEIteratorGet(iter, index);
251    } else {
252        return U_SENTINEL;
253    }
254}
255
256static UChar32 U_CALLCONV
257utf16BEIteratorNext(UCharIterator *iter) {
258    int32_t index;
259
260    if((index=iter->index)<iter->limit) {
261        iter->index=index+1;
262        return utf16BEIteratorGet(iter, index);
263    } else {
264        return U_SENTINEL;
265    }
266}
267
268static UChar32 U_CALLCONV
269utf16BEIteratorPrevious(UCharIterator *iter) {
270    int32_t index;
271
272    if((index=iter->index)>iter->start) {
273        iter->index=--index;
274        return utf16BEIteratorGet(iter, index);
275    } else {
276        return U_SENTINEL;
277    }
278}
279
280static const UCharIterator utf16BEIterator={
281    0, 0, 0, 0, 0, 0,
282    stringIteratorGetIndex,
283    stringIteratorMove,
284    stringIteratorHasNext,
285    stringIteratorHasPrevious,
286    utf16BEIteratorCurrent,
287    utf16BEIteratorNext,
288    utf16BEIteratorPrevious,
289    NULL,
290    stringIteratorGetState,
291    stringIteratorSetState
292};
293
294/*
295 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
296 * i.e., before a pair of 0 bytes where the first 0 byte is at an even
297 * offset from s.
298 */
299static int32_t
300utf16BE_strlen(const char *s) {
301    if(IS_POINTER_EVEN(s)) {
302        /*
303         * even-aligned, call u_strlen(s)
304         * we are probably on a little-endian machine, but searching for UChar NUL
305         * does not care about endianness
306         */
307        return u_strlen((const UChar *)s);
308    } else {
309        /* odd-aligned, search for pair of 0 bytes */
310        const char *p=s;
311
312        while(!(*p==0 && p[1]==0)) {
313            p+=2;
314        }
315        return (int32_t)((p-s)/2);
316    }
317}
318
319U_CAPI void U_EXPORT2
320uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
321    if(iter!=NULL) {
322        /* allow only even-length strings (the input length counts bytes) */
323        if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
324            /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
325            length>>=1;
326
327            if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
328                /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
329                uiter_setString(iter, (const UChar *)s, length);
330                return;
331            }
332
333            *iter=utf16BEIterator;
334            iter->context=s;
335            if(length>=0) {
336                iter->length=length;
337            } else {
338                iter->length=utf16BE_strlen(s);
339            }
340            iter->limit=iter->length;
341        } else {
342            *iter=noopIterator;
343        }
344    }
345}
346
347/* UCharIterator wrapper around CharacterIterator --------------------------- */
348
349/*
350 * This is wrapper code around a C++ CharacterIterator to
351 * look like a C UCharIterator.
352 *
353 * The UCharIterator.context field holds a pointer to the CharacterIterator.
354 */
355
356static int32_t U_CALLCONV
357characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
358    switch(origin) {
359    case UITER_ZERO:
360        return 0;
361    case UITER_START:
362        return ((CharacterIterator *)(iter->context))->startIndex();
363    case UITER_CURRENT:
364        return ((CharacterIterator *)(iter->context))->getIndex();
365    case UITER_LIMIT:
366        return ((CharacterIterator *)(iter->context))->endIndex();
367    case UITER_LENGTH:
368        return ((CharacterIterator *)(iter->context))->getLength();
369    default:
370        /* not a valid origin */
371        /* Should never get here! */
372        return -1;
373    }
374}
375
376static int32_t U_CALLCONV
377characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
378    switch(origin) {
379    case UITER_ZERO:
380        ((CharacterIterator *)(iter->context))->setIndex(delta);
381        return ((CharacterIterator *)(iter->context))->getIndex();
382    case UITER_START:
383    case UITER_CURRENT:
384    case UITER_LIMIT:
385        return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
386    case UITER_LENGTH:
387        ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
388        return ((CharacterIterator *)(iter->context))->getIndex();
389    default:
390        /* not a valid origin */
391        /* Should never get here! */
392        return -1;
393    }
394}
395
396static UBool U_CALLCONV
397characterIteratorHasNext(UCharIterator *iter) {
398    return ((CharacterIterator *)(iter->context))->hasNext();
399}
400
401static UBool U_CALLCONV
402characterIteratorHasPrevious(UCharIterator *iter) {
403    return ((CharacterIterator *)(iter->context))->hasPrevious();
404}
405
406static UChar32 U_CALLCONV
407characterIteratorCurrent(UCharIterator *iter) {
408    UChar32 c;
409
410    c=((CharacterIterator *)(iter->context))->current();
411    if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
412        return c;
413    } else {
414        return U_SENTINEL;
415    }
416}
417
418static UChar32 U_CALLCONV
419characterIteratorNext(UCharIterator *iter) {
420    if(((CharacterIterator *)(iter->context))->hasNext()) {
421        return ((CharacterIterator *)(iter->context))->nextPostInc();
422    } else {
423        return U_SENTINEL;
424    }
425}
426
427static UChar32 U_CALLCONV
428characterIteratorPrevious(UCharIterator *iter) {
429    if(((CharacterIterator *)(iter->context))->hasPrevious()) {
430        return ((CharacterIterator *)(iter->context))->previous();
431    } else {
432        return U_SENTINEL;
433    }
434}
435
436static uint32_t U_CALLCONV
437characterIteratorGetState(const UCharIterator *iter) {
438    return ((CharacterIterator *)(iter->context))->getIndex();
439}
440
441static void U_CALLCONV
442characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
443    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
444        /* do nothing */
445    } else if(iter==NULL || iter->context==NULL) {
446        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
447    } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
448        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
449    } else {
450        ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
451    }
452}
453
454static const UCharIterator characterIteratorWrapper={
455    0, 0, 0, 0, 0, 0,
456    characterIteratorGetIndex,
457    characterIteratorMove,
458    characterIteratorHasNext,
459    characterIteratorHasPrevious,
460    characterIteratorCurrent,
461    characterIteratorNext,
462    characterIteratorPrevious,
463    NULL,
464    characterIteratorGetState,
465    characterIteratorSetState
466};
467
468U_CAPI void U_EXPORT2
469uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
470    if(iter!=0) {
471        if(charIter!=0) {
472            *iter=characterIteratorWrapper;
473            iter->context=charIter;
474        } else {
475            *iter=noopIterator;
476        }
477    }
478}
479
480/* UCharIterator wrapper around Replaceable --------------------------------- */
481
482/*
483 * This is an implementation of a code unit (UChar) iterator
484 * based on a Replaceable object.
485 *
486 * The UCharIterator.context field holds a pointer to the Replaceable.
487 * UCharIterator.length and UCharIterator.index hold Replaceable.length()
488 * and the iteration index.
489 */
490
491static UChar32 U_CALLCONV
492replaceableIteratorCurrent(UCharIterator *iter) {
493    if(iter->index<iter->limit) {
494        return ((Replaceable *)(iter->context))->charAt(iter->index);
495    } else {
496        return U_SENTINEL;
497    }
498}
499
500static UChar32 U_CALLCONV
501replaceableIteratorNext(UCharIterator *iter) {
502    if(iter->index<iter->limit) {
503        return ((Replaceable *)(iter->context))->charAt(iter->index++);
504    } else {
505        return U_SENTINEL;
506    }
507}
508
509static UChar32 U_CALLCONV
510replaceableIteratorPrevious(UCharIterator *iter) {
511    if(iter->index>iter->start) {
512        return ((Replaceable *)(iter->context))->charAt(--iter->index);
513    } else {
514        return U_SENTINEL;
515    }
516}
517
518static const UCharIterator replaceableIterator={
519    0, 0, 0, 0, 0, 0,
520    stringIteratorGetIndex,
521    stringIteratorMove,
522    stringIteratorHasNext,
523    stringIteratorHasPrevious,
524    replaceableIteratorCurrent,
525    replaceableIteratorNext,
526    replaceableIteratorPrevious,
527    NULL,
528    stringIteratorGetState,
529    stringIteratorSetState
530};
531
532U_CAPI void U_EXPORT2
533uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
534    if(iter!=0) {
535        if(rep!=0) {
536            *iter=replaceableIterator;
537            iter->context=rep;
538            iter->limit=iter->length=rep->length();
539        } else {
540            *iter=noopIterator;
541        }
542    }
543}
544
545/* UCharIterator implementation for UTF-8 strings --------------------------- */
546
/*
 * Possible, but probably necessary only for an implementation for arbitrary
 * converters:
 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
 */
554
/*
 * NOTE(review): presumably the capacity of the conversion buffer sketched in
 * the comment above; nothing in the visible part of this file references it.
 */
#define UITER_CNV_CAPACITY 16
556
557/*
558 * Minimal implementation:
559 * Maintain a single-UChar buffer for an additional surrogate.
560 * The caller must not modify start and limit because they are used internally.
561 *
562 * Use UCharIterator fields as follows:
563 *   context        pointer to UTF-8 string
564 *   length         UTF-16 length of the string; -1 until lazy evaluation
565 *   start          current UTF-8 index
566 *   index          current UTF-16 index; may be -1="unknown" after setState()
567 *   limit          UTF-8 length of the string
568 *   reservedField  supplementary code point
569 *
570 * Since UCharIterator delivers 16-bit code units, the iteration can be
571 * currently in the middle of the byte sequence for a supplementary code point.
572 * In this case, reservedField will contain that code point and start will
573 * point to after the corresponding byte sequence. The UTF-16 index will be
574 * one less than what it would otherwise be corresponding to the UTF-8 index.
575 * Otherwise, reservedField will be 0.
576 */
577
578/*
579 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
580 * Add implementations that do not call strlen() for iteration but check for NUL.
581 */
582
583static int32_t U_CALLCONV
584utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
585    switch(origin) {
586    case UITER_ZERO:
587    case UITER_START:
588        return 0;
589    case UITER_CURRENT:
590        if(iter->index<0) {
591            /* the current UTF-16 index is unknown after setState(), count from the beginning */
592            const uint8_t *s;
593            UChar32 c;
594            int32_t i, limit, index;
595
596            s=(const uint8_t *)iter->context;
597            i=index=0;
598            limit=iter->start; /* count up to the UTF-8 index */
599            while(i<limit) {
600                U8_NEXT(s, i, limit, c);
601                if(c<=0xffff) {
602                    ++index;
603                } else {
604                    index+=2;
605                }
606            }
607
608            iter->start=i; /* just in case setState() did not get us to a code point boundary */
609            if(i==iter->limit) {
610                iter->length=index; /* in case it was <0 or wrong */
611            }
612            if(iter->reservedField!=0) {
613                --index; /* we are in the middle of a supplementary code point */
614            }
615            iter->index=index;
616        }
617        return iter->index;
618    case UITER_LIMIT:
619    case UITER_LENGTH:
620        if(iter->length<0) {
621            const uint8_t *s;
622            UChar32 c;
623            int32_t i, limit, length;
624
625            s=(const uint8_t *)iter->context;
626            if(iter->index<0) {
627                /*
628                 * the current UTF-16 index is unknown after setState(),
629                 * we must first count from the beginning to here
630                 */
631                i=length=0;
632                limit=iter->start;
633
634                /* count from the beginning to the current index */
635                while(i<limit) {
636                    U8_NEXT(s, i, limit, c);
637                    if(c<=0xffff) {
638                        ++length;
639                    } else {
640                        length+=2;
641                    }
642                }
643
644                /* assume i==limit==iter->start, set the UTF-16 index */
645                iter->start=i; /* just in case setState() did not get us to a code point boundary */
646                iter->index= iter->reservedField!=0 ? length-1 : length;
647            } else {
648                i=iter->start;
649                length=iter->index;
650                if(iter->reservedField!=0) {
651                    ++length;
652                }
653            }
654
655            /* count from the current index to the end */
656            limit=iter->limit;
657            while(i<limit) {
658                U8_NEXT(s, i, limit, c);
659                if(c<=0xffff) {
660                    ++length;
661                } else {
662                    length+=2;
663                }
664            }
665            iter->length=length;
666        }
667        return iter->length;
668    default:
669        /* not a valid origin */
670        /* Should never get here! */
671        return -1;
672    }
673}
674
675static int32_t U_CALLCONV
676utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
677    const uint8_t *s;
678    UChar32 c;
679    int32_t pos; /* requested UTF-16 index */
680    int32_t i; /* UTF-8 index */
681    UBool havePos;
682
683    /* calculate the requested UTF-16 index */
684    switch(origin) {
685    case UITER_ZERO:
686    case UITER_START:
687        pos=delta;
688        havePos=TRUE;
689        /* iter->index<0 (unknown) is possible */
690        break;
691    case UITER_CURRENT:
692        if(iter->index>=0) {
693            pos=iter->index+delta;
694            havePos=TRUE;
695        } else {
696            /* the current UTF-16 index is unknown after setState(), use only delta */
697            pos=0;
698            havePos=FALSE;
699        }
700        break;
701    case UITER_LIMIT:
702    case UITER_LENGTH:
703        if(iter->length>=0) {
704            pos=iter->length+delta;
705            havePos=TRUE;
706        } else {
707            /* pin to the end, avoid counting the length */
708            iter->index=-1;
709            iter->start=iter->limit;
710            iter->reservedField=0;
711            if(delta>=0) {
712                return UITER_UNKNOWN_INDEX;
713            } else {
714                /* the current UTF-16 index is unknown, use only delta */
715                pos=0;
716                havePos=FALSE;
717            }
718        }
719        break;
720    default:
721        return -1;  /* Error */
722    }
723
724    if(havePos) {
725        /* shortcuts: pinning to the edges of the string */
726        if(pos<=0) {
727            iter->index=iter->start=iter->reservedField=0;
728            return 0;
729        } else if(iter->length>=0 && pos>=iter->length) {
730            iter->index=iter->length;
731            iter->start=iter->limit;
732            iter->reservedField=0;
733            return iter->index;
734        }
735
736        /* minimize the number of U8_NEXT/PREV operations */
737        if(iter->index<0 || pos<iter->index/2) {
738            /* go forward from the start instead of backward from the current index */
739            iter->index=iter->start=iter->reservedField=0;
740        } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
741            /*
742             * if we have the UTF-16 index and length and the new position is
743             * closer to the end than the current index,
744             * then go backward from the end instead of forward from the current index
745             */
746            iter->index=iter->length;
747            iter->start=iter->limit;
748            iter->reservedField=0;
749        }
750
751        delta=pos-iter->index;
752        if(delta==0) {
753            return iter->index; /* nothing to do */
754        }
755    } else {
756        /* move relative to unknown UTF-16 index */
757        if(delta==0) {
758            return UITER_UNKNOWN_INDEX; /* nothing to do */
759        } else if(-delta>=iter->start) {
760            /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
761            iter->index=iter->start=iter->reservedField=0;
762            return 0;
763        } else if(delta>=(iter->limit-iter->start)) {
764            /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
765            iter->index=iter->length; /* may or may not be <0 (unknown) */
766            iter->start=iter->limit;
767            iter->reservedField=0;
768            return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
769        }
770    }
771
772    /* delta!=0 */
773
774    /* move towards the requested position, pin to the edges of the string */
775    s=(const uint8_t *)iter->context;
776    pos=iter->index; /* could be <0 (unknown) */
777    i=iter->start;
778    if(delta>0) {
779        /* go forward */
780        int32_t limit=iter->limit;
781        if(iter->reservedField!=0) {
782            iter->reservedField=0;
783            ++pos;
784            --delta;
785        }
786        while(delta>0 && i<limit) {
787            U8_NEXT(s, i, limit, c);
788            if(c<0xffff) {
789                ++pos;
790                --delta;
791            } else if(delta>=2) {
792                pos+=2;
793                delta-=2;
794            } else /* delta==1 */ {
795                /* stop in the middle of a supplementary code point */
796                iter->reservedField=c;
797                ++pos;
798                break; /* delta=0; */
799            }
800        }
801        if(i==limit) {
802            if(iter->length<0 && iter->index>=0) {
803                iter->length= iter->reservedField==0 ? pos : pos+1;
804            } else if(iter->index<0 && iter->length>=0) {
805                iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
806            }
807        }
808    } else /* delta<0 */ {
809        /* go backward */
810        if(iter->reservedField!=0) {
811            iter->reservedField=0;
812            i-=4; /* we stayed behind the supplementary code point; go before it now */
813            --pos;
814            ++delta;
815        }
816        while(delta<0 && i>0) {
817            U8_PREV(s, 0, i, c);
818            if(c<0xffff) {
819                --pos;
820                ++delta;
821            } else if(delta<=-2) {
822                pos-=2;
823                delta+=2;
824            } else /* delta==-1 */ {
825                /* stop in the middle of a supplementary code point */
826                i+=4; /* back to behind this supplementary code point for consistent state */
827                iter->reservedField=c;
828                --pos;
829                break; /* delta=0; */
830            }
831        }
832    }
833
834    iter->start=i;
835    if(iter->index>=0) {
836        return iter->index=pos;
837    } else {
838        /* we started with index<0 (unknown) so pos is bogus */
839        if(i<=1) {
840            return iter->index=i; /* reached the beginning */
841        } else {
842            /* we still don't know the UTF-16 index */
843            return UITER_UNKNOWN_INDEX;
844        }
845    }
846}
847
848static UBool U_CALLCONV
849utf8IteratorHasNext(UCharIterator *iter) {
850    return iter->start<iter->limit || iter->reservedField!=0;
851}
852
853static UBool U_CALLCONV
854utf8IteratorHasPrevious(UCharIterator *iter) {
855    return iter->start>0;
856}
857
858static UChar32 U_CALLCONV
859utf8IteratorCurrent(UCharIterator *iter) {
860    if(iter->reservedField!=0) {
861        return U16_TRAIL(iter->reservedField);
862    } else if(iter->start<iter->limit) {
863        const uint8_t *s=(const uint8_t *)iter->context;
864        UChar32 c;
865        int32_t i=iter->start;
866
867        U8_NEXT(s, i, iter->limit, c);
868        if(c<0) {
869            return 0xfffd;
870        } else if(c<=0xffff) {
871            return c;
872        } else {
873            return U16_LEAD(c);
874        }
875    } else {
876        return U_SENTINEL;
877    }
878}
879
880static UChar32 U_CALLCONV
881utf8IteratorNext(UCharIterator *iter) {
882    int32_t index;
883
884    if(iter->reservedField!=0) {
885        UChar trail=U16_TRAIL(iter->reservedField);
886        iter->reservedField=0;
887        if((index=iter->index)>=0) {
888            iter->index=index+1;
889        }
890        return trail;
891    } else if(iter->start<iter->limit) {
892        const uint8_t *s=(const uint8_t *)iter->context;
893        UChar32 c;
894
895        U8_NEXT(s, iter->start, iter->limit, c);
896        if((index=iter->index)>=0) {
897            iter->index=++index;
898            if(iter->length<0 && iter->start==iter->limit) {
899                iter->length= c<=0xffff ? index : index+1;
900            }
901        } else if(iter->start==iter->limit && iter->length>=0) {
902            iter->index= c<=0xffff ? iter->length : iter->length-1;
903        }
904        if(c<0) {
905            return 0xfffd;
906        } else if(c<=0xffff) {
907            return c;
908        } else {
909            iter->reservedField=c;
910            return U16_LEAD(c);
911        }
912    } else {
913        return U_SENTINEL;
914    }
915}
916
917static UChar32 U_CALLCONV
918utf8IteratorPrevious(UCharIterator *iter) {
919    int32_t index;
920
921    if(iter->reservedField!=0) {
922        UChar lead=U16_LEAD(iter->reservedField);
923        iter->reservedField=0;
924        iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
925        if((index=iter->index)>0) {
926            iter->index=index-1;
927        }
928        return lead;
929    } else if(iter->start>0) {
930        const uint8_t *s=(const uint8_t *)iter->context;
931        UChar32 c;
932
933        U8_PREV(s, 0, iter->start, c);
934        if((index=iter->index)>0) {
935            iter->index=index-1;
936        } else if(iter->start<=1) {
937            iter->index= c<=0xffff ? iter->start : iter->start+1;
938        }
939        if(c<0) {
940            return 0xfffd;
941        } else if(c<=0xffff) {
942            return c;
943        } else {
944            iter->start+=4; /* back to behind this supplementary code point for consistent state */
945            iter->reservedField=c;
946            return U16_TRAIL(c);
947        }
948    } else {
949        return U_SENTINEL;
950    }
951}
952
953static uint32_t U_CALLCONV
954utf8IteratorGetState(const UCharIterator *iter) {
955    uint32_t state=(uint32_t)(iter->start<<1);
956    if(iter->reservedField!=0) {
957        state|=1;
958    }
959    return state;
960}
961
962static void U_CALLCONV
963utf8IteratorSetState(UCharIterator *iter,
964                     uint32_t state,
965                     UErrorCode *pErrorCode)
966{
967    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
968        /* do nothing */
969    } else if(iter==NULL) {
970        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
971    } else if(state==utf8IteratorGetState(iter)) {
972        /* setting to the current state: no-op */
973    } else {
974        int32_t index=(int32_t)(state>>1); /* UTF-8 index */
975        state&=1; /* 1 if in surrogate pair, must be index>=4 */
976
977        if((state==0 ? index<0 : index<4) || iter->limit<index) {
978            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
979        } else {
980            iter->start=index; /* restore UTF-8 byte index */
981            if(index<=1) {
982                iter->index=index;
983            } else {
984                iter->index=-1; /* unknown UTF-16 index */
985            }
986            if(state==0) {
987                iter->reservedField=0;
988            } else {
989                /* verified index>=4 above */
990                UChar32 c;
991                U8_PREV((const uint8_t *)iter->context, 0, index, c);
992                if(c<=0xffff) {
993                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
994                } else {
995                    iter->reservedField=c;
996                }
997            }
998        }
999    }
1000}
1001
1002static const UCharIterator utf8Iterator={
1003    0, 0, 0, 0, 0, 0,
1004    utf8IteratorGetIndex,
1005    utf8IteratorMove,
1006    utf8IteratorHasNext,
1007    utf8IteratorHasPrevious,
1008    utf8IteratorCurrent,
1009    utf8IteratorNext,
1010    utf8IteratorPrevious,
1011    NULL,
1012    utf8IteratorGetState,
1013    utf8IteratorSetState
1014};
1015
1016U_CAPI void U_EXPORT2
1017uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1018    if(iter!=0) {
1019        if(s!=0 && length>=-1) {
1020            *iter=utf8Iterator;
1021            iter->context=s;
1022            if(length>=0) {
1023                iter->limit=length;
1024            } else {
1025                iter->limit=(int32_t)uprv_strlen(s);
1026            }
1027            iter->length= iter->limit<=1 ? iter->limit : -1;
1028        } else {
1029            *iter=noopIterator;
1030        }
1031    }
1032}
1033
1034/* Helper functions --------------------------------------------------------- */
1035
1036U_CAPI UChar32 U_EXPORT2
1037uiter_current32(UCharIterator *iter) {
1038    UChar32 c, c2;
1039
1040    c=iter->current(iter);
1041    if(UTF_IS_SURROGATE(c)) {
1042        if(UTF_IS_SURROGATE_FIRST(c)) {
1043            /*
1044             * go to the next code unit
1045             * we know that we are not at the limit because c!=U_SENTINEL
1046             */
1047            iter->move(iter, 1, UITER_CURRENT);
1048            if(UTF_IS_SECOND_SURROGATE(c2=iter->current(iter))) {
1049                c=UTF16_GET_PAIR_VALUE(c, c2);
1050            }
1051
1052            /* undo index movement */
1053            iter->move(iter, -1, UITER_CURRENT);
1054        } else {
1055            if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
1056                c=UTF16_GET_PAIR_VALUE(c2, c);
1057            }
1058            if(c2>=0) {
1059                /* undo index movement */
1060                iter->move(iter, 1, UITER_CURRENT);
1061            }
1062        }
1063    }
1064    return c;
1065}
1066
1067U_CAPI UChar32 U_EXPORT2
1068uiter_next32(UCharIterator *iter) {
1069    UChar32 c, c2;
1070
1071    c=iter->next(iter);
1072    if(UTF_IS_FIRST_SURROGATE(c)) {
1073        if(UTF_IS_SECOND_SURROGATE(c2=iter->next(iter))) {
1074            c=UTF16_GET_PAIR_VALUE(c, c2);
1075        } else if(c2>=0) {
1076            /* unmatched first surrogate, undo index movement */
1077            iter->move(iter, -1, UITER_CURRENT);
1078        }
1079    }
1080    return c;
1081}
1082
1083U_CAPI UChar32 U_EXPORT2
1084uiter_previous32(UCharIterator *iter) {
1085    UChar32 c, c2;
1086
1087    c=iter->previous(iter);
1088    if(UTF_IS_SECOND_SURROGATE(c)) {
1089        if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
1090            c=UTF16_GET_PAIR_VALUE(c2, c);
1091        } else if(c2>=0) {
1092            /* unmatched second surrogate, undo index movement */
1093            iter->move(iter, 1, UITER_CURRENT);
1094        }
1095    }
1096    return c;
1097}
1098
1099U_CAPI uint32_t U_EXPORT2
1100uiter_getState(const UCharIterator *iter) {
1101    if(iter==NULL || iter->getState==NULL) {
1102        return UITER_NO_STATE;
1103    } else {
1104        return iter->getState(iter);
1105    }
1106}
1107
1108U_CAPI void U_EXPORT2
1109uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1110    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1111        /* do nothing */
1112    } else if(iter==NULL) {
1113        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1114    } else if(iter->setState==NULL) {
1115        *pErrorCode=U_UNSUPPORTED_ERROR;
1116    } else {
1117        iter->setState(iter, state, pErrorCode);
1118    }
1119}
1120
1121U_CDECL_END
1122