1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2006, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  uit_len8.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003feb10
14*   created by: Markus W. Scherer
15*
16*   This file contains the implementation of the "lenient UTF-8" UCharIterator
17*   as used in the uciter8 sample code.
18*   UTF-8-style macros are defined as well as the UCharIterator.
19*   The macros are incomplete (do not assemble code points from pairs of
20*   surrogates, see comment below)
21*   but sufficient for the iterator.
22*/
23
24#include <string.h>
25#include "unicode/utypes.h"
26#include "unicode/uiter.h"
27
28/* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
29
30/*
31 * This code leniently reads 8-bit Unicode strings,
32 * which could contain a mix of UTF-8 and CESU-8.
33 * More precisely:
34 * - supplementary code points may be encoded with dedicated 4-byte sequences
35 *   (UTF-8 style)
36 * - supplementary code points may be encoded with
37 *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
38 *   (CESU-8 style)
39 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
40 *
41 * Limitation:
42 * Right now, the macros do not attempt to assemble code points from pairs of
43 * separately encoded surrogates.
44 * This would not be sufficient for processing based on these macros,
45 * but it is sufficient for a UCharIterator that returns only UChars anyway.
46 *
47 * The code is copied and modified from utf_impl.c and utf8.h.
48 *
49 * Change 2006feb08: Much of the implementation code is replaced by calling
50 * the utf_impl.c functions which accept a new "strict" parameter value
51 * of -2 implementing exactly this leniency.
52 */
53
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 * Reads the byte at s[i] into c and post-increments i.
 * A byte below 0x80 is the result as is.
 * For a lead byte (U8_IS_LEAD), c is replaced by the result of
 * utf8_nextCharSafeBody(), which receives &i, length and strict=-2.
 * Any other byte >=0x80 turns c into U_SENTINEL.
 * Note: i and c are evaluated more than once and must be lvalues.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        (c)= U8_IS_LEAD(c) ? \
            utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), c, -2) : \
            U_SENTINEL; \
    } \
}
64
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 * Pre-decrements i and reads the byte at s[i] into c.
 * A byte below 0x80 is the result as is.
 * For a byte in 0x80..0xbf, c is replaced by the result of
 * utf8_prevCharSafeBody(), which receives start, &i and strict=-2.
 * Any byte above 0xbf turns c into U_SENTINEL.
 * Note: i and c are evaluated more than once and must be lvalues.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        (c)= (c)<=0xbf ? \
            utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), c, -2) : \
            U_SENTINEL; \
    } \
}
75
76/* lenient-8 UCharIterator -------------------------------------------------- */
77
78/*
79 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
80 * except that it uses the lenient-8-bit-Unicode macros above.
81 */
82
83/*
84 * Minimal implementation:
85 * Maintain a single-UChar buffer for an additional surrogate.
86 * The caller must not modify start and limit because they are used internally.
87 *
88 * Use UCharIterator fields as follows:
89 *   context        pointer to UTF-8 string
90 *   length         UTF-16 length of the string; -1 until lazy evaluation
91 *   start          current UTF-8 index
92 *   index          current UTF-16 index; may be -1="unknown" after setState()
93 *   limit          UTF-8 length of the string
94 *   reservedField  supplementary code point
95 *
96 * Since UCharIterator delivers 16-bit code units, the iteration can be
97 * currently in the middle of the byte sequence for a supplementary code point.
98 * In this case, reservedField will contain that code point and start will
99 * point to after the corresponding byte sequence. The UTF-16 index will be
100 * one less than what it would otherwise be corresponding to the UTF-8 index.
101 * Otherwise, reservedField will be 0.
102 */
103
104/*
105 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
106 * Add implementations that do not call strlen() for iteration but check for NUL.
107 */
108
109static int32_t U_CALLCONV
110lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
111    switch(origin) {
112    case UITER_ZERO:
113    case UITER_START:
114        return 0;
115    case UITER_CURRENT:
116        if(iter->index<0) {
117            /* the current UTF-16 index is unknown after setState(), count from the beginning */
118            const uint8_t *s;
119            UChar32 c;
120            int32_t i, limit, index;
121
122            s=(const uint8_t *)iter->context;
123            i=index=0;
124            limit=iter->start; /* count up to the UTF-8 index */
125            while(i<limit) {
126                L8_NEXT(s, i, limit, c);
127                if(c<=0xffff) {
128                    ++index;
129                } else {
130                    index+=2;
131                }
132            }
133
134            iter->start=i; /* just in case setState() did not get us to a code point boundary */
135            if(i==iter->limit) {
136                iter->length=index; /* in case it was <0 or wrong */
137            }
138            if(iter->reservedField!=0) {
139                --index; /* we are in the middle of a supplementary code point */
140            }
141            iter->index=index;
142        }
143        return iter->index;
144    case UITER_LIMIT:
145    case UITER_LENGTH:
146        if(iter->length<0) {
147            const uint8_t *s;
148            UChar32 c;
149            int32_t i, limit, length;
150
151            s=(const uint8_t *)iter->context;
152            if(iter->index<0) {
153                /*
154                 * the current UTF-16 index is unknown after setState(),
155                 * we must first count from the beginning to here
156                 */
157                i=length=0;
158                limit=iter->start;
159
160                /* count from the beginning to the current index */
161                while(i<limit) {
162                    L8_NEXT(s, i, limit, c);
163                    if(c<=0xffff) {
164                        ++length;
165                    } else {
166                        length+=2;
167                    }
168                }
169
170                /* assume i==limit==iter->start, set the UTF-16 index */
171                iter->start=i; /* just in case setState() did not get us to a code point boundary */
172                iter->index= iter->reservedField!=0 ? length-1 : length;
173            } else {
174                i=iter->start;
175                length=iter->index;
176                if(iter->reservedField!=0) {
177                    ++length;
178                }
179            }
180
181            /* count from the current index to the end */
182            limit=iter->limit;
183            while(i<limit) {
184                L8_NEXT(s, i, limit, c);
185                if(c<=0xffff) {
186                    ++length;
187                } else {
188                    length+=2;
189                }
190            }
191            iter->length=length;
192        }
193        return iter->length;
194    default:
195        /* not a valid origin */
196        /* Should never get here! */
197        return -1;
198    }
199}
200
201static int32_t U_CALLCONV
202lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
203    const uint8_t *s;
204    UChar32 c;
205    int32_t pos; /* requested UTF-16 index */
206    int32_t i; /* UTF-8 index */
207    UBool havePos;
208
209    /* calculate the requested UTF-16 index */
210    switch(origin) {
211    case UITER_ZERO:
212    case UITER_START:
213        pos=delta;
214        havePos=TRUE;
215        /* iter->index<0 (unknown) is possible */
216        break;
217    case UITER_CURRENT:
218        if(iter->index>=0) {
219            pos=iter->index+delta;
220            havePos=TRUE;
221        } else {
222            /* the current UTF-16 index is unknown after setState(), use only delta */
223            pos=0;
224            havePos=FALSE;
225        }
226        break;
227    case UITER_LIMIT:
228    case UITER_LENGTH:
229        if(iter->length>=0) {
230            pos=iter->length+delta;
231            havePos=TRUE;
232        } else {
233            /* pin to the end, avoid counting the length */
234            iter->index=-1;
235            iter->start=iter->limit;
236            iter->reservedField=0;
237            if(delta>=0) {
238                return UITER_UNKNOWN_INDEX;
239            } else {
240                /* the current UTF-16 index is unknown, use only delta */
241                pos=0;
242                havePos=FALSE;
243            }
244        }
245        break;
246    default:
247        return -1;  /* Error */
248    }
249
250    if(havePos) {
251        /* shortcuts: pinning to the edges of the string */
252        if(pos<=0) {
253            iter->index=iter->start=iter->reservedField=0;
254            return 0;
255        } else if(iter->length>=0 && pos>=iter->length) {
256            iter->index=iter->length;
257            iter->start=iter->limit;
258            iter->reservedField=0;
259            return iter->index;
260        }
261
262        /* minimize the number of L8_NEXT/PREV operations */
263        if(iter->index<0 || pos<iter->index/2) {
264            /* go forward from the start instead of backward from the current index */
265            iter->index=iter->start=iter->reservedField=0;
266        } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
267            /*
268             * if we have the UTF-16 index and length and the new position is
269             * closer to the end than the current index,
270             * then go backward from the end instead of forward from the current index
271             */
272            iter->index=iter->length;
273            iter->start=iter->limit;
274            iter->reservedField=0;
275        }
276
277        delta=pos-iter->index;
278        if(delta==0) {
279            return iter->index; /* nothing to do */
280        }
281    } else {
282        /* move relative to unknown UTF-16 index */
283        if(delta==0) {
284            return UITER_UNKNOWN_INDEX; /* nothing to do */
285        } else if(-delta>=iter->start) {
286            /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
287            iter->index=iter->start=iter->reservedField=0;
288            return 0;
289        } else if(delta>=(iter->limit-iter->start)) {
290            /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
291            iter->index=iter->length; /* may or may not be <0 (unknown) */
292            iter->start=iter->limit;
293            iter->reservedField=0;
294            return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
295        }
296    }
297
298    /* delta!=0 */
299
300    /* move towards the requested position, pin to the edges of the string */
301    s=(const uint8_t *)iter->context;
302    pos=iter->index; /* could be <0 (unknown) */
303    i=iter->start;
304    if(delta>0) {
305        /* go forward */
306        int32_t limit=iter->limit;
307        if(iter->reservedField!=0) {
308            iter->reservedField=0;
309            ++pos;
310            --delta;
311        }
312        while(delta>0 && i<limit) {
313            L8_NEXT(s, i, limit, c);
314            if(c<0xffff) {
315                ++pos;
316                --delta;
317            } else if(delta>=2) {
318                pos+=2;
319                delta-=2;
320            } else /* delta==1 */ {
321                /* stop in the middle of a supplementary code point */
322                iter->reservedField=c;
323                ++pos;
324                break; /* delta=0; */
325            }
326        }
327        if(i==limit) {
328            if(iter->length<0 && iter->index>=0) {
329                iter->length= iter->reservedField==0 ? pos : pos+1;
330            } else if(iter->index<0 && iter->length>=0) {
331                iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
332            }
333        }
334    } else /* delta<0 */ {
335        /* go backward */
336        if(iter->reservedField!=0) {
337            iter->reservedField=0;
338            i-=4; /* we stayed behind the supplementary code point; go before it now */
339            --pos;
340            ++delta;
341        }
342        while(delta<0 && i>0) {
343            L8_PREV(s, 0, i, c);
344            if(c<0xffff) {
345                --pos;
346                ++delta;
347            } else if(delta<=-2) {
348                pos-=2;
349                delta+=2;
350            } else /* delta==-1 */ {
351                /* stop in the middle of a supplementary code point */
352                i+=4; /* back to behind this supplementary code point for consistent state */
353                iter->reservedField=c;
354                --pos;
355                break; /* delta=0; */
356            }
357        }
358    }
359
360    iter->start=i;
361    if(iter->index>=0) {
362        return iter->index=pos;
363    } else {
364        /* we started with index<0 (unknown) so pos is bogus */
365        if(i<=1) {
366            return iter->index=i; /* reached the beginning */
367        } else {
368            /* we still don't know the UTF-16 index */
369            return UITER_UNKNOWN_INDEX;
370        }
371    }
372}
373
374static UBool U_CALLCONV
375lenient8IteratorHasNext(UCharIterator *iter) {
376    return iter->reservedField!=0 || iter->start<iter->limit;
377}
378
379static UBool U_CALLCONV
380lenient8IteratorHasPrevious(UCharIterator *iter) {
381    return iter->start>0;
382}
383
384static UChar32 U_CALLCONV
385lenient8IteratorCurrent(UCharIterator *iter) {
386    if(iter->reservedField!=0) {
387        return U16_TRAIL(iter->reservedField);
388    } else if(iter->start<iter->limit) {
389        const uint8_t *s=(const uint8_t *)iter->context;
390        UChar32 c;
391        int32_t i=iter->start;
392
393        L8_NEXT(s, i, iter->limit, c);
394        if(c<0) {
395            return 0xfffd;
396        } else if(c<=0xffff) {
397            return c;
398        } else {
399            return U16_LEAD(c);
400        }
401    } else {
402        return U_SENTINEL;
403    }
404}
405
406static UChar32 U_CALLCONV
407lenient8IteratorNext(UCharIterator *iter) {
408    int32_t index;
409
410    if(iter->reservedField!=0) {
411        UChar trail=U16_TRAIL(iter->reservedField);
412        iter->reservedField=0;
413        if((index=iter->index)>=0) {
414            iter->index=index+1;
415        }
416        return trail;
417    } else if(iter->start<iter->limit) {
418        const uint8_t *s=(const uint8_t *)iter->context;
419        UChar32 c;
420
421        L8_NEXT(s, iter->start, iter->limit, c);
422        if((index=iter->index)>=0) {
423            iter->index=++index;
424            if(iter->length<0 && iter->start==iter->limit) {
425                iter->length= c<=0xffff ? index : index+1;
426            }
427        } else if(iter->start==iter->limit && iter->length>=0) {
428            iter->index= c<=0xffff ? iter->length : iter->length-1;
429        }
430        if(c<0) {
431            return 0xfffd;
432        } else if(c<=0xffff) {
433            return c;
434        } else {
435            iter->reservedField=c;
436            return U16_LEAD(c);
437        }
438    } else {
439        return U_SENTINEL;
440    }
441}
442
443static UChar32 U_CALLCONV
444lenient8IteratorPrevious(UCharIterator *iter) {
445    int32_t index;
446
447    if(iter->reservedField!=0) {
448        UChar lead=U16_LEAD(iter->reservedField);
449        iter->reservedField=0;
450        iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
451        if((index=iter->index)>0) {
452            iter->index=index-1;
453        }
454        return lead;
455    } else if(iter->start>0) {
456        const uint8_t *s=(const uint8_t *)iter->context;
457        UChar32 c;
458
459        L8_PREV(s, 0, iter->start, c);
460        if((index=iter->index)>0) {
461            iter->index=index-1;
462        } else if(iter->start<=1) {
463            iter->index= c<=0xffff ? iter->start : iter->start+1;
464        }
465        if(c<0) {
466            return 0xfffd;
467        } else if(c<=0xffff) {
468            return c;
469        } else {
470            iter->start+=4; /* back to behind this supplementary code point for consistent state */
471            iter->reservedField=c;
472            return U16_TRAIL(c);
473        }
474    } else {
475        return U_SENTINEL;
476    }
477}
478
479static uint32_t U_CALLCONV
480lenient8IteratorGetState(const UCharIterator *iter) {
481    uint32_t state=(uint32_t)(iter->start<<1);
482    if(iter->reservedField!=0) {
483        state|=1;
484    }
485    return state;
486}
487
488static void U_CALLCONV
489lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
490    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
491        /* do nothing */
492    } else if(iter==NULL) {
493        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
494    } else if(state==lenient8IteratorGetState(iter)) {
495        /* setting to the current state: no-op */
496    } else {
497        int32_t index=(int32_t)(state>>1); /* UTF-8 index */
498        state&=1; /* 1 if in surrogate pair, must be index>=4 */
499
500        if((state==0 ? index<0 : index<4) || iter->limit<index) {
501            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
502        } else {
503            iter->start=index; /* restore UTF-8 byte index */
504            if(index<=1) {
505                iter->index=index;
506            } else {
507                iter->index=-1; /* unknown UTF-16 index */
508            }
509            if(state==0) {
510                iter->reservedField=0;
511            } else {
512                /* verified index>=4 above */
513                UChar32 c;
514                L8_PREV((const uint8_t *)iter->context, 0, index, c);
515                if(c<=0xffff) {
516                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
517                } else {
518                    iter->reservedField=c;
519                }
520            }
521        }
522    }
523}
524
/*
 * Template iterator for lenient 8-bit Unicode strings.
 * uiter_setLenient8() copies this whole struct into the caller's iterator and
 * then fills in context, limit and length.
 * The six leading zeros initialize the data fields; presumably these are
 * context, length, start, index, limit and reservedField in the declaration
 * order of UCharIterator (TODO confirm against unicode/uiter.h).
 * The remaining members are the callbacks implemented in this file.
 */
static const UCharIterator lenient8Iterator={
    0, 0, 0, 0, 0, 0,
    lenient8IteratorGetIndex,
    lenient8IteratorMove,
    lenient8IteratorHasNext,
    lenient8IteratorHasPrevious,
    lenient8IteratorCurrent,
    lenient8IteratorNext,
    lenient8IteratorPrevious,
    NULL, /* no function installed in this slot */
    lenient8IteratorGetState,
    lenient8IteratorSetState
};
538
539U_CAPI void U_EXPORT2
540uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
541    if(iter!=0) {
542        if(s!=0 && length>=-1) {
543            *iter=lenient8Iterator;
544            iter->context=s;
545            if(length>=0) {
546                iter->limit=length;
547            } else {
548                iter->limit=strlen(s);
549            }
550            iter->length= iter->limit<=1 ? iter->limit : -1;
551        } else {
552            /* set no-op iterator */
553            uiter_setString(iter, NULL, 0);
554        }
555    }
556}
557