1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2008, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  unorm_it.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003jan21
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/uiter.h"
22#include "unicode/unorm.h"
23#include "unorm_it.h"
24#include "cmemory.h"
25
26/* UNormIterator ------------------------------------------------------------ */
27
28enum {
29    INITIAL_CAPACITY=100
30};
31
32struct UNormIterator {
33    UCharIterator api;
34    UCharIterator *iter;
35
36    /*
37     * chars and states either use the static buffers
38     * or are allocated in the same memory block
39     *
40     * They are parallel arrays with states[] holding the getState() values
41     * from normalization boundaries, and UITER_NO_STATE in between.
42     */
43    UChar *chars;
44    uint32_t *states;
45
46    /*
47     * api.start: first valid character & state in the arrays
48     * api.index: current position
49     * api.limit: one past the last valid character in chars[], but states[limit] is valid
50     * capacity: length of allocated arrays
51     */
52    int32_t capacity;
53
54    /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
55    uint32_t state;
56
57    /* there are UChars available before start or after limit? */
58    UBool hasPrevious, hasNext, isStackAllocated;
59
60    UNormalizationMode mode;
61
62    UChar charsBuffer[INITIAL_CAPACITY];
63    uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
64};
65
66static void
67initIndexes(UNormIterator *uni, UCharIterator *iter) {
68    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
69    UCharIterator *api=&uni->api;
70
71    if(!iter->hasPrevious(iter)) {
72        /* set indexes to the beginning of the arrays */
73        api->start=api->index=api->limit=0;
74        uni->hasPrevious=FALSE;
75        uni->hasNext=iter->hasNext(iter);
76    } else if(!iter->hasNext(iter)) {
77        /* set indexes to the end of the arrays */
78        api->start=api->index=api->limit=uni->capacity;
79        uni->hasNext=FALSE;
80        uni->hasPrevious=iter->hasPrevious(iter);
81    } else {
82        /* set indexes into the middle of the arrays */
83        api->start=api->index=api->limit=uni->capacity/2;
84        uni->hasPrevious=uni->hasNext=TRUE;
85    }
86}
87
88static UBool
89reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
90    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
91    UCharIterator *api=&uni->api;
92
93    uint32_t *states;
94    UChar *chars;
95    int32_t start, limit;
96
97    states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2);
98    if(states==NULL) {
99        return FALSE;
100    }
101
102    chars=(UChar *)(states+(capacity+1));
103    uni->capacity=capacity;
104
105    start=api->start;
106    limit=api->limit;
107
108    if(addAtStart) {
109        /* copy old contents to the end of the new arrays */
110        int32_t delta;
111
112        delta=capacity-uni->capacity;
113        uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4);
114        uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4);
115
116        api->start=start+delta;
117        api->index+=delta;
118        api->limit=limit+delta;
119    } else {
120        /* copy old contents to the beginning of the new arrays */
121        uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4);
122        uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4);
123    }
124
125    uni->chars=chars;
126    uni->states=states;
127
128    return TRUE;
129}
130
131static void
132moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
133    /* move array contents up to make room */
134    int32_t srcIndex, destIndex, limit;
135
136    limit=api->limit;
137    srcIndex=delta;
138    if(srcIndex>api->start) {
139        /* look for a position in the arrays with a known state */
140        while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
141            ++srcIndex;
142        }
143    }
144
145    /* now actually move the array contents */
146    api->start=destIndex=0;
147    while(srcIndex<limit) {
148        chars[destIndex]=chars[srcIndex];
149        states[destIndex++]=states[srcIndex++];
150    }
151
152    /* copy states[limit] as well! */
153    states[destIndex]=states[srcIndex];
154
155    api->limit=destIndex;
156}
157
158static void
159moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
160    /* move array contents up to make room */
161    int32_t srcIndex, destIndex, start;
162
163    start=api->start;
164    destIndex=((UNormIterator *)api)->capacity;
165    srcIndex=destIndex-delta;
166    if(srcIndex<api->limit) {
167        /* look for a position in the arrays with a known state */
168        while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
169            --srcIndex;
170        }
171    }
172
173    /* now actually move the array contents */
174    api->limit=destIndex;
175
176    /* copy states[limit] as well! */
177    states[destIndex]=states[srcIndex];
178
179    while(srcIndex>start) {
180        chars[--destIndex]=chars[--srcIndex];
181        states[destIndex]=states[srcIndex];
182    }
183
184    api->start=destIndex;
185}
186
187/* normalize forward from the limit, assume hasNext is true */
188static UBool
189readNext(UNormIterator *uni, UCharIterator *iter) {
190    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
191    UCharIterator *api=&uni->api;
192
193    /* make capacity/4 room at the end of the arrays */
194    int32_t limit, capacity, room;
195    UErrorCode errorCode;
196
197    limit=api->limit;
198    capacity=uni->capacity;
199    room=capacity/4;
200    if(room>(capacity-limit)) {
201        /* move array contents to make room */
202        moveContentsTowardStart(api, uni->chars, uni->states, room);
203        api->index=limit=api->limit;
204        uni->hasPrevious=TRUE;
205    }
206
207    /* normalize starting from the limit position */
208    errorCode=U_ZERO_ERROR;
209    if(uni->state!=uni->states[limit]) {
210        uiter_setState(iter, uni->states[limit], &errorCode);
211        if(U_FAILURE(errorCode)) {
212            uni->state=UITER_NO_STATE;
213            uni->hasNext=FALSE;
214            return FALSE;
215        }
216    }
217
218    room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
219    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
220        if(room<=capacity) {
221            /* empty and re-use the arrays */
222            uni->states[0]=uni->states[limit];
223            api->start=api->index=api->limit=limit=0;
224            uni->hasPrevious=TRUE;
225        } else {
226            capacity+=room+100;
227            if(!reallocArrays(uni, capacity, FALSE)) {
228                uni->state=UITER_NO_STATE;
229                uni->hasNext=FALSE;
230                return FALSE;
231            }
232            limit=api->limit;
233        }
234
235        errorCode=U_ZERO_ERROR;
236        uiter_setState(iter, uni->states[limit], &errorCode);
237        room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
238    }
239    if(U_FAILURE(errorCode) || room==0) {
240        uni->state=UITER_NO_STATE;
241        uni->hasNext=FALSE;
242        return FALSE;
243    }
244
245    /* room>0 */
246    ++limit; /* leave the known states[limit] alone */
247    for(--room; room>0; --room) {
248        /* set unknown states for all but the normalization boundaries */
249        uni->states[limit++]=UITER_NO_STATE;
250    }
251    uni->states[limit]=uni->state=uiter_getState(iter);
252    uni->hasNext=iter->hasNext(iter);
253    api->limit=limit;
254    return TRUE;
255}
256
257/* normalize backward from the start, assume hasPrevious is true */
258static UBool
259readPrevious(UNormIterator *uni, UCharIterator *iter) {
260    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
261    UCharIterator *api=&uni->api;
262
263    /* make capacity/4 room at the start of the arrays */
264    int32_t start, capacity, room;
265    UErrorCode errorCode;
266
267    start=api->start;
268    capacity=uni->capacity;
269    room=capacity/4;
270    if(room>start) {
271        /* move array contents to make room */
272        moveContentsTowardEnd(api, uni->chars, uni->states, room);
273        api->index=start=api->start;
274        uni->hasNext=TRUE;
275    }
276
277    /* normalize ending at the start position */
278    errorCode=U_ZERO_ERROR;
279    if(uni->state!=uni->states[start]) {
280        uiter_setState(iter, uni->states[start], &errorCode);
281        if(U_FAILURE(errorCode)) {
282            uni->state=UITER_NO_STATE;
283            uni->hasPrevious=FALSE;
284            return FALSE;
285        }
286    }
287
288    room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
289    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
290        if(room<=capacity) {
291            /* empty and re-use the arrays */
292            uni->states[capacity]=uni->states[start];
293            api->start=api->index=api->limit=start=capacity;
294            uni->hasNext=TRUE;
295        } else {
296            capacity+=room+100;
297            if(!reallocArrays(uni, capacity, TRUE)) {
298                uni->state=UITER_NO_STATE;
299                uni->hasPrevious=FALSE;
300                return FALSE;
301            }
302            start=api->start;
303        }
304
305        errorCode=U_ZERO_ERROR;
306        uiter_setState(iter, uni->states[start], &errorCode);
307        room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
308    }
309    if(U_FAILURE(errorCode) || room==0) {
310        uni->state=UITER_NO_STATE;
311        uni->hasPrevious=FALSE;
312        return FALSE;
313    }
314
315    /* room>0 */
316    do {
317        /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
318        uni->chars[--start]=uni->chars[--room];
319        /* set unknown states for all but the normalization boundaries */
320        uni->states[start]=UITER_NO_STATE;
321    } while(room>0);
322    uni->states[start]=uni->state=uiter_getState(iter);
323    uni->hasPrevious=iter->hasPrevious(iter);
324    api->start=start;
325    return TRUE;
326}
327
328/* Iterator runtime API functions ------------------------------------------- */
329
330static int32_t U_CALLCONV
331unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
332    switch(origin) {
333    case UITER_ZERO:
334    case UITER_START:
335        return 0;
336    case UITER_CURRENT:
337    case UITER_LIMIT:
338    case UITER_LENGTH:
339        return UITER_UNKNOWN_INDEX;
340    default:
341        /* not a valid origin */
342        /* Should never get here! */
343        return -1;
344    }
345}
346
347static int32_t U_CALLCONV
348unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
349    UNormIterator *uni=(UNormIterator *)api;
350    UCharIterator *iter=uni->iter;
351    int32_t pos;
352
353    switch(origin) {
354    case UITER_ZERO:
355    case UITER_START:
356        /* restart from the beginning */
357        if(uni->hasPrevious) {
358            iter->move(iter, 0, UITER_START);
359            api->start=api->index=api->limit=0;
360            uni->states[api->limit]=uni->state=uiter_getState(iter);
361            uni->hasPrevious=FALSE;
362            uni->hasNext=iter->hasNext(iter);
363        } else {
364            /* we already have the beginning of the normalized text */
365            api->index=api->start;
366        }
367        break;
368    case UITER_CURRENT:
369        break;
370    case UITER_LIMIT:
371    case UITER_LENGTH:
372        /* restart from the end */
373        if(uni->hasNext) {
374            iter->move(iter, 0, UITER_LIMIT);
375            api->start=api->index=api->limit=uni->capacity;
376            uni->states[api->limit]=uni->state=uiter_getState(iter);
377            uni->hasPrevious=iter->hasPrevious(iter);
378            uni->hasNext=FALSE;
379        } else {
380            /* we already have the end of the normalized text */
381            api->index=api->limit;
382        }
383        break;
384    default:
385        return -1;  /* Error */
386    }
387
388    /* move relative to the current position by delta normalized UChars */
389    if(delta==0) {
390        /* nothing to do */
391    } else if(delta>0) {
392        /* go forward until the requested position is in the buffer */
393        for(;;) {
394            pos=api->index+delta;   /* requested position */
395            delta=pos-api->limit;   /* remainder beyond buffered text */
396            if(delta<=0) {
397                api->index=pos;     /* position reached */
398                break;
399            }
400
401            /* go to end of buffer and normalize further */
402            api->index=api->limit;
403            if(!uni->hasNext || !readNext(uni, iter)) {
404                break;              /* reached end of text */
405            }
406        }
407    } else /* delta<0 */ {
408        /* go backward until the requested position is in the buffer */
409        for(;;) {
410            pos=api->index+delta;   /* requested position */
411            delta=pos-api->start;   /* remainder beyond buffered text */
412            if(delta>=0) {
413                api->index=pos;     /* position reached */
414                break;
415            }
416
417            /* go to start of buffer and normalize further */
418            api->index=api->start;
419            if(!uni->hasPrevious || !readPrevious(uni, iter)) {
420                break;              /* reached start of text */
421            }
422        }
423    }
424
425    if(api->index==api->start && !uni->hasPrevious) {
426        return 0;
427    } else {
428        return UITER_UNKNOWN_INDEX;
429    }
430}
431
432static UBool U_CALLCONV
433unormIteratorHasNext(UCharIterator *api) {
434    return api->index<api->limit || ((UNormIterator *)api)->hasNext;
435}
436
437static UBool U_CALLCONV
438unormIteratorHasPrevious(UCharIterator *api) {
439    return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
440}
441
442static UChar32 U_CALLCONV
443unormIteratorCurrent(UCharIterator *api) {
444    UNormIterator *uni=(UNormIterator *)api;
445
446    if( api->index<api->limit ||
447        (uni->hasNext && readNext(uni, uni->iter))
448    ) {
449        return uni->chars[api->index];
450    } else {
451        return U_SENTINEL;
452    }
453}
454
455static UChar32 U_CALLCONV
456unormIteratorNext(UCharIterator *api) {
457    UNormIterator *uni=(UNormIterator *)api;
458
459    if( api->index<api->limit ||
460        (uni->hasNext && readNext(uni, uni->iter))
461    ) {
462        return uni->chars[api->index++];
463    } else {
464        return U_SENTINEL;
465    }
466}
467
468static UChar32 U_CALLCONV
469unormIteratorPrevious(UCharIterator *api) {
470    UNormIterator *uni=(UNormIterator *)api;
471
472    if( api->index>api->start ||
473        (uni->hasPrevious && readPrevious(uni, uni->iter))
474    ) {
475        return uni->chars[--api->index];
476    } else {
477        return U_SENTINEL;
478    }
479}
480
481static uint32_t U_CALLCONV
482unormIteratorGetState(const UCharIterator *api) {
483    /* not uni->state because that may not be at api->index */
484    return ((UNormIterator *)api)->states[api->index];
485}
486
487static void U_CALLCONV
488unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
489    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
490        /* do nothing */
491    } else if(api==NULL) {
492        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
493    } else if(state==UITER_NO_STATE) {
494        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
495    } else {
496        UNormIterator *uni=(UNormIterator *)api;
497        UCharIterator *iter=((UNormIterator *)api)->iter;
498        if(state!=uni->state) {
499            uni->state=state;
500            uiter_setState(iter, state, pErrorCode);
501        }
502
503        /*
504         * Try shortcuts: If the requested state is in the array contents
505         * then just set the index there.
506         *
507         * We assume that the state is unique per position!
508         */
509        if(state==uni->states[api->index]) {
510            return;
511        } else if(state==uni->states[api->limit]) {
512            api->index=api->limit;
513            return;
514        } else {
515            /* search for the index with this state */
516            int32_t i;
517
518            for(i=api->start; i<api->limit; ++i) {
519                if(state==uni->states[i]) {
520                    api->index=i;
521                    return;
522                }
523            }
524        }
525
526        /* there is no array index for this state, reset for fresh contents */
527        initIndexes((UNormIterator *)api, iter);
528        uni->states[api->limit]=state;
529    }
530}
531
532static const UCharIterator unormIterator={
533    NULL, 0, 0, 0, 0, 0,
534    unormIteratorGetIndex,
535    unormIteratorMove,
536    unormIteratorHasNext,
537    unormIteratorHasPrevious,
538    unormIteratorCurrent,
539    unormIteratorNext,
540    unormIteratorPrevious,
541    NULL,
542    unormIteratorGetState,
543    unormIteratorSetState
544};
545
546/* Setup functions ---------------------------------------------------------- */
547
548U_CAPI UNormIterator * U_EXPORT2
549unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
550    UNormIterator *uni;
551
552    /* argument checking */
553    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
554        return NULL;
555    }
556
557    /* allocate */
558    uni=NULL;
559    if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) {
560        if(U_ALIGNMENT_OFFSET(stackMem)==0) {
561            /* already aligned */
562            uni=(UNormIterator *)stackMem;
563        } else {
564            int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
565            if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
566                /* needs alignment */
567                uni=(UNormIterator *)((char *)stackMem+align);
568            }
569        }
570        /* else does not fit */
571    }
572
573    if(uni!=NULL) {
574        uni->isStackAllocated=TRUE;
575    } else {
576        uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
577        if(uni==NULL) {
578            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
579            return NULL;
580        }
581        uni->isStackAllocated=FALSE;
582    }
583
584    /*
585     * initialize
586     * do not memset because that would unnecessarily initialize the arrays
587     */
588    uni->iter=NULL;
589    uni->chars=uni->charsBuffer;
590    uni->states=uni->statesBuffer;
591    uni->capacity=INITIAL_CAPACITY;
592    uni->state=UITER_NO_STATE;
593    uni->hasPrevious=uni->hasNext=FALSE;
594    uni->mode=UNORM_NONE;
595
596    /* set a no-op iterator into the api */
597    uiter_setString(&uni->api, NULL, 0);
598    return uni;
599}
600
601U_CAPI void U_EXPORT2
602unorm_closeIter(UNormIterator *uni) {
603    if(uni!=NULL) {
604        if(uni->states!=uni->statesBuffer) {
605            /* chars and states are allocated in the same memory block */
606            uprv_free(uni->states);
607        }
608        if(!uni->isStackAllocated) {
609            uprv_free(uni);
610        }
611    }
612}
613
614U_CAPI UCharIterator * U_EXPORT2
615unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
616    /* argument checking */
617    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
618        return NULL;
619    }
620    if(uni==NULL) {
621        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
622        return NULL;
623    }
624    if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
625        mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
626    ) {
627        /* set a no-op iterator into the api */
628        uiter_setString(&uni->api, NULL, 0);
629        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
630        return NULL;
631    }
632
633    /* set the iterator and initialize */
634    uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
635
636    uni->iter=iter;
637    uni->mode=mode;
638
639    initIndexes(uni, iter);
640    uni->states[uni->api.limit]=uni->state=uiter_getState(iter);
641
642    return &uni->api;
643}
644
645#endif /* uconfig.h switches */
646