1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  unorm_it.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003jan21
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/uiter.h"
22#include "unicode/unorm.h"
23#include "unicode/utf.h"
24#include "unorm_it.h"
25#include "cmemory.h"
26
27/* UNormIterator ------------------------------------------------------------ */
28
29enum {
30    INITIAL_CAPACITY=100
31};
32
33struct UNormIterator {
34    UCharIterator api;
35    UCharIterator *iter;
36
37    /*
38     * chars and states either use the static buffers
39     * or are allocated in the same memory block
40     *
41     * They are parallel arrays with states[] holding the getState() values
42     * from normalization boundaries, and UITER_NO_STATE in between.
43     */
44    UChar *chars;
45    uint32_t *states;
46
47    /*
48     * api.start: first valid character & state in the arrays
49     * api.index: current position
50     * api.limit: one past the last valid character in chars[], but states[limit] is valid
51     * capacity: length of allocated arrays
52     */
53    int32_t capacity;
54
55    /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
56    uint32_t state;
57
58    /* there are UChars available before start or after limit? */
59    UBool hasPrevious, hasNext, isStackAllocated;
60
61    UNormalizationMode mode;
62
63    UChar charsBuffer[INITIAL_CAPACITY];
64    uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
65};
66
67static void
68initIndexes(UNormIterator *uni, UCharIterator *iter) {
69    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
70    UCharIterator *api=&uni->api;
71
72    if(!iter->hasPrevious(iter)) {
73        /* set indexes to the beginning of the arrays */
74        api->start=api->index=api->limit=0;
75        uni->hasPrevious=FALSE;
76        uni->hasNext=iter->hasNext(iter);
77    } else if(!iter->hasNext(iter)) {
78        /* set indexes to the end of the arrays */
79        api->start=api->index=api->limit=uni->capacity;
80        uni->hasNext=FALSE;
81        uni->hasPrevious=iter->hasPrevious(iter);
82    } else {
83        /* set indexes into the middle of the arrays */
84        api->start=api->index=api->limit=uni->capacity/2;
85        uni->hasPrevious=uni->hasNext=TRUE;
86    }
87}
88
89static UBool
90reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
91    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
92    UCharIterator *api=&uni->api;
93
94    uint32_t *states;
95    UChar *chars;
96    int32_t start, limit;
97
98    states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2);
99    if(states==NULL) {
100        return FALSE;
101    }
102
103    chars=(UChar *)(states+(capacity+1));
104    uni->capacity=capacity;
105
106    start=api->start;
107    limit=api->limit;
108
109    if(addAtStart) {
110        /* copy old contents to the end of the new arrays */
111        int32_t delta;
112
113        delta=capacity-uni->capacity;
114        uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4);
115        uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4);
116
117        api->start=start+delta;
118        api->index+=delta;
119        api->limit=limit+delta;
120    } else {
121        /* copy old contents to the beginning of the new arrays */
122        uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4);
123        uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4);
124    }
125
126    uni->chars=chars;
127    uni->states=states;
128
129    return TRUE;
130}
131
132static void
133moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
134    /* move array contents up to make room */
135    int32_t srcIndex, destIndex, limit;
136
137    limit=api->limit;
138    srcIndex=delta;
139    if(srcIndex>api->start) {
140        /* look for a position in the arrays with a known state */
141        while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
142            ++srcIndex;
143        }
144    }
145
146    /* now actually move the array contents */
147    api->start=destIndex=0;
148    while(srcIndex<limit) {
149        chars[destIndex]=chars[srcIndex];
150        states[destIndex++]=states[srcIndex++];
151    }
152
153    /* copy states[limit] as well! */
154    states[destIndex]=states[srcIndex];
155
156    api->limit=destIndex;
157}
158
159static void
160moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
161    /* move array contents up to make room */
162    int32_t srcIndex, destIndex, start;
163
164    start=api->start;
165    destIndex=((UNormIterator *)api)->capacity;
166    srcIndex=destIndex-delta;
167    if(srcIndex<api->limit) {
168        /* look for a position in the arrays with a known state */
169        while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
170            --srcIndex;
171        }
172    }
173
174    /* now actually move the array contents */
175    api->limit=destIndex;
176
177    /* copy states[limit] as well! */
178    states[destIndex]=states[srcIndex];
179
180    while(srcIndex>start) {
181        chars[--destIndex]=chars[--srcIndex];
182        states[destIndex]=states[srcIndex];
183    }
184
185    api->start=destIndex;
186}
187
188/* normalize forward from the limit, assume hasNext is true */
189static UBool
190readNext(UNormIterator *uni, UCharIterator *iter) {
191    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
192    UCharIterator *api=&uni->api;
193
194    /* make capacity/4 room at the end of the arrays */
195    int32_t limit, capacity, room;
196    UErrorCode errorCode;
197
198    limit=api->limit;
199    capacity=uni->capacity;
200    room=capacity/4;
201    if(room>(capacity-limit)) {
202        /* move array contents to make room */
203        moveContentsTowardStart(api, uni->chars, uni->states, room);
204        api->index=limit=api->limit;
205        uni->hasPrevious=TRUE;
206    }
207
208    /* normalize starting from the limit position */
209    errorCode=U_ZERO_ERROR;
210    if(uni->state!=uni->states[limit]) {
211        uiter_setState(iter, uni->states[limit], &errorCode);
212        if(U_FAILURE(errorCode)) {
213            uni->state=UITER_NO_STATE;
214            uni->hasNext=FALSE;
215            return FALSE;
216        }
217    }
218
219    room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
220    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
221        if(room<=capacity) {
222            /* empty and re-use the arrays */
223            uni->states[0]=uni->states[limit];
224            api->start=api->index=api->limit=limit=0;
225            uni->hasPrevious=TRUE;
226        } else {
227            capacity+=room+100;
228            if(!reallocArrays(uni, capacity, FALSE)) {
229                uni->state=UITER_NO_STATE;
230                uni->hasNext=FALSE;
231                return FALSE;
232            }
233            limit=api->limit;
234        }
235
236        errorCode=U_ZERO_ERROR;
237        uiter_setState(iter, uni->states[limit], &errorCode);
238        room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
239    }
240    if(U_FAILURE(errorCode) || room==0) {
241        uni->state=UITER_NO_STATE;
242        uni->hasNext=FALSE;
243        return FALSE;
244    }
245
246    /* room>0 */
247    ++limit; /* leave the known states[limit] alone */
248    for(--room; room>0; --room) {
249        /* set unknown states for all but the normalization boundaries */
250        uni->states[limit++]=UITER_NO_STATE;
251    }
252    uni->states[limit]=uni->state=uiter_getState(iter);
253    uni->hasNext=iter->hasNext(iter);
254    api->limit=limit;
255    return TRUE;
256}
257
258/* normalize backward from the start, assume hasPrevious is true */
259static UBool
260readPrevious(UNormIterator *uni, UCharIterator *iter) {
261    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
262    UCharIterator *api=&uni->api;
263
264    /* make capacity/4 room at the start of the arrays */
265    int32_t start, capacity, room;
266    UErrorCode errorCode;
267
268    start=api->start;
269    capacity=uni->capacity;
270    room=capacity/4;
271    if(room>start) {
272        /* move array contents to make room */
273        moveContentsTowardEnd(api, uni->chars, uni->states, room);
274        api->index=start=api->start;
275        uni->hasNext=TRUE;
276    }
277
278    /* normalize ending at the start position */
279    errorCode=U_ZERO_ERROR;
280    if(uni->state!=uni->states[start]) {
281        uiter_setState(iter, uni->states[start], &errorCode);
282        if(U_FAILURE(errorCode)) {
283            uni->state=UITER_NO_STATE;
284            uni->hasPrevious=FALSE;
285            return FALSE;
286        }
287    }
288
289    room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
290    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
291        if(room<=capacity) {
292            /* empty and re-use the arrays */
293            uni->states[capacity]=uni->states[start];
294            api->start=api->index=api->limit=start=capacity;
295            uni->hasNext=TRUE;
296        } else {
297            capacity+=room+100;
298            if(!reallocArrays(uni, capacity, TRUE)) {
299                uni->state=UITER_NO_STATE;
300                uni->hasPrevious=FALSE;
301                return FALSE;
302            }
303            start=api->start;
304        }
305
306        errorCode=U_ZERO_ERROR;
307        uiter_setState(iter, uni->states[start], &errorCode);
308        room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
309    }
310    if(U_FAILURE(errorCode) || room==0) {
311        uni->state=UITER_NO_STATE;
312        uni->hasPrevious=FALSE;
313        return FALSE;
314    }
315
316    /* room>0 */
317    do {
318        /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
319        uni->chars[--start]=uni->chars[--room];
320        /* set unknown states for all but the normalization boundaries */
321        uni->states[start]=UITER_NO_STATE;
322    } while(room>0);
323    uni->states[start]=uni->state=uiter_getState(iter);
324    uni->hasPrevious=iter->hasPrevious(iter);
325    api->start=start;
326    return TRUE;
327}
328
329/* Iterator runtime API functions ------------------------------------------- */
330
331static int32_t U_CALLCONV
332unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
333    switch(origin) {
334    case UITER_ZERO:
335    case UITER_START:
336        return 0;
337    case UITER_CURRENT:
338    case UITER_LIMIT:
339    case UITER_LENGTH:
340        return UITER_UNKNOWN_INDEX;
341    default:
342        /* not a valid origin */
343        /* Should never get here! */
344        return -1;
345    }
346}
347
348static int32_t U_CALLCONV
349unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
350    UNormIterator *uni=(UNormIterator *)api;
351    UCharIterator *iter=uni->iter;
352    int32_t pos;
353
354    switch(origin) {
355    case UITER_ZERO:
356    case UITER_START:
357        /* restart from the beginning */
358        if(uni->hasPrevious) {
359            iter->move(iter, 0, UITER_START);
360            api->start=api->index=api->limit=0;
361            uni->states[api->limit]=uni->state=uiter_getState(iter);
362            uni->hasPrevious=FALSE;
363            uni->hasNext=iter->hasNext(iter);
364        } else {
365            /* we already have the beginning of the normalized text */
366            api->index=api->start;
367        }
368        break;
369    case UITER_CURRENT:
370        break;
371    case UITER_LIMIT:
372    case UITER_LENGTH:
373        /* restart from the end */
374        if(uni->hasNext) {
375            iter->move(iter, 0, UITER_LIMIT);
376            api->start=api->index=api->limit=uni->capacity;
377            uni->states[api->limit]=uni->state=uiter_getState(iter);
378            uni->hasPrevious=iter->hasPrevious(iter);
379            uni->hasNext=FALSE;
380        } else {
381            /* we already have the end of the normalized text */
382            api->index=api->limit;
383        }
384        break;
385    default:
386        return -1;  /* Error */
387    }
388
389    /* move relative to the current position by delta normalized UChars */
390    if(delta==0) {
391        /* nothing to do */
392    } else if(delta>0) {
393        /* go forward until the requested position is in the buffer */
394        for(;;) {
395            pos=api->index+delta;   /* requested position */
396            delta=pos-api->limit;   /* remainder beyond buffered text */
397            if(delta<=0) {
398                api->index=pos;     /* position reached */
399                break;
400            }
401
402            /* go to end of buffer and normalize further */
403            api->index=api->limit;
404            if(!uni->hasNext || !readNext(uni, iter)) {
405                break;              /* reached end of text */
406            }
407        }
408    } else /* delta<0 */ {
409        /* go backward until the requested position is in the buffer */
410        for(;;) {
411            pos=api->index+delta;   /* requested position */
412            delta=pos-api->start;   /* remainder beyond buffered text */
413            if(delta>=0) {
414                api->index=pos;     /* position reached */
415                break;
416            }
417
418            /* go to start of buffer and normalize further */
419            api->index=api->start;
420            if(!uni->hasPrevious || !readPrevious(uni, iter)) {
421                break;              /* reached start of text */
422            }
423        }
424    }
425
426    if(api->index==api->start && !uni->hasPrevious) {
427        return 0;
428    } else {
429        return UITER_UNKNOWN_INDEX;
430    }
431}
432
433static UBool U_CALLCONV
434unormIteratorHasNext(UCharIterator *api) {
435    return api->index<api->limit || ((UNormIterator *)api)->hasNext;
436}
437
438static UBool U_CALLCONV
439unormIteratorHasPrevious(UCharIterator *api) {
440    return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
441}
442
443static UChar32 U_CALLCONV
444unormIteratorCurrent(UCharIterator *api) {
445    UNormIterator *uni=(UNormIterator *)api;
446
447    if( api->index<api->limit ||
448        (uni->hasNext && readNext(uni, uni->iter))
449    ) {
450        return uni->chars[api->index];
451    } else {
452        return U_SENTINEL;
453    }
454}
455
456static UChar32 U_CALLCONV
457unormIteratorNext(UCharIterator *api) {
458    UNormIterator *uni=(UNormIterator *)api;
459
460    if( api->index<api->limit ||
461        (uni->hasNext && readNext(uni, uni->iter))
462    ) {
463        return uni->chars[api->index++];
464    } else {
465        return U_SENTINEL;
466    }
467}
468
469static UChar32 U_CALLCONV
470unormIteratorPrevious(UCharIterator *api) {
471    UNormIterator *uni=(UNormIterator *)api;
472
473    if( api->index>api->start ||
474        (uni->hasPrevious && readPrevious(uni, uni->iter))
475    ) {
476        return uni->chars[--api->index];
477    } else {
478        return U_SENTINEL;
479    }
480}
481
482static uint32_t U_CALLCONV
483unormIteratorGetState(const UCharIterator *api) {
484    /* not uni->state because that may not be at api->index */
485    return ((UNormIterator *)api)->states[api->index];
486}
487
488static void U_CALLCONV
489unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
490    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
491        /* do nothing */
492    } else if(api==NULL) {
493        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
494    } else if(state==UITER_NO_STATE) {
495        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
496    } else {
497        UNormIterator *uni=(UNormIterator *)api;
498        UCharIterator *iter=((UNormIterator *)api)->iter;
499        if(state!=uni->state) {
500            uni->state=state;
501            uiter_setState(iter, state, pErrorCode);
502        }
503
504        /*
505         * Try shortcuts: If the requested state is in the array contents
506         * then just set the index there.
507         *
508         * We assume that the state is unique per position!
509         */
510        if(state==uni->states[api->index]) {
511            return;
512        } else if(state==uni->states[api->limit]) {
513            api->index=api->limit;
514            return;
515        } else {
516            /* search for the index with this state */
517            int32_t i;
518
519            for(i=api->start; i<api->limit; ++i) {
520                if(state==uni->states[i]) {
521                    api->index=i;
522                    return;
523                }
524            }
525        }
526
527        /* there is no array index for this state, reset for fresh contents */
528        initIndexes((UNormIterator *)api, iter);
529        uni->states[api->limit]=state;
530    }
531}
532
533static const UCharIterator unormIterator={
534    NULL, 0, 0, 0, 0, 0,
535    unormIteratorGetIndex,
536    unormIteratorMove,
537    unormIteratorHasNext,
538    unormIteratorHasPrevious,
539    unormIteratorCurrent,
540    unormIteratorNext,
541    unormIteratorPrevious,
542    NULL,
543    unormIteratorGetState,
544    unormIteratorSetState
545};
546
547/* Setup functions ---------------------------------------------------------- */
548
549U_CAPI UNormIterator * U_EXPORT2
550unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
551    UNormIterator *uni;
552
553    /* argument checking */
554    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
555        return NULL;
556    }
557
558    /* allocate */
559    uni=NULL;
560    if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) {
561        if(U_ALIGNMENT_OFFSET(stackMem)==0) {
562            /* already aligned */
563            uni=(UNormIterator *)stackMem;
564        } else {
565            int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
566            if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
567                /* needs alignment */
568                uni=(UNormIterator *)((char *)stackMem+align);
569            }
570        }
571        /* else does not fit */
572    }
573
574    if(uni!=NULL) {
575        uni->isStackAllocated=TRUE;
576    } else {
577        uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
578        if(uni==NULL) {
579            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
580            return NULL;
581        }
582        uni->isStackAllocated=FALSE;
583    }
584
585    /*
586     * initialize
587     * do not memset because that would unnecessarily initialize the arrays
588     */
589    uni->iter=NULL;
590    uni->chars=uni->charsBuffer;
591    uni->states=uni->statesBuffer;
592    uni->capacity=INITIAL_CAPACITY;
593    uni->state=UITER_NO_STATE;
594    uni->hasPrevious=uni->hasNext=FALSE;
595    uni->mode=UNORM_NONE;
596
597    /* set a no-op iterator into the api */
598    uiter_setString(&uni->api, NULL, 0);
599    return uni;
600}
601
602U_CAPI void U_EXPORT2
603unorm_closeIter(UNormIterator *uni) {
604    if(uni!=NULL) {
605        if(uni->states!=uni->statesBuffer) {
606            /* chars and states are allocated in the same memory block */
607            uprv_free(uni->states);
608        }
609        if(!uni->isStackAllocated) {
610            uprv_free(uni);
611        }
612    }
613}
614
615U_CAPI UCharIterator * U_EXPORT2
616unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
617    /* argument checking */
618    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
619        return NULL;
620    }
621    if(uni==NULL) {
622        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
623        return NULL;
624    }
625    if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
626        mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
627    ) {
628        /* set a no-op iterator into the api */
629        uiter_setString(&uni->api, NULL, 0);
630        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
631        return NULL;
632    }
633
634    /* set the iterator and initialize */
635    uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
636
637    uni->iter=iter;
638    uni->mode=mode;
639
640    initIndexes(uni, iter);
641    uni->states[uni->api.limit]=uni->state=uiter_getState(iter);
642
643    return &uni->api;
644}
645
646#endif /* uconfig.h switches */
647