1/*
2*******************************************************************************
3* Copyright (C) 2012-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* uitercollationiterator.cpp
7*
8* created on: 2012sep23 (from utf16collationiterator.cpp)
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/uiter.h"
17#include "charstr.h"
18#include "cmemory.h"
19#include "collation.h"
20#include "collationdata.h"
21#include "collationfcd.h"
22#include "collationiterator.h"
23#include "normalizer2impl.h"
24#include "uassert.h"
25#include "uitercollationiterator.h"
26
27U_NAMESPACE_BEGIN
28
29UIterCollationIterator::~UIterCollationIterator() {}
30
31void
32UIterCollationIterator::resetToOffset(int32_t newOffset) {
33    reset();
34    iter.move(&iter, newOffset, UITER_START);
35}
36
37int32_t
38UIterCollationIterator::getOffset() const {
39    return iter.getIndex(&iter, UITER_CURRENT);
40}
41
42uint32_t
43UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
44    c = iter.next(&iter);
45    if(c < 0) {
46        return Collation::FALLBACK_CE32;
47    }
48    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
49}
50
51UChar
52UIterCollationIterator::handleGetTrailSurrogate() {
53    UChar32 trail = iter.next(&iter);
54    if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
55    return (UChar)trail;
56}
57
58UChar32
59UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
60    return uiter_next32(&iter);
61}
62
63UChar32
64UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
65    return uiter_previous32(&iter);
66}
67
68void
69UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
70    while(num > 0 && (uiter_next32(&iter)) >= 0) {
71        --num;
72    }
73}
74
75void
76UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
77    while(num > 0 && (uiter_previous32(&iter)) >= 0) {
78        --num;
79    }
80}
81
82// FCDUIterCollationIterator ----------------------------------------------- ***
83
84FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
85
86void
87FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
88    UIterCollationIterator::resetToOffset(newOffset);
89    start = newOffset;
90    state = ITER_CHECK_FWD;
91}
92
93int32_t
94FCDUIterCollationIterator::getOffset() const {
95    if(state <= ITER_CHECK_BWD) {
96        return iter.getIndex(&iter, UITER_CURRENT);
97    } else if(state == ITER_IN_FCD_SEGMENT) {
98        return pos;
99    } else if(pos == 0) {
100        return start;
101    } else {
102        return limit;
103    }
104}
105
106uint32_t
107FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
108    for(;;) {
109        if(state == ITER_CHECK_FWD) {
110            c = iter.next(&iter);
111            if(c < 0) {
112                return Collation::FALLBACK_CE32;
113            }
114            if(CollationFCD::hasTccc(c)) {
115                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
116                        CollationFCD::hasLccc(iter.current(&iter))) {
117                    iter.previous(&iter);
118                    if(!nextSegment(errorCode)) {
119                        c = U_SENTINEL;
120                        return Collation::FALLBACK_CE32;
121                    }
122                    continue;
123                }
124            }
125            break;
126        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
127            c = iter.next(&iter);
128            ++pos;
129            U_ASSERT(c >= 0);
130            break;
131        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
132            c = normalized[pos++];
133            break;
134        } else {
135            switchToForward();
136        }
137    }
138    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
139}
140
141UChar
142FCDUIterCollationIterator::handleGetTrailSurrogate() {
143    if(state <= ITER_IN_FCD_SEGMENT) {
144        UChar32 trail = iter.next(&iter);
145        if(U16_IS_TRAIL(trail)) {
146            if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
147        } else if(trail >= 0) {
148            iter.previous(&iter);
149        }
150        return (UChar)trail;
151    } else {
152        U_ASSERT(pos < normalized.length());
153        UChar trail;
154        if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
155        return trail;
156    }
157}
158
159UChar32
160FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
161    UChar32 c;
162    for(;;) {
163        if(state == ITER_CHECK_FWD) {
164            c = iter.next(&iter);
165            if(c < 0) {
166                return c;
167            }
168            if(CollationFCD::hasTccc(c)) {
169                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
170                        CollationFCD::hasLccc(iter.current(&iter))) {
171                    iter.previous(&iter);
172                    if(!nextSegment(errorCode)) {
173                        return U_SENTINEL;
174                    }
175                    continue;
176                }
177            }
178            if(U16_IS_LEAD(c)) {
179                UChar32 trail = iter.next(&iter);
180                if(U16_IS_TRAIL(trail)) {
181                    return U16_GET_SUPPLEMENTARY(c, trail);
182                } else if(trail >= 0) {
183                    iter.previous(&iter);
184                }
185            }
186            return c;
187        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
188            c = uiter_next32(&iter);
189            pos += U16_LENGTH(c);
190            U_ASSERT(c >= 0);
191            return c;
192        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
193            c = normalized.char32At(pos);
194            pos += U16_LENGTH(c);
195            return c;
196        } else {
197            switchToForward();
198        }
199    }
200}
201
202UChar32
203FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
204    UChar32 c;
205    for(;;) {
206        if(state == ITER_CHECK_BWD) {
207            c = iter.previous(&iter);
208            if(c < 0) {
209                start = pos = 0;
210                state = ITER_IN_FCD_SEGMENT;
211                return U_SENTINEL;
212            }
213            if(CollationFCD::hasLccc(c)) {
214                UChar32 prev = U_SENTINEL;
215                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
216                        CollationFCD::hasTccc(prev = iter.previous(&iter))) {
217                    iter.next(&iter);
218                    if(prev >= 0) {
219                        iter.next(&iter);
220                    }
221                    if(!previousSegment(errorCode)) {
222                        return U_SENTINEL;
223                    }
224                    continue;
225                }
226                // hasLccc(trail)=true for all trail surrogates
227                if(U16_IS_TRAIL(c)) {
228                    if(prev < 0) {
229                        prev = iter.previous(&iter);
230                    }
231                    if(U16_IS_LEAD(prev)) {
232                        return U16_GET_SUPPLEMENTARY(prev, c);
233                    }
234                }
235                if(prev >= 0) {
236                    iter.next(&iter);
237                }
238            }
239            return c;
240        } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
241            c = uiter_previous32(&iter);
242            pos -= U16_LENGTH(c);
243            U_ASSERT(c >= 0);
244            return c;
245        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
246            c = normalized.char32At(pos - 1);
247            pos -= U16_LENGTH(c);
248            return c;
249        } else {
250            switchToBackward();
251        }
252    }
253}
254
255void
256FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
257    // Specify the class to avoid a virtual-function indirection.
258    // In Java, we would declare this class final.
259    while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
260        --num;
261    }
262}
263
264void
265FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
266    // Specify the class to avoid a virtual-function indirection.
267    // In Java, we would declare this class final.
268    while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
269        --num;
270    }
271}
272
273void
274FCDUIterCollationIterator::switchToForward() {
275    U_ASSERT(state == ITER_CHECK_BWD ||
276             (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
277             (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
278    if(state == ITER_CHECK_BWD) {
279        // Turn around from backward checking.
280        start = pos = iter.getIndex(&iter, UITER_CURRENT);
281        if(pos == limit) {
282            state = ITER_CHECK_FWD;  // Check forward.
283        } else {  // pos < limit
284            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
285        }
286    } else {
287        // Reached the end of the FCD segment.
288        if(state == ITER_IN_FCD_SEGMENT) {
289            // The input text segment is FCD, extend it forward.
290        } else {
291            // The input text segment needed to be normalized.
292            // Switch to checking forward from it.
293            if(state == IN_NORM_ITER_AT_START) {
294                iter.move(&iter, limit - start, UITER_CURRENT);
295            }
296            start = limit;
297        }
298        state = ITER_CHECK_FWD;
299    }
300}
301
302UBool
303FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
304    if(U_FAILURE(errorCode)) { return FALSE; }
305    U_ASSERT(state == ITER_CHECK_FWD);
306    // The input text [start..(iter index)[ passes the FCD check.
307    pos = iter.getIndex(&iter, UITER_CURRENT);
308    // Collect the characters being checked, in case they need to be normalized.
309    UnicodeString s;
310    uint8_t prevCC = 0;
311    for(;;) {
312        // Fetch the next character and its fcd16 value.
313        UChar32 c = uiter_next32(&iter);
314        if(c < 0) { break; }
315        uint16_t fcd16 = nfcImpl.getFCD16(c);
316        uint8_t leadCC = (uint8_t)(fcd16 >> 8);
317        if(leadCC == 0 && !s.isEmpty()) {
318            // FCD boundary before this character.
319            uiter_previous32(&iter);
320            break;
321        }
322        s.append(c);
323        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
324            // Fails FCD check. Find the next FCD boundary and normalize.
325            for(;;) {
326                c = uiter_next32(&iter);
327                if(c < 0) { break; }
328                if(nfcImpl.getFCD16(c) <= 0xff) {
329                    uiter_previous32(&iter);
330                    break;
331                }
332                s.append(c);
333            }
334            if(!normalize(s, errorCode)) { return FALSE; }
335            start = pos;
336            limit = pos + s.length();
337            state = IN_NORM_ITER_AT_LIMIT;
338            pos = 0;
339            return TRUE;
340        }
341        prevCC = (uint8_t)fcd16;
342        if(prevCC == 0) {
343            // FCD boundary after the last character.
344            break;
345        }
346    }
347    limit = pos + s.length();
348    U_ASSERT(pos != limit);
349    iter.move(&iter, -s.length(), UITER_CURRENT);
350    state = ITER_IN_FCD_SEGMENT;
351    return TRUE;
352}
353
354void
355FCDUIterCollationIterator::switchToBackward() {
356    U_ASSERT(state == ITER_CHECK_FWD ||
357             (state == ITER_IN_FCD_SEGMENT && pos == start) ||
358             (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
359    if(state == ITER_CHECK_FWD) {
360        // Turn around from forward checking.
361        limit = pos = iter.getIndex(&iter, UITER_CURRENT);
362        if(pos == start) {
363            state = ITER_CHECK_BWD;  // Check backward.
364        } else {  // pos > start
365            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
366        }
367    } else {
368        // Reached the start of the FCD segment.
369        if(state == ITER_IN_FCD_SEGMENT) {
370            // The input text segment is FCD, extend it backward.
371        } else {
372            // The input text segment needed to be normalized.
373            // Switch to checking backward from it.
374            if(state == IN_NORM_ITER_AT_LIMIT) {
375                iter.move(&iter, start - limit, UITER_CURRENT);
376            }
377            limit = start;
378        }
379        state = ITER_CHECK_BWD;
380    }
381}
382
383UBool
384FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
385    if(U_FAILURE(errorCode)) { return FALSE; }
386    U_ASSERT(state == ITER_CHECK_BWD);
387    // The input text [(iter index)..limit[ passes the FCD check.
388    pos = iter.getIndex(&iter, UITER_CURRENT);
389    // Collect the characters being checked, in case they need to be normalized.
390    UnicodeString s;
391    uint8_t nextCC = 0;
392    for(;;) {
393        // Fetch the previous character and its fcd16 value.
394        UChar32 c = uiter_previous32(&iter);
395        if(c < 0) { break; }
396        uint16_t fcd16 = nfcImpl.getFCD16(c);
397        uint8_t trailCC = (uint8_t)fcd16;
398        if(trailCC == 0 && !s.isEmpty()) {
399            // FCD boundary after this character.
400            uiter_next32(&iter);
401            break;
402        }
403        s.append(c);
404        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
405                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
406            // Fails FCD check. Find the previous FCD boundary and normalize.
407            while(fcd16 > 0xff) {
408                c = uiter_previous32(&iter);
409                if(c < 0) { break; }
410                fcd16 = nfcImpl.getFCD16(c);
411                if(fcd16 == 0) {
412                    (void)uiter_next32(&iter);
413                    break;
414                }
415                s.append(c);
416            }
417            s.reverse();
418            if(!normalize(s, errorCode)) { return FALSE; }
419            limit = pos;
420            start = pos - s.length();
421            state = IN_NORM_ITER_AT_START;
422            pos = normalized.length();
423            return TRUE;
424        }
425        nextCC = (uint8_t)(fcd16 >> 8);
426        if(nextCC == 0) {
427            // FCD boundary before the following character.
428            break;
429        }
430    }
431    start = pos - s.length();
432    U_ASSERT(pos != start);
433    iter.move(&iter, s.length(), UITER_CURRENT);
434    state = ITER_IN_FCD_SEGMENT;
435    return TRUE;
436}
437
438UBool
439FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
440    // NFD without argument checking.
441    U_ASSERT(U_SUCCESS(errorCode));
442    nfcImpl.decompose(s, normalized, errorCode);
443    return U_SUCCESS(errorCode);
444}
445
446U_NAMESPACE_END
447
448#endif  // !UCONFIG_NO_COLLATION
449