1/*
2*******************************************************************************
3* Copyright (C) 2010-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* utf16collationiterator.cpp
7*
8* created on: 2010oct27
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "charstr.h"
17#include "cmemory.h"
18#include "collation.h"
19#include "collationdata.h"
20#include "collationfcd.h"
21#include "collationiterator.h"
22#include "normalizer2impl.h"
23#include "uassert.h"
24#include "utf16collationiterator.h"
25
26U_NAMESPACE_BEGIN
27
28UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
29                                               const UChar *newText)
30        : CollationIterator(other),
31          start(newText),
32          pos(newText + (other.pos - other.start)),
33          limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) {
34}
35
36UTF16CollationIterator::~UTF16CollationIterator() {}
37
38UBool
39UTF16CollationIterator::operator==(const CollationIterator &other) const {
40    if(!CollationIterator::operator==(other)) { return FALSE; }
41    const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
42    // Compare the iterator state but not the text: Assume that the caller does that.
43    return (pos - start) == (o.pos - o.start);
44}
45
46void
47UTF16CollationIterator::resetToOffset(int32_t newOffset) {
48    reset();
49    pos = start + newOffset;
50}
51
52int32_t
53UTF16CollationIterator::getOffset() const {
54    return (int32_t)(pos - start);
55}
56
57uint32_t
58UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
59    if(pos == limit) {
60        c = U_SENTINEL;
61        return Collation::FALLBACK_CE32;
62    }
63    c = *pos++;
64    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
65}
66
67UChar
68UTF16CollationIterator::handleGetTrailSurrogate() {
69    if(pos == limit) { return 0; }
70    UChar trail;
71    if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
72    return trail;
73}
74
75UBool
76UTF16CollationIterator::foundNULTerminator() {
77    if(limit == NULL) {
78        limit = --pos;
79        return TRUE;
80    } else {
81        return FALSE;
82    }
83}
84
85UChar32
86UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
87    if(pos == limit) {
88        return U_SENTINEL;
89    }
90    UChar32 c = *pos;
91    if(c == 0 && limit == NULL) {
92        limit = pos;
93        return U_SENTINEL;
94    }
95    ++pos;
96    UChar trail;
97    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
98        ++pos;
99        return U16_GET_SUPPLEMENTARY(c, trail);
100    } else {
101        return c;
102    }
103}
104
105UChar32
106UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
107    if(pos == start) {
108        return U_SENTINEL;
109    }
110    UChar32 c = *--pos;
111    UChar lead;
112    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
113        --pos;
114        return U16_GET_SUPPLEMENTARY(lead, c);
115    } else {
116        return c;
117    }
118}
119
120void
121UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
122    while(num > 0 && pos != limit) {
123        UChar32 c = *pos;
124        if(c == 0 && limit == NULL) {
125            limit = pos;
126            break;
127        }
128        ++pos;
129        --num;
130        if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
131            ++pos;
132        }
133    }
134}
135
136void
137UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
138    while(num > 0 && pos != start) {
139        UChar32 c = *--pos;
140        --num;
141        if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
142            --pos;
143        }
144    }
145}
146
147// FCDUTF16CollationIterator ----------------------------------------------- ***
148
149FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
150                                                     const UChar *newText)
151        : UTF16CollationIterator(other),
152          rawStart(newText),
153          segmentStart(newText + (other.segmentStart - other.rawStart)),
154          segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)),
155          rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)),
156          nfcImpl(other.nfcImpl),
157          normalized(other.normalized),
158          checkDir(other.checkDir) {
159    if(checkDir != 0 || other.start == other.segmentStart) {
160        start = newText + (other.start - other.rawStart);
161        pos = newText + (other.pos - other.rawStart);
162        limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart);
163    } else {
164        start = normalized.getBuffer();
165        pos = start + (other.pos - other.start);
166        limit = start + normalized.length();
167    }
168}
169
170FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
171
172UBool
173FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
174    // Skip the UTF16CollationIterator and call its parent.
175    if(!CollationIterator::operator==(other)) { return FALSE; }
176    const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
177    // Compare the iterator state but not the text: Assume that the caller does that.
178    if(checkDir != o.checkDir) { return FALSE; }
179    if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
180    if(checkDir != 0 || start == segmentStart) {
181        return (pos - rawStart) == (o.pos - o.rawStart);
182    } else {
183        return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
184                (pos - start) == (o.pos - o.start);
185    }
186}
187
188void
189FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
190    reset();
191    start = segmentStart = pos = rawStart + newOffset;
192    limit = rawLimit;
193    checkDir = 1;
194}
195
196int32_t
197FCDUTF16CollationIterator::getOffset() const {
198    if(checkDir != 0 || start == segmentStart) {
199        return (int32_t)(pos - rawStart);
200    } else if(pos == start) {
201        return (int32_t)(segmentStart - rawStart);
202    } else {
203        return (int32_t)(segmentLimit - rawStart);
204    }
205}
206
207uint32_t
208FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
209    for(;;) {
210        if(checkDir > 0) {
211            if(pos == limit) {
212                c = U_SENTINEL;
213                return Collation::FALLBACK_CE32;
214            }
215            c = *pos++;
216            if(CollationFCD::hasTccc(c)) {
217                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
218                        (pos != limit && CollationFCD::hasLccc(*pos))) {
219                    --pos;
220                    if(!nextSegment(errorCode)) {
221                        c = U_SENTINEL;
222                        return Collation::FALLBACK_CE32;
223                    }
224                    c = *pos++;
225                }
226            }
227            break;
228        } else if(checkDir == 0 && pos != limit) {
229            c = *pos++;
230            break;
231        } else {
232            switchToForward();
233        }
234    }
235    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
236}
237
238UBool
239FCDUTF16CollationIterator::foundNULTerminator() {
240    if(limit == NULL) {
241        limit = rawLimit = --pos;
242        return TRUE;
243    } else {
244        return FALSE;
245    }
246}
247
248UChar32
249FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
250    UChar32 c;
251    for(;;) {
252        if(checkDir > 0) {
253            if(pos == limit) {
254                return U_SENTINEL;
255            }
256            c = *pos++;
257            if(CollationFCD::hasTccc(c)) {
258                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
259                        (pos != limit && CollationFCD::hasLccc(*pos))) {
260                    --pos;
261                    if(!nextSegment(errorCode)) {
262                        return U_SENTINEL;
263                    }
264                    c = *pos++;
265                }
266            } else if(c == 0 && limit == NULL) {
267                limit = rawLimit = --pos;
268                return U_SENTINEL;
269            }
270            break;
271        } else if(checkDir == 0 && pos != limit) {
272            c = *pos++;
273            break;
274        } else {
275            switchToForward();
276        }
277    }
278    UChar trail;
279    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
280        ++pos;
281        return U16_GET_SUPPLEMENTARY(c, trail);
282    } else {
283        return c;
284    }
285}
286
287UChar32
288FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
289    UChar32 c;
290    for(;;) {
291        if(checkDir < 0) {
292            if(pos == start) {
293                return U_SENTINEL;
294            }
295            c = *--pos;
296            if(CollationFCD::hasLccc(c)) {
297                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
298                        (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
299                    ++pos;
300                    if(!previousSegment(errorCode)) {
301                        return U_SENTINEL;
302                    }
303                    c = *--pos;
304                }
305            }
306            break;
307        } else if(checkDir == 0 && pos != start) {
308            c = *--pos;
309            break;
310        } else {
311            switchToBackward();
312        }
313    }
314    UChar lead;
315    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
316        --pos;
317        return U16_GET_SUPPLEMENTARY(lead, c);
318    } else {
319        return c;
320    }
321}
322
323void
324FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
325    // Specify the class to avoid a virtual-function indirection.
326    // In Java, we would declare this class final.
327    while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
328        --num;
329    }
330}
331
332void
333FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
334    // Specify the class to avoid a virtual-function indirection.
335    // In Java, we would declare this class final.
336    while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
337        --num;
338    }
339}
340
341void
342FCDUTF16CollationIterator::switchToForward() {
343    U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
344    if(checkDir < 0) {
345        // Turn around from backward checking.
346        start = segmentStart = pos;
347        if(pos == segmentLimit) {
348            limit = rawLimit;
349            checkDir = 1;  // Check forward.
350        } else {  // pos < segmentLimit
351            checkDir = 0;  // Stay in FCD segment.
352        }
353    } else {
354        // Reached the end of the FCD segment.
355        if(start == segmentStart) {
356            // The input text segment is FCD, extend it forward.
357        } else {
358            // The input text segment needed to be normalized.
359            // Switch to checking forward from it.
360            pos = start = segmentStart = segmentLimit;
361            // Note: If this segment is at the end of the input text,
362            // then it might help to return FALSE to indicate that, so that
363            // we do not have to re-check and normalize when we turn around and go backwards.
364            // However, that would complicate the call sites for an optimization of an unusual case.
365        }
366        limit = rawLimit;
367        checkDir = 1;
368    }
369}
370
371UBool
372FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
373    if(U_FAILURE(errorCode)) { return FALSE; }
374    U_ASSERT(checkDir > 0 && pos != limit);
375    // The input text [segmentStart..pos[ passes the FCD check.
376    const UChar *p = pos;
377    uint8_t prevCC = 0;
378    for(;;) {
379        // Fetch the next character's fcd16 value.
380        const UChar *q = p;
381        uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
382        uint8_t leadCC = (uint8_t)(fcd16 >> 8);
383        if(leadCC == 0 && q != pos) {
384            // FCD boundary before the [q, p[ character.
385            limit = segmentLimit = q;
386            break;
387        }
388        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
389            // Fails FCD check. Find the next FCD boundary and normalize.
390            do {
391                q = p;
392            } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
393            if(!normalize(pos, q, errorCode)) { return FALSE; }
394            pos = start;
395            break;
396        }
397        prevCC = (uint8_t)fcd16;
398        if(p == rawLimit || prevCC == 0) {
399            // FCD boundary after the last character.
400            limit = segmentLimit = p;
401            break;
402        }
403    }
404    U_ASSERT(pos != limit);
405    checkDir = 0;
406    return TRUE;
407}
408
409void
410FCDUTF16CollationIterator::switchToBackward() {
411    U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
412    if(checkDir > 0) {
413        // Turn around from forward checking.
414        limit = segmentLimit = pos;
415        if(pos == segmentStart) {
416            start = rawStart;
417            checkDir = -1;  // Check backward.
418        } else {  // pos > segmentStart
419            checkDir = 0;  // Stay in FCD segment.
420        }
421    } else {
422        // Reached the start of the FCD segment.
423        if(start == segmentStart) {
424            // The input text segment is FCD, extend it backward.
425        } else {
426            // The input text segment needed to be normalized.
427            // Switch to checking backward from it.
428            pos = limit = segmentLimit = segmentStart;
429        }
430        start = rawStart;
431        checkDir = -1;
432    }
433}
434
435UBool
436FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
437    if(U_FAILURE(errorCode)) { return FALSE; }
438    U_ASSERT(checkDir < 0 && pos != start);
439    // The input text [pos..segmentLimit[ passes the FCD check.
440    const UChar *p = pos;
441    uint8_t nextCC = 0;
442    for(;;) {
443        // Fetch the previous character's fcd16 value.
444        const UChar *q = p;
445        uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
446        uint8_t trailCC = (uint8_t)fcd16;
447        if(trailCC == 0 && q != pos) {
448            // FCD boundary after the [p, q[ character.
449            start = segmentStart = q;
450            break;
451        }
452        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
453                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
454            // Fails FCD check. Find the previous FCD boundary and normalize.
455            do {
456                q = p;
457            } while(fcd16 > 0xff && p != rawStart &&
458                    (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
459            if(!normalize(q, pos, errorCode)) { return FALSE; }
460            pos = limit;
461            break;
462        }
463        nextCC = (uint8_t)(fcd16 >> 8);
464        if(p == rawStart || nextCC == 0) {
465            // FCD boundary before the following character.
466            start = segmentStart = p;
467            break;
468        }
469    }
470    U_ASSERT(pos != start);
471    checkDir = 0;
472    return TRUE;
473}
474
475UBool
476FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
477    // NFD without argument checking.
478    U_ASSERT(U_SUCCESS(errorCode));
479    nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
480    if(U_FAILURE(errorCode)) { return FALSE; }
481    // Switch collation processing into the FCD buffer
482    // with the result of normalizing [segmentStart, segmentLimit[.
483    segmentStart = from;
484    segmentLimit = to;
485    start = normalized.getBuffer();
486    limit = start + normalized.length();
487    return TRUE;
488}
489
490U_NAMESPACE_END
491
492#endif  // !UCONFIG_NO_COLLATION
493