1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2014, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* utf8collationiterator.cpp
9*
10* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/utf8.h"
19#include "charstr.h"
20#include "cmemory.h"
21#include "collation.h"
22#include "collationdata.h"
23#include "collationfcd.h"
24#include "collationiterator.h"
25#include "normalizer2impl.h"
26#include "uassert.h"
27#include "utf8collationiterator.h"
28
29U_NAMESPACE_BEGIN
30
31UTF8CollationIterator::~UTF8CollationIterator() {}
32
33void
34UTF8CollationIterator::resetToOffset(int32_t newOffset) {
35    reset();
36    pos = newOffset;
37}
38
39int32_t
40UTF8CollationIterator::getOffset() const {
41    return pos;
42}
43
44uint32_t
45UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
46    if(pos == length) {
47        c = U_SENTINEL;
48        return Collation::FALLBACK_CE32;
49    }
50    // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
51    c = u8[pos++];
52    if(c < 0xc0) {
53        // ASCII 00..7F; trail bytes 80..BF map to error values.
54        return trie->data32[c];
55    }
56    uint8_t t1, t2;
57    if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
58        // U+0080..U+07FF; 00..7F map to error values.
59        uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
60        c = ((c & 0x1f) << 6) | t1;
61        ++pos;
62        return ce32;
63    } else if(c <= 0xef &&
64              ((pos + 1) < length || length < 0) &&
65              (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
66              (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
67    ) {
68        // U+0800..U+FFFF; caller maps surrogates to error values.
69        c = (UChar)((c << 12) | (t1 << 6) | t2);
70        pos += 2;
71        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
72    } else {
73        // Function call for supplementary code points and error cases.
74        // Illegal byte sequences yield U+FFFD.
75        c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
76        return data->getCE32(c);
77    }
78}
79
80UBool
81UTF8CollationIterator::foundNULTerminator() {
82    if(length < 0) {
83        length = --pos;
84        return TRUE;
85    } else {
86        return FALSE;
87    }
88}
89
90UBool
91UTF8CollationIterator::forbidSurrogateCodePoints() const {
92    return TRUE;
93}
94
95UChar32
96UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
97    if(pos == length) {
98        return U_SENTINEL;
99    }
100    if(u8[pos] == 0 && length < 0) {
101        length = pos;
102        return U_SENTINEL;
103    }
104    UChar32 c;
105    U8_NEXT_OR_FFFD(u8, pos, length, c);
106    return c;
107}
108
109UChar32
110UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
111    if(pos == 0) {
112        return U_SENTINEL;
113    }
114    UChar32 c;
115    U8_PREV_OR_FFFD(u8, 0, pos, c);
116    return c;
117}
118
119void
120UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
121    U8_FWD_N(u8, pos, length, num);
122}
123
124void
125UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
126    U8_BACK_N(u8, 0, pos, num);
127}
128
129// FCDUTF8CollationIterator ------------------------------------------------ ***
130
131FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
132
133void
134FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
135    reset();
136    start = pos = newOffset;
137    state = CHECK_FWD;
138}
139
140int32_t
141FCDUTF8CollationIterator::getOffset() const {
142    if(state != IN_NORMALIZED) {
143        return pos;
144    } else if(pos == 0) {
145        return start;
146    } else {
147        return limit;
148    }
149}
150
151uint32_t
152FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
153    for(;;) {
154        if(state == CHECK_FWD) {
155            // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
156            if(pos == length) {
157                c = U_SENTINEL;
158                return Collation::FALLBACK_CE32;
159            }
160            c = u8[pos++];
161            if(c < 0xc0) {
162                // ASCII 00..7F; trail bytes 80..BF map to error values.
163                return trie->data32[c];
164            }
165            uint8_t t1, t2;
166            if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
167                // U+0080..U+07FF; 00..7F map to error values.
168                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
169                c = ((c & 0x1f) << 6) | t1;
170                ++pos;
171                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
172                    pos -= 2;
173                } else {
174                    return ce32;
175                }
176            } else if(c <= 0xef &&
177                      ((pos + 1) < length || length < 0) &&
178                      (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
179                      (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
180            ) {
181                // U+0800..U+FFFF; caller maps surrogates to error values.
182                c = (UChar)((c << 12) | (t1 << 6) | t2);
183                pos += 2;
184                if(CollationFCD::hasTccc(c) &&
185                        (CollationFCD::maybeTibetanCompositeVowel(c) ||
186                            (pos != length && nextHasLccc()))) {
187                    pos -= 3;
188                } else {
189                    break;  // return CE32(BMP)
190                }
191            } else {
192                // Function call for supplementary code points and error cases.
193                // Illegal byte sequences yield U+FFFD.
194                c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
195                if(c == 0xfffd) {
196                    return Collation::FFFD_CE32;
197                } else {
198                    U_ASSERT(c > 0xffff);
199                    if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
200                        pos -= 4;
201                    } else {
202                        return data->getCE32FromSupplementary(c);
203                    }
204                }
205            }
206            if(!nextSegment(errorCode)) {
207                c = U_SENTINEL;
208                return Collation::FALLBACK_CE32;
209            }
210            continue;
211        } else if(state == IN_FCD_SEGMENT && pos != limit) {
212            return UTF8CollationIterator::handleNextCE32(c, errorCode);
213        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
214            c = normalized[pos++];
215            break;
216        } else {
217            switchToForward();
218        }
219    }
220    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
221}
222
223UBool
224FCDUTF8CollationIterator::nextHasLccc() const {
225    U_ASSERT(state == CHECK_FWD && pos != length);
226    // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
227    // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
228    UChar32 c = u8[pos];
229    if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; }
230    int32_t i = pos;
231    U8_NEXT_OR_FFFD(u8, i, length, c);
232    if(c > 0xffff) { c = U16_LEAD(c); }
233    return CollationFCD::hasLccc(c);
234}
235
236UBool
237FCDUTF8CollationIterator::previousHasTccc() const {
238    U_ASSERT(state == CHECK_BWD && pos != 0);
239    UChar32 c = u8[pos - 1];
240    if(c < 0x80) { return FALSE; }
241    int32_t i = pos;
242    U8_PREV_OR_FFFD(u8, 0, i, c);
243    if(c > 0xffff) { c = U16_LEAD(c); }
244    return CollationFCD::hasTccc(c);
245}
246
247UChar
248FCDUTF8CollationIterator::handleGetTrailSurrogate() {
249    if(state != IN_NORMALIZED) { return 0; }
250    U_ASSERT(pos < normalized.length());
251    UChar trail;
252    if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
253    return trail;
254}
255
256UBool
257FCDUTF8CollationIterator::foundNULTerminator() {
258    if(state == CHECK_FWD && length < 0) {
259        length = --pos;
260        return TRUE;
261    } else {
262        return FALSE;
263    }
264}
265
266UChar32
267FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
268    UChar32 c;
269    for(;;) {
270        if(state == CHECK_FWD) {
271            if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
272                return U_SENTINEL;
273            }
274            if(c < 0x80) {
275                ++pos;
276                return c;
277            }
278            U8_NEXT_OR_FFFD(u8, pos, length, c);
279            if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
280                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
281                        (pos != length && nextHasLccc()))) {
282                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
283                // and we can use U8_LENGTH() rather than a previous-position variable.
284                pos -= U8_LENGTH(c);
285                if(!nextSegment(errorCode)) {
286                    return U_SENTINEL;
287                }
288                continue;
289            }
290            return c;
291        } else if(state == IN_FCD_SEGMENT && pos != limit) {
292            U8_NEXT_OR_FFFD(u8, pos, length, c);
293            return c;
294        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
295            c = normalized.char32At(pos);
296            pos += U16_LENGTH(c);
297            return c;
298        } else {
299            switchToForward();
300        }
301    }
302}
303
304UChar32
305FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
306    UChar32 c;
307    for(;;) {
308        if(state == CHECK_BWD) {
309            if(pos == 0) {
310                return U_SENTINEL;
311            }
312            if((c = u8[pos - 1]) < 0x80) {
313                --pos;
314                return c;
315            }
316            U8_PREV_OR_FFFD(u8, 0, pos, c);
317            if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
318                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
319                        (pos != 0 && previousHasTccc()))) {
320                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
321                // and we can use U8_LENGTH() rather than a previous-position variable.
322                pos += U8_LENGTH(c);
323                if(!previousSegment(errorCode)) {
324                    return U_SENTINEL;
325                }
326                continue;
327            }
328            return c;
329        } else if(state == IN_FCD_SEGMENT && pos != start) {
330            U8_PREV_OR_FFFD(u8, 0, pos, c);
331            return c;
332        } else if(state >= IN_NORMALIZED && pos != 0) {
333            c = normalized.char32At(pos - 1);
334            pos -= U16_LENGTH(c);
335            return c;
336        } else {
337            switchToBackward();
338        }
339    }
340}
341
342void
343FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
344    // Specify the class to avoid a virtual-function indirection.
345    // In Java, we would declare this class final.
346    while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {
347        --num;
348    }
349}
350
351void
352FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
353    // Specify the class to avoid a virtual-function indirection.
354    // In Java, we would declare this class final.
355    while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {
356        --num;
357    }
358}
359
360void
361FCDUTF8CollationIterator::switchToForward() {
362    U_ASSERT(state == CHECK_BWD ||
363             (state == IN_FCD_SEGMENT && pos == limit) ||
364             (state == IN_NORMALIZED && pos == normalized.length()));
365    if(state == CHECK_BWD) {
366        // Turn around from backward checking.
367        start = pos;
368        if(pos == limit) {
369            state = CHECK_FWD;  // Check forward.
370        } else {  // pos < limit
371            state = IN_FCD_SEGMENT;  // Stay in FCD segment.
372        }
373    } else {
374        // Reached the end of the FCD segment.
375        if(state == IN_FCD_SEGMENT) {
376            // The input text segment is FCD, extend it forward.
377        } else {
378            // The input text segment needed to be normalized.
379            // Switch to checking forward from it.
380            start = pos = limit;
381        }
382        state = CHECK_FWD;
383    }
384}
385
386UBool
387FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
388    if(U_FAILURE(errorCode)) { return FALSE; }
389    U_ASSERT(state == CHECK_FWD && pos != length);
390    // The input text [start..pos[ passes the FCD check.
391    int32_t segmentStart = pos;
392    // Collect the characters being checked, in case they need to be normalized.
393    UnicodeString s;
394    uint8_t prevCC = 0;
395    for(;;) {
396        // Fetch the next character and its fcd16 value.
397        int32_t cpStart = pos;
398        UChar32 c;
399        U8_NEXT_OR_FFFD(u8, pos, length, c);
400        uint16_t fcd16 = nfcImpl.getFCD16(c);
401        uint8_t leadCC = (uint8_t)(fcd16 >> 8);
402        if(leadCC == 0 && cpStart != segmentStart) {
403            // FCD boundary before this character.
404            pos = cpStart;
405            break;
406        }
407        s.append(c);
408        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
409            // Fails FCD check. Find the next FCD boundary and normalize.
410            while(pos != length) {
411                cpStart = pos;
412                U8_NEXT_OR_FFFD(u8, pos, length, c);
413                if(nfcImpl.getFCD16(c) <= 0xff) {
414                    pos = cpStart;
415                    break;
416                }
417                s.append(c);
418            }
419            if(!normalize(s, errorCode)) { return FALSE; }
420            start = segmentStart;
421            limit = pos;
422            state = IN_NORMALIZED;
423            pos = 0;
424            return TRUE;
425        }
426        prevCC = (uint8_t)fcd16;
427        if(pos == length || prevCC == 0) {
428            // FCD boundary after the last character.
429            break;
430        }
431    }
432    limit = pos;
433    pos = segmentStart;
434    U_ASSERT(pos != limit);
435    state = IN_FCD_SEGMENT;
436    return TRUE;
437}
438
439void
440FCDUTF8CollationIterator::switchToBackward() {
441    U_ASSERT(state == CHECK_FWD ||
442             (state == IN_FCD_SEGMENT && pos == start) ||
443             (state >= IN_NORMALIZED && pos == 0));
444    if(state == CHECK_FWD) {
445        // Turn around from forward checking.
446        limit = pos;
447        if(pos == start) {
448            state = CHECK_BWD;  // Check backward.
449        } else {  // pos > start
450            state = IN_FCD_SEGMENT;  // Stay in FCD segment.
451        }
452    } else {
453        // Reached the start of the FCD segment.
454        if(state == IN_FCD_SEGMENT) {
455            // The input text segment is FCD, extend it backward.
456        } else {
457            // The input text segment needed to be normalized.
458            // Switch to checking backward from it.
459            limit = pos = start;
460        }
461        state = CHECK_BWD;
462    }
463}
464
465UBool
466FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
467    if(U_FAILURE(errorCode)) { return FALSE; }
468    U_ASSERT(state == CHECK_BWD && pos != 0);
469    // The input text [pos..limit[ passes the FCD check.
470    int32_t segmentLimit = pos;
471    // Collect the characters being checked, in case they need to be normalized.
472    UnicodeString s;
473    uint8_t nextCC = 0;
474    for(;;) {
475        // Fetch the previous character and its fcd16 value.
476        int32_t cpLimit = pos;
477        UChar32 c;
478        U8_PREV_OR_FFFD(u8, 0, pos, c);
479        uint16_t fcd16 = nfcImpl.getFCD16(c);
480        uint8_t trailCC = (uint8_t)fcd16;
481        if(trailCC == 0 && cpLimit != segmentLimit) {
482            // FCD boundary after this character.
483            pos = cpLimit;
484            break;
485        }
486        s.append(c);
487        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
488                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
489            // Fails FCD check. Find the previous FCD boundary and normalize.
490            while(fcd16 > 0xff && pos != 0) {
491                cpLimit = pos;
492                U8_PREV_OR_FFFD(u8, 0, pos, c);
493                fcd16 = nfcImpl.getFCD16(c);
494                if(fcd16 == 0) {
495                    pos = cpLimit;
496                    break;
497                }
498                s.append(c);
499            }
500            s.reverse();
501            if(!normalize(s, errorCode)) { return FALSE; }
502            limit = segmentLimit;
503            start = pos;
504            state = IN_NORMALIZED;
505            pos = normalized.length();
506            return TRUE;
507        }
508        nextCC = (uint8_t)(fcd16 >> 8);
509        if(pos == 0 || nextCC == 0) {
510            // FCD boundary before the following character.
511            break;
512        }
513    }
514    start = pos;
515    pos = segmentLimit;
516    U_ASSERT(pos != start);
517    state = IN_FCD_SEGMENT;
518    return TRUE;
519}
520
521UBool
522FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
523    // NFD without argument checking.
524    U_ASSERT(U_SUCCESS(errorCode));
525    nfcImpl.decompose(s, normalized, errorCode);
526    return U_SUCCESS(errorCode);
527}
528
529U_NAMESPACE_END
530
531#endif  // !UCONFIG_NO_COLLATION
532