1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2012-2014, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
9*
10* C++ version created on: 2012sep23 (from utf16collationiterator.h)
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import com.ibm.icu.impl.Normalizer2Impl;
17import com.ibm.icu.text.UCharacterIterator;
18
19/**
20 * Incrementally checks the input text for FCD and normalizes where necessary.
21 */
22public final class FCDIterCollationIterator extends IterCollationIterator {
23    public FCDIterCollationIterator(CollationData data, boolean numeric,
24            UCharacterIterator ui, int startIndex) {
25        super(data, numeric, ui);
26        state = State.ITER_CHECK_FWD;
27        start = startIndex;
28        nfcImpl = data.nfcImpl;
29    }
30
31    @Override
32    public void resetToOffset(int newOffset) {
33        super.resetToOffset(newOffset);
34        start = newOffset;
35        state = State.ITER_CHECK_FWD;
36    }
37
38    @Override
39    public int getOffset() {
40        if(state.compareTo(State.ITER_CHECK_BWD) <= 0) {
41            return iter.getIndex();
42        } else if(state == State.ITER_IN_FCD_SEGMENT) {
43            return pos;
44        } else if(pos == 0) {
45            return start;
46        } else {
47            return limit;
48        }
49    }
50
51    @Override
52    public int nextCodePoint() {
53        int c;
54        for(;;) {
55            if(state == State.ITER_CHECK_FWD) {
56                c = iter.next();
57                if(c < 0) {
58                    return c;
59                }
60                if(CollationFCD.hasTccc(c)) {
61                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
62                            CollationFCD.hasLccc(iter.current())) {
63                        iter.previous();
64                        if(!nextSegment()) {
65                            return Collation.SENTINEL_CP;
66                        }
67                        continue;
68                    }
69                }
70                if(isLeadSurrogate(c)) {
71                    int trail = iter.next();
72                    if(isTrailSurrogate(trail)) {
73                        return Character.toCodePoint((char)c, (char)trail);
74                    } else if(trail >= 0) {
75                        iter.previous();
76                    }
77                }
78                return c;
79            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
80                c = iter.nextCodePoint();
81                pos += Character.charCount(c);
82                assert(c >= 0);
83                return c;
84            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
85                    pos != normalized.length()) {
86                c = normalized.codePointAt(pos);
87                pos += Character.charCount(c);
88                return c;
89            } else {
90                switchToForward();
91            }
92        }
93    }
94
95    @Override
96    public int previousCodePoint() {
97        int c;
98        for(;;) {
99            if(state == State.ITER_CHECK_BWD) {
100                c = iter.previous();
101                if(c < 0) {
102                    start = pos = 0;
103                    state = State.ITER_IN_FCD_SEGMENT;
104                    return Collation.SENTINEL_CP;
105                }
106                if(CollationFCD.hasLccc(c)) {
107                    int prev = Collation.SENTINEL_CP;
108                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
109                            CollationFCD.hasTccc(prev = iter.previous())) {
110                        iter.next();
111                        if(prev >= 0) {
112                            iter.next();
113                        }
114                        if(!previousSegment()) {
115                            return Collation.SENTINEL_CP;
116                        }
117                        continue;
118                    }
119                    // hasLccc(trail)=true for all trail surrogates
120                    if(isTrailSurrogate(c)) {
121                        if(prev < 0) {
122                            prev = iter.previous();
123                        }
124                        if(isLeadSurrogate(prev)) {
125                            return Character.toCodePoint((char)prev, (char)c);
126                        }
127                    }
128                    if(prev >= 0) {
129                        iter.next();
130                    }
131                }
132                return c;
133            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
134                c = iter.previousCodePoint();
135                pos -= Character.charCount(c);
136                assert(c >= 0);
137                return c;
138            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
139                c = normalized.codePointBefore(pos);
140                pos -= Character.charCount(c);
141                return c;
142            } else {
143                switchToBackward();
144            }
145        }
146    }
147
148    @Override
149    protected long handleNextCE32() {
150        int c;
151        for(;;) {
152            if(state == State.ITER_CHECK_FWD) {
153                c = iter.next();
154                if(c < 0) {
155                    return NO_CP_AND_CE32;
156                }
157                if(CollationFCD.hasTccc(c)) {
158                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
159                            CollationFCD.hasLccc(iter.current())) {
160                        iter.previous();
161                        if(!nextSegment()) {
162                            c = Collation.SENTINEL_CP;
163                            return Collation.FALLBACK_CE32;
164                        }
165                        continue;
166                    }
167                }
168                break;
169            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
170                c = iter.next();
171                ++pos;
172                assert(c >= 0);
173                break;
174            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
175                    pos != normalized.length()) {
176                c = normalized.charAt(pos++);
177                break;
178            } else {
179                switchToForward();
180            }
181        }
182        return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
183    }
184
185    @Override
186    protected char handleGetTrailSurrogate() {
187        if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
188            int trail = iter.next();
189            if(isTrailSurrogate(trail)) {
190                if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; }
191            } else if(trail >= 0) {
192                iter.previous();
193            }
194            return (char)trail;
195        } else {
196            assert(pos < normalized.length());
197            char trail;
198            if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; }
199            return trail;
200        }
201    }
202
203    @Override
204    protected void forwardNumCodePoints(int num) {
205        // Specify the class to avoid a virtual-function indirection.
206        // In Java, we would declare this class final.
207        while(num > 0 && nextCodePoint() >= 0) {
208            --num;
209        }
210    }
211
212    @Override
213    protected void backwardNumCodePoints(int num) {
214        // Specify the class to avoid a virtual-function indirection.
215        // In Java, we would declare this class final.
216        while(num > 0 && previousCodePoint() >= 0) {
217            --num;
218        }
219    }
220
221    /**
222     * Switches to forward checking if possible.
223     */
224    private void switchToForward() {
225        assert(state == State.ITER_CHECK_BWD ||
226                (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
227                (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length()));
228        if(state == State.ITER_CHECK_BWD) {
229            // Turn around from backward checking.
230            start = pos = iter.getIndex();
231            if(pos == limit) {
232                state = State.ITER_CHECK_FWD;  // Check forward.
233            } else {  // pos < limit
234                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
235            }
236        } else {
237            // Reached the end of the FCD segment.
238            if(state == State.ITER_IN_FCD_SEGMENT) {
239                // The input text segment is FCD, extend it forward.
240            } else {
241                // The input text segment needed to be normalized.
242                // Switch to checking forward from it.
243                if(state == State.IN_NORM_ITER_AT_START) {
244                    iter.moveIndex(limit - start);
245                }
246                start = limit;
247            }
248            state = State.ITER_CHECK_FWD;
249        }
250    }
251
252    /**
253     * Extends the FCD text segment forward or normalizes around pos.
254     * @return true if success
255     */
256    private boolean nextSegment() {
257        assert(state == State.ITER_CHECK_FWD);
258        // The input text [start..(iter index)[ passes the FCD check.
259        pos = iter.getIndex();
260        // Collect the characters being checked, in case they need to be normalized.
261        if(s == null) {
262            s = new StringBuilder();
263        } else {
264            s.setLength(0);
265        }
266        int prevCC = 0;
267        for(;;) {
268            // Fetch the next character and its fcd16 value.
269            int c = iter.nextCodePoint();
270            if(c < 0) { break; }
271            int fcd16 = nfcImpl.getFCD16(c);
272            int leadCC = fcd16 >> 8;
273            if(leadCC == 0 && s.length() != 0) {
274                // FCD boundary before this character.
275                iter.previousCodePoint();
276                break;
277            }
278            s.appendCodePoint(c);
279            if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
280                // Fails FCD check. Find the next FCD boundary and normalize.
281                for(;;) {
282                    c = iter.nextCodePoint();
283                    if(c < 0) { break; }
284                    if(nfcImpl.getFCD16(c) <= 0xff) {
285                        iter.previousCodePoint();
286                        break;
287                    }
288                    s.appendCodePoint(c);
289                }
290                normalize(s);
291                start = pos;
292                limit = pos + s.length();
293                state = State.IN_NORM_ITER_AT_LIMIT;
294                pos = 0;
295                return true;
296            }
297            prevCC = fcd16 & 0xff;
298            if(prevCC == 0) {
299                // FCD boundary after the last character.
300                break;
301            }
302        }
303        limit = pos + s.length();
304        assert(pos != limit);
305        iter.moveIndex(-s.length());
306        state = State.ITER_IN_FCD_SEGMENT;
307        return true;
308    }
309
310    /**
311     * Switches to backward checking.
312     */
313    private void switchToBackward() {
314        assert(state == State.ITER_CHECK_FWD ||
315                (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
316                (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
317        if(state == State.ITER_CHECK_FWD) {
318            // Turn around from forward checking.
319            limit = pos = iter.getIndex();
320            if(pos == start) {
321                state = State.ITER_CHECK_BWD;  // Check backward.
322            } else {  // pos > start
323                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
324            }
325        } else {
326            // Reached the start of the FCD segment.
327            if(state == State.ITER_IN_FCD_SEGMENT) {
328                // The input text segment is FCD, extend it backward.
329            } else {
330                // The input text segment needed to be normalized.
331                // Switch to checking backward from it.
332                if(state == State.IN_NORM_ITER_AT_LIMIT) {
333                    iter.moveIndex(start - limit);
334                }
335                limit = start;
336            }
337            state = State.ITER_CHECK_BWD;
338        }
339    }
340
341    /**
342     * Extends the FCD text segment backward or normalizes around pos.
343     * @return true if success
344     */
345    private boolean previousSegment() {
346        assert(state == State.ITER_CHECK_BWD);
347        // The input text [(iter index)..limit[ passes the FCD check.
348        pos = iter.getIndex();
349        // Collect the characters being checked, in case they need to be normalized.
350        if(s == null) {
351            s = new StringBuilder();
352        } else {
353            s.setLength(0);
354        }
355        int nextCC = 0;
356        for(;;) {
357            // Fetch the previous character and its fcd16 value.
358            int c = iter.previousCodePoint();
359            if(c < 0) { break; }
360            int fcd16 = nfcImpl.getFCD16(c);
361            int trailCC = fcd16 & 0xff;
362            if(trailCC == 0 && s.length() != 0) {
363                // FCD boundary after this character.
364                iter.nextCodePoint();
365                break;
366            }
367            s.appendCodePoint(c);
368            if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
369                                CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
370                // Fails FCD check. Find the previous FCD boundary and normalize.
371                while(fcd16 > 0xff) {
372                    c = iter.previousCodePoint();
373                    if(c < 0) { break; }
374                    fcd16 = nfcImpl.getFCD16(c);
375                    if(fcd16 == 0) {
376                        iter.nextCodePoint();
377                        break;
378                    }
379                    s.appendCodePoint(c);
380                }
381                s.reverse();
382                normalize(s);
383                limit = pos;
384                start = pos - s.length();
385                state = State.IN_NORM_ITER_AT_START;
386                pos = normalized.length();
387                return true;
388            }
389            nextCC = fcd16 >> 8;
390            if(nextCC == 0) {
391                // FCD boundary before the following character.
392                break;
393            }
394        }
395        start = pos - s.length();
396        assert(pos != start);
397        iter.moveIndex(s.length());
398        state = State.ITER_IN_FCD_SEGMENT;
399        return true;
400    }
401
402    private void normalize(CharSequence s) {
403        if(normalized == null) {
404            normalized = new StringBuilder();
405        }
406        // NFD without argument checking.
407        nfcImpl.decompose(s, normalized);
408    }
409
410    private enum State {
411        /**
412         * The input text [start..(iter index)[ passes the FCD check.
413         * Moving forward checks incrementally.
414         * pos & limit are undefined.
415         */
416        ITER_CHECK_FWD,
417        /**
418         * The input text [(iter index)..limit[ passes the FCD check.
419         * Moving backward checks incrementally.
420         * start & pos are undefined.
421         */
422        ITER_CHECK_BWD,
423        /**
424         * The input text [start..limit[ passes the FCD check.
425         * pos tracks the current text index.
426         */
427        ITER_IN_FCD_SEGMENT,
428        /**
429         * The input text [start..limit[ failed the FCD check and was normalized.
430         * pos tracks the current index in the normalized string.
431         * The text iterator is at the limit index.
432         */
433        IN_NORM_ITER_AT_LIMIT,
434        /**
435         * The input text [start..limit[ failed the FCD check and was normalized.
436         * pos tracks the current index in the normalized string.
437         * The text iterator is at the start index.
438         */
439        IN_NORM_ITER_AT_START
440    }
441
442    private State state;
443
444    private int start;
445    private int pos;
446    private int limit;
447
448    private final Normalizer2Impl nfcImpl;
449    private StringBuilder s;
450    private StringBuilder normalized;
451}
452