1/*
2*******************************************************************************
3* Copyright (C) 2012-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
7*
8* C++ version created on: 2012sep23 (from utf16collationiterator.h)
9* created by: Markus W. Scherer
10*/
11
12package com.ibm.icu.impl.coll;
13
14import com.ibm.icu.impl.Normalizer2Impl;
15import com.ibm.icu.text.UCharacterIterator;
16
17/**
18 * Incrementally checks the input text for FCD and normalizes where necessary.
19 */
20public final class FCDIterCollationIterator extends IterCollationIterator {
21    public FCDIterCollationIterator(CollationData data, boolean numeric,
22            UCharacterIterator ui, int startIndex) {
23        super(data, numeric, ui);
24        state = State.ITER_CHECK_FWD;
25        start = startIndex;
26        nfcImpl = data.nfcImpl;
27    }
28
29    @Override
30    public void resetToOffset(int newOffset) {
31        super.resetToOffset(newOffset);
32        start = newOffset;
33        state = State.ITER_CHECK_FWD;
34    }
35
36    @Override
37    public int getOffset() {
38        if(state.compareTo(State.ITER_CHECK_BWD) <= 0) {
39            return iter.getIndex();
40        } else if(state == State.ITER_IN_FCD_SEGMENT) {
41            return pos;
42        } else if(pos == 0) {
43            return start;
44        } else {
45            return limit;
46        }
47    }
48
49    @Override
50    public int nextCodePoint() {
51        int c;
52        for(;;) {
53            if(state == State.ITER_CHECK_FWD) {
54                c = iter.next();
55                if(c < 0) {
56                    return c;
57                }
58                if(CollationFCD.hasTccc(c)) {
59                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
60                            CollationFCD.hasLccc(iter.current())) {
61                        iter.previous();
62                        if(!nextSegment()) {
63                            return Collation.SENTINEL_CP;
64                        }
65                        continue;
66                    }
67                }
68                if(isLeadSurrogate(c)) {
69                    int trail = iter.next();
70                    if(isTrailSurrogate(trail)) {
71                        return Character.toCodePoint((char)c, (char)trail);
72                    } else if(trail >= 0) {
73                        iter.previous();
74                    }
75                }
76                return c;
77            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
78                c = iter.nextCodePoint();
79                pos += Character.charCount(c);
80                assert(c >= 0);
81                return c;
82            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
83                    pos != normalized.length()) {
84                c = normalized.codePointAt(pos);
85                pos += Character.charCount(c);
86                return c;
87            } else {
88                switchToForward();
89            }
90        }
91    }
92
93    @Override
94    public int previousCodePoint() {
95        int c;
96        for(;;) {
97            if(state == State.ITER_CHECK_BWD) {
98                c = iter.previous();
99                if(c < 0) {
100                    start = pos = 0;
101                    state = State.ITER_IN_FCD_SEGMENT;
102                    return Collation.SENTINEL_CP;
103                }
104                if(CollationFCD.hasLccc(c)) {
105                    int prev = Collation.SENTINEL_CP;
106                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
107                            CollationFCD.hasTccc(prev = iter.previous())) {
108                        iter.next();
109                        if(prev >= 0) {
110                            iter.next();
111                        }
112                        if(!previousSegment()) {
113                            return Collation.SENTINEL_CP;
114                        }
115                        continue;
116                    }
117                    // hasLccc(trail)=true for all trail surrogates
118                    if(isTrailSurrogate(c)) {
119                        if(prev < 0) {
120                            prev = iter.previous();
121                        }
122                        if(isLeadSurrogate(prev)) {
123                            return Character.toCodePoint((char)prev, (char)c);
124                        }
125                    }
126                    if(prev >= 0) {
127                        iter.next();
128                    }
129                }
130                return c;
131            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
132                c = iter.previousCodePoint();
133                pos -= Character.charCount(c);
134                assert(c >= 0);
135                return c;
136            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
137                c = normalized.codePointBefore(pos);
138                pos -= Character.charCount(c);
139                return c;
140            } else {
141                switchToBackward();
142            }
143        }
144    }
145
146    @Override
147    protected long handleNextCE32() {
148        int c;
149        for(;;) {
150            if(state == State.ITER_CHECK_FWD) {
151                c = iter.next();
152                if(c < 0) {
153                    return NO_CP_AND_CE32;
154                }
155                if(CollationFCD.hasTccc(c)) {
156                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
157                            CollationFCD.hasLccc(iter.current())) {
158                        iter.previous();
159                        if(!nextSegment()) {
160                            c = Collation.SENTINEL_CP;
161                            return Collation.FALLBACK_CE32;
162                        }
163                        continue;
164                    }
165                }
166                break;
167            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
168                c = iter.next();
169                ++pos;
170                assert(c >= 0);
171                break;
172            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
173                    pos != normalized.length()) {
174                c = normalized.charAt(pos++);
175                break;
176            } else {
177                switchToForward();
178            }
179        }
180        return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
181    }
182
183    @Override
184    protected char handleGetTrailSurrogate() {
185        if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
186            int trail = iter.next();
187            if(isTrailSurrogate(trail)) {
188                if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; }
189            } else if(trail >= 0) {
190                iter.previous();
191            }
192            return (char)trail;
193        } else {
194            assert(pos < normalized.length());
195            char trail;
196            if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; }
197            return trail;
198        }
199    }
200
201    @Override
202    protected void forwardNumCodePoints(int num) {
203        // Specify the class to avoid a virtual-function indirection.
204        // In Java, we would declare this class final.
205        while(num > 0 && nextCodePoint() >= 0) {
206            --num;
207        }
208    }
209
210    @Override
211    protected void backwardNumCodePoints(int num) {
212        // Specify the class to avoid a virtual-function indirection.
213        // In Java, we would declare this class final.
214        while(num > 0 && previousCodePoint() >= 0) {
215            --num;
216        }
217    }
218
219    /**
220     * Switches to forward checking if possible.
221     */
222    private void switchToForward() {
223        assert(state == State.ITER_CHECK_BWD ||
224                (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
225                (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length()));
226        if(state == State.ITER_CHECK_BWD) {
227            // Turn around from backward checking.
228            start = pos = iter.getIndex();
229            if(pos == limit) {
230                state = State.ITER_CHECK_FWD;  // Check forward.
231            } else {  // pos < limit
232                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
233            }
234        } else {
235            // Reached the end of the FCD segment.
236            if(state == State.ITER_IN_FCD_SEGMENT) {
237                // The input text segment is FCD, extend it forward.
238            } else {
239                // The input text segment needed to be normalized.
240                // Switch to checking forward from it.
241                if(state == State.IN_NORM_ITER_AT_START) {
242                    iter.moveIndex(limit - start);
243                }
244                start = limit;
245            }
246            state = State.ITER_CHECK_FWD;
247        }
248    }
249
250    /**
251     * Extends the FCD text segment forward or normalizes around pos.
252     * @return true if success
253     */
254    private boolean nextSegment() {
255        assert(state == State.ITER_CHECK_FWD);
256        // The input text [start..(iter index)[ passes the FCD check.
257        pos = iter.getIndex();
258        // Collect the characters being checked, in case they need to be normalized.
259        if(s == null) {
260            s = new StringBuilder();
261        } else {
262            s.setLength(0);
263        }
264        int prevCC = 0;
265        for(;;) {
266            // Fetch the next character and its fcd16 value.
267            int c = iter.nextCodePoint();
268            if(c < 0) { break; }
269            int fcd16 = nfcImpl.getFCD16(c);
270            int leadCC = fcd16 >> 8;
271            if(leadCC == 0 && s.length() != 0) {
272                // FCD boundary before this character.
273                iter.previousCodePoint();
274                break;
275            }
276            s.appendCodePoint(c);
277            if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
278                // Fails FCD check. Find the next FCD boundary and normalize.
279                for(;;) {
280                    c = iter.nextCodePoint();
281                    if(c < 0) { break; }
282                    if(nfcImpl.getFCD16(c) <= 0xff) {
283                        iter.previousCodePoint();
284                        break;
285                    }
286                    s.appendCodePoint(c);
287                }
288                normalize(s);
289                start = pos;
290                limit = pos + s.length();
291                state = State.IN_NORM_ITER_AT_LIMIT;
292                pos = 0;
293                return true;
294            }
295            prevCC = fcd16 & 0xff;
296            if(prevCC == 0) {
297                // FCD boundary after the last character.
298                break;
299            }
300        }
301        limit = pos + s.length();
302        assert(pos != limit);
303        iter.moveIndex(-s.length());
304        state = State.ITER_IN_FCD_SEGMENT;
305        return true;
306    }
307
308    /**
309     * Switches to backward checking.
310     */
311    private void switchToBackward() {
312        assert(state == State.ITER_CHECK_FWD ||
313                (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
314                (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
315        if(state == State.ITER_CHECK_FWD) {
316            // Turn around from forward checking.
317            limit = pos = iter.getIndex();
318            if(pos == start) {
319                state = State.ITER_CHECK_BWD;  // Check backward.
320            } else {  // pos > start
321                state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
322            }
323        } else {
324            // Reached the start of the FCD segment.
325            if(state == State.ITER_IN_FCD_SEGMENT) {
326                // The input text segment is FCD, extend it backward.
327            } else {
328                // The input text segment needed to be normalized.
329                // Switch to checking backward from it.
330                if(state == State.IN_NORM_ITER_AT_LIMIT) {
331                    iter.moveIndex(start - limit);
332                }
333                limit = start;
334            }
335            state = State.ITER_CHECK_BWD;
336        }
337    }
338
339    /**
340     * Extends the FCD text segment backward or normalizes around pos.
341     * @return true if success
342     */
343    private boolean previousSegment() {
344        assert(state == State.ITER_CHECK_BWD);
345        // The input text [(iter index)..limit[ passes the FCD check.
346        pos = iter.getIndex();
347        // Collect the characters being checked, in case they need to be normalized.
348        if(s == null) {
349            s = new StringBuilder();
350        } else {
351            s.setLength(0);
352        }
353        int nextCC = 0;
354        for(;;) {
355            // Fetch the previous character and its fcd16 value.
356            int c = iter.previousCodePoint();
357            if(c < 0) { break; }
358            int fcd16 = nfcImpl.getFCD16(c);
359            int trailCC = fcd16 & 0xff;
360            if(trailCC == 0 && s.length() != 0) {
361                // FCD boundary after this character.
362                iter.nextCodePoint();
363                break;
364            }
365            s.appendCodePoint(c);
366            if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
367                                CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
368                // Fails FCD check. Find the previous FCD boundary and normalize.
369                while(fcd16 > 0xff) {
370                    c = iter.previousCodePoint();
371                    if(c < 0) { break; }
372                    fcd16 = nfcImpl.getFCD16(c);
373                    if(fcd16 == 0) {
374                        iter.nextCodePoint();
375                        break;
376                    }
377                    s.appendCodePoint(c);
378                }
379                s.reverse();
380                normalize(s);
381                limit = pos;
382                start = pos - s.length();
383                state = State.IN_NORM_ITER_AT_START;
384                pos = normalized.length();
385                return true;
386            }
387            nextCC = fcd16 >> 8;
388            if(nextCC == 0) {
389                // FCD boundary before the following character.
390                break;
391            }
392        }
393        start = pos - s.length();
394        assert(pos != start);
395        iter.moveIndex(s.length());
396        state = State.ITER_IN_FCD_SEGMENT;
397        return true;
398    }
399
400    private void normalize(CharSequence s) {
401        if(normalized == null) {
402            normalized = new StringBuilder();
403        }
404        // NFD without argument checking.
405        nfcImpl.decompose(s, normalized);
406    }
407
408    private enum State {
409        /**
410         * The input text [start..(iter index)[ passes the FCD check.
411         * Moving forward checks incrementally.
412         * pos & limit are undefined.
413         */
414        ITER_CHECK_FWD,
415        /**
416         * The input text [(iter index)..limit[ passes the FCD check.
417         * Moving backward checks incrementally.
418         * start & pos are undefined.
419         */
420        ITER_CHECK_BWD,
421        /**
422         * The input text [start..limit[ passes the FCD check.
423         * pos tracks the current text index.
424         */
425        ITER_IN_FCD_SEGMENT,
426        /**
427         * The input text [start..limit[ failed the FCD check and was normalized.
428         * pos tracks the current index in the normalized string.
429         * The text iterator is at the limit index.
430         */
431        IN_NORM_ITER_AT_LIMIT,
432        /**
433         * The input text [start..limit[ failed the FCD check and was normalized.
434         * pos tracks the current index in the normalized string.
435         * The text iterator is at the start index.
436         */
437        IN_NORM_ITER_AT_START
438    }
439
440    private State state;
441
442    private int start;
443    private int pos;
444    private int limit;
445
446    private final Normalizer2Impl nfcImpl;
447    private StringBuilder s;
448    private StringBuilder normalized;
449}
450