/* ******************************************************************************* * Copyright (C) 2012-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp * * C++ version created on: 2012sep23 (from utf16collationiterator.h) * created by: Markus W. Scherer */ package com.ibm.icu.impl.coll; import com.ibm.icu.impl.Normalizer2Impl; import com.ibm.icu.text.UCharacterIterator; /** * Incrementally checks the input text for FCD and normalizes where necessary. */ public final class FCDIterCollationIterator extends IterCollationIterator { public FCDIterCollationIterator(CollationData data, boolean numeric, UCharacterIterator ui, int startIndex) { super(data, numeric, ui); state = State.ITER_CHECK_FWD; start = startIndex; nfcImpl = data.nfcImpl; } @Override public void resetToOffset(int newOffset) { super.resetToOffset(newOffset); start = newOffset; state = State.ITER_CHECK_FWD; } @Override public int getOffset() { if(state.compareTo(State.ITER_CHECK_BWD) <= 0) { return iter.getIndex(); } else if(state == State.ITER_IN_FCD_SEGMENT) { return pos; } else if(pos == 0) { return start; } else { return limit; } } @Override public int nextCodePoint() { int c; for(;;) { if(state == State.ITER_CHECK_FWD) { c = iter.next(); if(c < 0) { return c; } if(CollationFCD.hasTccc(c)) { if(CollationFCD.maybeTibetanCompositeVowel(c) || CollationFCD.hasLccc(iter.current())) { iter.previous(); if(!nextSegment()) { return Collation.SENTINEL_CP; } continue; } } if(isLeadSurrogate(c)) { int trail = iter.next(); if(isTrailSurrogate(trail)) { return Character.toCodePoint((char)c, (char)trail); } else if(trail >= 0) { iter.previous(); } } return c; } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) { c = iter.nextCodePoint(); pos += Character.charCount(c); assert(c >= 0); return c; } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != normalized.length()) { c = normalized.codePointAt(pos); pos += Character.charCount(c); return c; } else { switchToForward(); } } } @Override public int previousCodePoint() { int c; for(;;) { if(state == State.ITER_CHECK_BWD) { c = iter.previous(); if(c < 0) { start = pos = 0; state = State.ITER_IN_FCD_SEGMENT; return Collation.SENTINEL_CP; } if(CollationFCD.hasLccc(c)) { int prev = Collation.SENTINEL_CP; if(CollationFCD.maybeTibetanCompositeVowel(c) || CollationFCD.hasTccc(prev = iter.previous())) { iter.next(); if(prev >= 0) { iter.next(); } if(!previousSegment()) { return Collation.SENTINEL_CP; } continue; } // hasLccc(trail)=true for all trail surrogates if(isTrailSurrogate(c)) { if(prev < 0) { prev = iter.previous(); } if(isLeadSurrogate(prev)) { return Character.toCodePoint((char)prev, (char)c); } } if(prev >= 0) { iter.next(); } } return c; } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) { c = iter.previousCodePoint(); pos -= Character.charCount(c); assert(c >= 0); return c; } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) { c = normalized.codePointBefore(pos); pos -= Character.charCount(c); return c; } else { switchToBackward(); } } } @Override protected long handleNextCE32() { int c; for(;;) { if(state == State.ITER_CHECK_FWD) { c = iter.next(); if(c < 0) { return NO_CP_AND_CE32; } if(CollationFCD.hasTccc(c)) { if(CollationFCD.maybeTibetanCompositeVowel(c) || CollationFCD.hasLccc(iter.current())) { iter.previous(); if(!nextSegment()) { c = Collation.SENTINEL_CP; return Collation.FALLBACK_CE32; } continue; } } break; } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) { c = iter.next(); ++pos; assert(c >= 0); break; } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != normalized.length()) { c = normalized.charAt(pos++); break; } else { switchToForward(); } } return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c)); } @Override protected char handleGetTrailSurrogate() { if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) { int trail = iter.next(); if(isTrailSurrogate(trail)) { if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; } } else if(trail >= 0) { iter.previous(); } return (char)trail; } else { assert(pos < normalized.length()); char trail; if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; } return trail; } } @Override protected void forwardNumCodePoints(int num) { // Specify the class to avoid a virtual-function indirection. // In Java, we would declare this class final. while(num > 0 && nextCodePoint() >= 0) { --num; } } @Override protected void backwardNumCodePoints(int num) { // Specify the class to avoid a virtual-function indirection. // In Java, we would declare this class final. while(num > 0 && previousCodePoint() >= 0) { --num; } } /** * Switches to forward checking if possible. */ private void switchToForward() { assert(state == State.ITER_CHECK_BWD || (state == State.ITER_IN_FCD_SEGMENT && pos == limit) || (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length())); if(state == State.ITER_CHECK_BWD) { // Turn around from backward checking. start = pos = iter.getIndex(); if(pos == limit) { state = State.ITER_CHECK_FWD; // Check forward. } else { // pos < limit state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment. } } else { // Reached the end of the FCD segment. if(state == State.ITER_IN_FCD_SEGMENT) { // The input text segment is FCD, extend it forward. } else { // The input text segment needed to be normalized. // Switch to checking forward from it. if(state == State.IN_NORM_ITER_AT_START) { iter.moveIndex(limit - start); } start = limit; } state = State.ITER_CHECK_FWD; } } /** * Extends the FCD text segment forward or normalizes around pos. * @return true if success */ private boolean nextSegment() { assert(state == State.ITER_CHECK_FWD); // The input text [start..(iter index)[ passes the FCD check. pos = iter.getIndex(); // Collect the characters being checked, in case they need to be normalized. if(s == null) { s = new StringBuilder(); } else { s.setLength(0); } int prevCC = 0; for(;;) { // Fetch the next character and its fcd16 value. int c = iter.nextCodePoint(); if(c < 0) { break; } int fcd16 = nfcImpl.getFCD16(c); int leadCC = fcd16 >> 8; if(leadCC == 0 && s.length() != 0) { // FCD boundary before this character. iter.previousCodePoint(); break; } s.appendCodePoint(c); if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { // Fails FCD check. Find the next FCD boundary and normalize. for(;;) { c = iter.nextCodePoint(); if(c < 0) { break; } if(nfcImpl.getFCD16(c) <= 0xff) { iter.previousCodePoint(); break; } s.appendCodePoint(c); } normalize(s); start = pos; limit = pos + s.length(); state = State.IN_NORM_ITER_AT_LIMIT; pos = 0; return true; } prevCC = fcd16 & 0xff; if(prevCC == 0) { // FCD boundary after the last character. break; } } limit = pos + s.length(); assert(pos != limit); iter.moveIndex(-s.length()); state = State.ITER_IN_FCD_SEGMENT; return true; } /** * Switches to backward checking. */ private void switchToBackward() { assert(state == State.ITER_CHECK_FWD || (state == State.ITER_IN_FCD_SEGMENT && pos == start) || (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0)); if(state == State.ITER_CHECK_FWD) { // Turn around from forward checking. limit = pos = iter.getIndex(); if(pos == start) { state = State.ITER_CHECK_BWD; // Check backward. } else { // pos > start state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment. } } else { // Reached the start of the FCD segment. if(state == State.ITER_IN_FCD_SEGMENT) { // The input text segment is FCD, extend it backward. } else { // The input text segment needed to be normalized. // Switch to checking backward from it. if(state == State.IN_NORM_ITER_AT_LIMIT) { iter.moveIndex(start - limit); } limit = start; } state = State.ITER_CHECK_BWD; } } /** * Extends the FCD text segment backward or normalizes around pos. * @return true if success */ private boolean previousSegment() { assert(state == State.ITER_CHECK_BWD); // The input text [(iter index)..limit[ passes the FCD check. pos = iter.getIndex(); // Collect the characters being checked, in case they need to be normalized. if(s == null) { s = new StringBuilder(); } else { s.setLength(0); } int nextCC = 0; for(;;) { // Fetch the previous character and its fcd16 value. int c = iter.previousCodePoint(); if(c < 0) { break; } int fcd16 = nfcImpl.getFCD16(c); int trailCC = fcd16 & 0xff; if(trailCC == 0 && s.length() != 0) { // FCD boundary after this character. iter.nextCodePoint(); break; } s.appendCodePoint(c); if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { // Fails FCD check. Find the previous FCD boundary and normalize. while(fcd16 > 0xff) { c = iter.previousCodePoint(); if(c < 0) { break; } fcd16 = nfcImpl.getFCD16(c); if(fcd16 == 0) { iter.nextCodePoint(); break; } s.appendCodePoint(c); } s.reverse(); normalize(s); limit = pos; start = pos - s.length(); state = State.IN_NORM_ITER_AT_START; pos = normalized.length(); return true; } nextCC = fcd16 >> 8; if(nextCC == 0) { // FCD boundary before the following character. break; } } start = pos - s.length(); assert(pos != start); iter.moveIndex(s.length()); state = State.ITER_IN_FCD_SEGMENT; return true; } private void normalize(CharSequence s) { if(normalized == null) { normalized = new StringBuilder(); } // NFD without argument checking. nfcImpl.decompose(s, normalized); } private enum State { /** * The input text [start..(iter index)[ passes the FCD check. * Moving forward checks incrementally. * pos & limit are undefined. */ ITER_CHECK_FWD, /** * The input text [(iter index)..limit[ passes the FCD check. * Moving backward checks incrementally. * start & pos are undefined. */ ITER_CHECK_BWD, /** * The input text [start..limit[ passes the FCD check. * pos tracks the current text index. */ ITER_IN_FCD_SEGMENT, /** * The input text [start..limit[ failed the FCD check and was normalized. * pos tracks the current index in the normalized string. * The text iterator is at the limit index. */ IN_NORM_ITER_AT_LIMIT, /** * The input text [start..limit[ failed the FCD check and was normalized. * pos tracks the current index in the normalized string. * The text iterator is at the start index. */ IN_NORM_ITER_AT_START } private State state; private int start; private int pos; private int limit; private final Normalizer2Impl nfcImpl; private StringBuilder s; private StringBuilder normalized; }