12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2010-2014, International Business Machines 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2010oct27 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Incrementally checks the input text for FCD and normalizes where necessary. 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class FCDUTF16CollationIterator extends UTF16CollationIterator { 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}. 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public FCDUTF16CollationIterator(CollationData d) { 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super(d); 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl = d.nfcImpl; 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) { 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super(data, numeric, s, p); 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawSeq = s; 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentStart = p; 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawLimit = s.length(); 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl = data.nfcImpl; 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean equals(Object other) { 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip the UTF16CollationIterator and call its parent. 422d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert if (!(other instanceof CollationIterator) 432d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert || !((CollationIterator)this).equals(other) 442d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert || !(other instanceof FCDUTF16CollationIterator)) 452d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert { 462d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert return false; 472d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other; 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compare the iterator state but not the text: Assume that the caller does that. 502d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert if (checkDir != o.checkDir) { 512d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert return false; 522d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 532d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert if (checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) { 542d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert return false; 552d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 562d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert if (checkDir != 0 || seq == rawSeq) { 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (pos - rawStart) == (o.pos - /*o.*/ rawStart); 582d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 592d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert else { 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) && 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos - start) == (o.pos - o.start); 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int hashCode() { 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert false : "hashCode not designed"; 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 42; // any arbitrary constant will do 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void resetToOffset(int newOffset) { 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = rawSeq; 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = pos = rawStart + newOffset; 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = rawLimit; 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getOffset() { 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir != 0 || seq == rawSeq) { 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return pos - rawStart; 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(pos == start) { 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return segmentStart - rawStart; 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return segmentLimit - rawStart; 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(boolean numeric, CharSequence s, int p) { 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super.setText(numeric, s, p); 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawSeq = s; 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentStart = p; 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawLimit = limit = s.length(); 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int nextCodePoint() { 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir > 0) { 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == limit) { 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.SENTINEL_CP; 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.hasTccc(c)) { 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.maybeTibetanCompositeVowel(c) || 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) { 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --pos; 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextSegment(); 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(checkDir == 0 && pos != limit) { 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switchToForward(); 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char trail; 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Character.isHighSurrogate(c) && pos != limit && 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.isLowSurrogate(trail = seq.charAt(pos))) { 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++pos; 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Character.toCodePoint(c, trail); 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int previousCodePoint() { 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir < 0) { 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == start) { 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.SENTINEL_CP; 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(--pos); 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.hasLccc(c)) { 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.maybeTibetanCompositeVowel(c) || 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1)))) { 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++pos; 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert previousSegment(); 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(--pos); 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(checkDir == 0 && pos != start) { 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(--pos); 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switchToBackward(); 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char lead; 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Character.isLowSurrogate(c) && pos != start && 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.isHighSurrogate(lead = seq.charAt(pos - 1))) { 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --pos; 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Character.toCodePoint(lead, c); 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected long handleNextCE32() { 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir > 0) { 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == limit) { 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return NO_CP_AND_CE32; 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.hasTccc(c)) { 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.maybeTibetanCompositeVowel(c) || 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) { 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --pos; 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextSegment(); 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(checkDir == 0 && pos != limit) { 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switchToForward(); 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c)); 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* boolean foundNULTerminator(); */ 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected void forwardNumCodePoints(int num) { 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Specify the class to avoid a virtual-function indirection. 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, we would declare this class final. 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(num > 0 && nextCodePoint() >= 0) { 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --num; 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected void backwardNumCodePoints(int num) { 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Specify the class to avoid a virtual-function indirection. 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, we would declare this class final. 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(num > 0 && previousCodePoint() >= 0) { 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --num; 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Switches to forward checking if possible. 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir < 0 || (checkDir == 0 && pos == limit). 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir > 0 || (checkDir == 0 && pos != limit). 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void switchToForward() { 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit)); 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir < 0) { 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Turn around from backward checking. 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = pos; 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == segmentLimit) { 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = rawLimit; 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; // Check forward. 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // pos < segmentLimit 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; // Stay in FCD segment. 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reached the end of the FCD segment. 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(seq == rawSeq) { 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment is FCD, extend it forward. 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment needed to be normalized. 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Switch to checking forward from it. 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = rawSeq; 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = start = segmentStart = segmentLimit; 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: If this segment is at the end of the input text, 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then it might help to return false to indicate that, so that 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we do not have to re-check and normalize when we turn around and go backwards. 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // However, that would complicate the call sites for an optimization of an unusual case. 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = rawLimit; 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Extend the FCD text segment forward or normalize around pos. 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir > 0 && pos != limit. 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir == 0 and pos != limit. 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void nextSegment() { 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(checkDir > 0 && seq == rawSeq && pos != limit); 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text [segmentStart..pos[ passes the FCD check. 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int p = pos; 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevCC = 0; 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the next character's fcd16 value. 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int q = p; 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = Character.codePointAt(seq, p); 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p += Character.charCount(c); 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16 = nfcImpl.getFCD16(c); 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int leadCC = fcd16 >> 8; 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(leadCC == 0 && q != pos) { 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary before the [q, p[ character. 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = segmentLimit = q; 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fails FCD check. Find the next FCD boundary and normalize. 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert q = p; 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == rawLimit) { break; } 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = Character.codePointAt(seq, p); 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p += Character.charCount(c); 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(nfcImpl.getFCD16(c) > 0xff); 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normalize(pos, q); 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = start; 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC = fcd16 & 0xff; 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == rawLimit || prevCC == 0) { 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary after the last character. 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = segmentLimit = p; 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(pos != limit); 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Switches to backward checking. 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir > 0 || (checkDir == 0 && pos == start). 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir < 0 || (checkDir == 0 && pos != start). 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void switchToBackward() { 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start)); 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir > 0) { 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Turn around from forward checking. 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = segmentLimit = pos; 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == segmentStart) { 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = rawStart; 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = -1; // Check backward. 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // pos > segmentStart 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; // Stay in FCD segment. 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reached the start of the FCD segment. 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(seq == rawSeq) { 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment is FCD, extend it backward. 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment needed to be normalized. 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Switch to checking backward from it. 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = rawSeq; 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = limit = segmentLimit = segmentStart; 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = rawStart; 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = -1; 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Extend the FCD text segment backward or normalize around pos. 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir < 0 && pos != start. 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir == 0 and pos != start. 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void previousSegment() { 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(checkDir < 0 && seq == rawSeq && pos != start); 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text [pos..segmentLimit[ passes the FCD check. 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int p = pos; 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextCC = 0; 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the previous character's fcd16 value. 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int q = p; 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = Character.codePointBefore(seq, p); 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p -= Character.charCount(c); 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16 = nfcImpl.getFCD16(c); 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int trailCC = fcd16 & 0xff; 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(trailCC == 0 && q != pos) { 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary after the [p, q[ character. 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = q; 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fails FCD check. Find the previous FCD boundary and normalize. 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert q = p; 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fcd16 <= 0xff || p == rawStart) { break; } 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = Character.codePointBefore(seq, p); 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p -= Character.charCount(c); 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while((fcd16 = nfcImpl.getFCD16(c)) != 0); 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normalize(q, pos); 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = limit; 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextCC = fcd16 >> 8; 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == rawStart || nextCC == 0) { 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary before the following character. 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = p; 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(pos != start); 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void normalize(int from, int to) { 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(normalized == null) { 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normalized = new StringBuilder(); 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // NFD without argument checking. 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl.decompose(rawSeq, from, to, normalized, to - from); 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Switch collation processing into the FCD buffer 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // with the result of normalizing [segmentStart, segmentLimit[. 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentStart = from; 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentLimit = to; 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = normalized; 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = 0; 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = start + normalized.length(); 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Text pointers: The input text is rawSeq[rawStart, rawLimit[. 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (In C++, these are const UChar * pointers. 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, we use CharSequence rawSeq and the parent class' seq 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // together with int indexes.) 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // checkDir > 0: 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text rawSeq[segmentStart..pos[ passes the FCD check. 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Moving forward checks incrementally. 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // segmentLimit is undefined. seq == rawSeq. limit == rawLimit. 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // checkDir < 0: 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text rawSeq[pos..segmentLimit[ passes the FCD check. 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Moving backward checks incrementally. 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // segmentStart is undefined. seq == rawSeq. start == rawStart. 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // checkDir == 0: 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text rawSeq[segmentStart..segmentLimit[ is being processed. 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // These pointers are at FCD boundaries. 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Either this text segment already passes the FCD check 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit, 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or the current segment had to be normalized so that 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // rawSeq[segmentStart..segmentLimit[ turned into the normalized string, 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length(). 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CharSequence rawSeq; 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int rawStart = 0; 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int segmentStart; 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int segmentLimit; 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int rawLimit; 4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final Normalizer2Impl nfcImpl; 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private StringBuilder normalized; 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Direction of incremental FCD check. See comments before rawStart. 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int checkDir; 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 426