17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2010-2014, International Business Machines 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2010oct27 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Incrementally checks the input text for FCD and normalizes where necessary. 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class FCDUTF16CollationIterator extends UTF16CollationIterator { 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}. 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public FCDUTF16CollationIterator(CollationData d) { 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super(d); 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl = d.nfcImpl; 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) { 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super(data, numeric, s, p); 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawSeq = s; 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentStart = p; 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawLimit = s.length(); 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl = data.nfcImpl; 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean equals(Object other) { 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip the UTF16CollationIterator and call its parent. 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!((CollationIterator)this).equals(other)) { return false; } 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other; 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compare the iterator state but not the text: Assume that the caller does that. 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir != o.checkDir) { return false; } 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) { return false; } 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir != 0 || seq == rawSeq) { 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (pos - rawStart) == (o.pos - /*o.*/ rawStart); 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) && 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos - start) == (o.pos - o.start); 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int hashCode() { 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert false : "hashCode not designed"; 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 42; // any arbitrary constant will do 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void resetToOffset(int newOffset) { 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = rawSeq; 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = pos = rawStart + newOffset; 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = rawLimit; 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getOffset() { 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir != 0 || seq == rawSeq) { 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return pos - rawStart; 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(pos == start) { 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return segmentStart - rawStart; 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return segmentLimit - rawStart; 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(boolean numeric, CharSequence s, int p) { 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super.setText(numeric, s, p); 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawSeq = s; 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentStart = p; 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rawLimit = limit = s.length(); 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int nextCodePoint() { 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir > 0) { 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == limit) { 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.SENTINEL_CP; 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.hasTccc(c)) { 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.maybeTibetanCompositeVowel(c) || 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) { 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --pos; 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextSegment(); 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(checkDir == 0 && pos != limit) { 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switchToForward(); 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char trail; 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Character.isHighSurrogate(c) && pos != limit && 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.isLowSurrogate(trail = seq.charAt(pos))) { 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++pos; 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Character.toCodePoint(c, trail); 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int previousCodePoint() { 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir < 0) { 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == start) { 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.SENTINEL_CP; 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(--pos); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.hasLccc(c)) { 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.maybeTibetanCompositeVowel(c) || 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1)))) { 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++pos; 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert previousSegment(); 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(--pos); 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(checkDir == 0 && pos != start) { 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(--pos); 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switchToBackward(); 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char lead; 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Character.isLowSurrogate(c) && pos != start && 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.isHighSurrogate(lead = seq.charAt(pos - 1))) { 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --pos; 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Character.toCodePoint(lead, c); 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected long handleNextCE32() { 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir > 0) { 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == limit) { 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return NO_CP_AND_CE32; 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.hasTccc(c)) { 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationFCD.maybeTibetanCompositeVowel(c) || 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) { 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --pos; 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextSegment(); 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(checkDir == 0 && pos != limit) { 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = seq.charAt(pos++); 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switchToForward(); 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c)); 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* boolean foundNULTerminator(); */ 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected void forwardNumCodePoints(int num) { 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Specify the class to avoid a virtual-function indirection. 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, we would declare this class final. 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(num > 0 && nextCodePoint() >= 0) { 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --num; 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected void backwardNumCodePoints(int num) { 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Specify the class to avoid a virtual-function indirection. 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, we would declare this class final. 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(num > 0 && previousCodePoint() >= 0) { 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --num; 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Switches to forward checking if possible. 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir < 0 || (checkDir == 0 && pos == limit). 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir > 0 || (checkDir == 0 && pos != limit). 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void switchToForward() { 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit)); 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir < 0) { 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Turn around from backward checking. 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = pos; 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == segmentLimit) { 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = rawLimit; 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; // Check forward. 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // pos < segmentLimit 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; // Stay in FCD segment. 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reached the end of the FCD segment. 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(seq == rawSeq) { 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment is FCD, extend it forward. 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment needed to be normalized. 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Switch to checking forward from it. 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = rawSeq; 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = start = segmentStart = segmentLimit; 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: If this segment is at the end of the input text, 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then it might help to return false to indicate that, so that 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we do not have to re-check and normalize when we turn around and go backwards. 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // However, that would complicate the call sites for an optimization of an unusual case. 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = rawLimit; 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 1; 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Extend the FCD text segment forward or normalize around pos. 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir > 0 && pos != limit. 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir == 0 and pos != limit. 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void nextSegment() { 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(checkDir > 0 && seq == rawSeq && pos != limit); 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text [segmentStart..pos[ passes the FCD check. 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int p = pos; 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevCC = 0; 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the next character's fcd16 value. 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int q = p; 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = Character.codePointAt(seq, p); 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p += Character.charCount(c); 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16 = nfcImpl.getFCD16(c); 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int leadCC = fcd16 >> 8; 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(leadCC == 0 && q != pos) { 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary before the [q, p[ character. 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = segmentLimit = q; 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fails FCD check. Find the next FCD boundary and normalize. 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert q = p; 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == rawLimit) { break; } 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = Character.codePointAt(seq, p); 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p += Character.charCount(c); 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(nfcImpl.getFCD16(c) > 0xff); 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normalize(pos, q); 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = start; 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC = fcd16 & 0xff; 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == rawLimit || prevCC == 0) { 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary after the last character. 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = segmentLimit = p; 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(pos != limit); 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Switches to backward checking. 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir > 0 || (checkDir == 0 && pos == start). 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir < 0 || (checkDir == 0 && pos != start). 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void switchToBackward() { 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start)); 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(checkDir > 0) { 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Turn around from forward checking. 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = segmentLimit = pos; 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos == segmentStart) { 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = rawStart; 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = -1; // Check backward. 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // pos > segmentStart 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; // Stay in FCD segment. 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reached the start of the FCD segment. 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(seq == rawSeq) { 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment is FCD, extend it backward. 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text segment needed to be normalized. 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Switch to checking backward from it. 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = rawSeq; 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = limit = segmentLimit = segmentStart; 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = rawStart; 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = -1; 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Extend the FCD text segment backward or normalize around pos. 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To be called when checkDir < 0 && pos != start. 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns with checkDir == 0 and pos != start. 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void previousSegment() { 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(checkDir < 0 && seq == rawSeq && pos != start); 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text [pos..segmentLimit[ passes the FCD check. 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int p = pos; 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextCC = 0; 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the previous character's fcd16 value. 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int q = p; 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = Character.codePointBefore(seq, p); 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p -= Character.charCount(c); 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16 = nfcImpl.getFCD16(c); 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int trailCC = fcd16 & 0xff; 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(trailCC == 0 && q != pos) { 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary after the [p, q[ character. 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = q; 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fails FCD check. Find the previous FCD boundary and normalize. 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert q = p; 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fcd16 <= 0xff || p == rawStart) { break; } 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = Character.codePointBefore(seq, p); 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p -= Character.charCount(c); 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while((fcd16 = nfcImpl.getFCD16(c)) != 0); 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normalize(q, pos); 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = limit; 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextCC = fcd16 >> 8; 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == rawStart || nextCC == 0) { 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCD boundary before the following character. 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = segmentStart = p; 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(pos != start); 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkDir = 0; 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void normalize(int from, int to) { 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(normalized == null) { 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normalized = new StringBuilder(); 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // NFD without argument checking. 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl.decompose(rawSeq, from, to, normalized, to - from); 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Switch collation processing into the FCD buffer 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // with the result of normalizing [segmentStart, segmentLimit[. 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentStart = from; 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segmentLimit = to; 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert seq = normalized; 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = 0; 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = start + normalized.length(); 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Text pointers: The input text is rawSeq[rawStart, rawLimit[. 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (In C++, these are const UChar * pointers. 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, we use CharSequence rawSeq and the parent class' seq 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // together with int indexes.) 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // checkDir > 0: 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text rawSeq[segmentStart..pos[ passes the FCD check. 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Moving forward checks incrementally. 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // segmentLimit is undefined. seq == rawSeq. limit == rawLimit. 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // checkDir < 0: 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text rawSeq[pos..segmentLimit[ passes the FCD check. 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Moving backward checks incrementally. 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // segmentStart is undefined. seq == rawSeq. start == rawStart. 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // checkDir == 0: 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text rawSeq[segmentStart..segmentLimit[ is being processed. 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // These pointers are at FCD boundaries. 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Either this text segment already passes the FCD check 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit, 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or the current segment had to be normalized so that 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // rawSeq[segmentStart..segmentLimit[ turned into the normalized string, 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length(). 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CharSequence rawSeq; 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int rawStart = 0; 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int segmentStart; 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int segmentLimit; 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int rawLimit; 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final Normalizer2Impl nfcImpl; 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private StringBuilder normalized; 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Direction of incremental FCD check. See comments before rawStart. 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int checkDir; 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 414