12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2010-2014, International Business Machines 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* CollationIterator.java, ported from collationiterator.h/.cpp 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2010oct27 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl.Hangul; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Trie2_32; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.BytesTrie; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.CharsTrie; 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ICUException; 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Collation element iterator and abstract character iterator. 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * When a method returns a code point value, it must be in 0..10FFFF, 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * except it can be negative as a sentinel value. 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic abstract class CollationIterator { 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class CEBuffer { 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Large enough for CEs of most short strings. */ 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int INITIAL_CAPACITY = 40; 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEBuffer() {} 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void append(long ce) { 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length >= INITIAL_CAPACITY) { 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ensureAppendCapacity(1); 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer[length++] = ce; 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void appendUnsafe(long ce) { 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer[length++] = ce; 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void ensureAppendCapacity(int appCap) { 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int capacity = buffer.length; 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((length + appCap) <= capacity) { return; } 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(capacity < 1000) { 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert capacity *= 4; 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert capacity *= 2; 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(capacity < (length + appCap)); 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] newBuffer = new long[capacity]; 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.arraycopy(buffer, 0, newBuffer, 0, length); 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer = newBuffer; 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void incLength() { 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use INITIAL_CAPACITY for a very simple fastpath. 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (Rather than buffer.getCapacity().) 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length >= INITIAL_CAPACITY) { 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ensureAppendCapacity(1); 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++length; 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long set(int i, long ce) { 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return buffer[i] = ce; 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long get(int i) { return buffer[i]; } 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] getCEs() { return buffer; } 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = 0; 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private long[] buffer = new long[INITIAL_CAPACITY]; 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // State of combining marks skipped in discontiguous contraction. 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We create a state object on first use and keep it around deactivated between uses. 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class SkippedState { 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Born active but empty. 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert SkippedState() {} 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void clear() { 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert oldBuffer.setLength(0); 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = 0; 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The newBuffer is reset by setFirstSkipped(). 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isEmpty() { return oldBuffer.length() == 0; } 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean hasNext() { return pos < oldBuffer.length(); } 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Requires hasNext(). 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int next() { 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = oldBuffer.codePointAt(pos); 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos += Character.charCount(c); 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Accounts for one more input code point read beyond the end of the marks buffer. 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void incBeyond() { 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(!hasNext()); 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++pos; 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Goes backward through the skipped-marks buffer. 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Returns the number of code points read beyond the skipped marks 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // that need to be backtracked through normal input. 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int backwardNumCodePoints(int n) { 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = oldBuffer.length(); 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int beyond = pos - length; 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(beyond > 0) { 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(beyond >= n) { 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Not back far enough to re-enter the oldBuffer. 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos -= n; 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return n; 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Back out all beyond-oldBuffer code points and re-enter the buffer. 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = oldBuffer.offsetByCodePoints(length, beyond - n); 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return beyond; 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Go backwards from inside the oldBuffer. 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = oldBuffer.offsetByCodePoints(pos, -n); 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void setFirstSkipped(int c) { 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipLengthAtMatch = 0; 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newBuffer.setLength(0); 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newBuffer.appendCodePoint(c); 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void skip(int c) { 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newBuffer.appendCodePoint(c); 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void recordMatch() { skipLengthAtMatch = newBuffer.length(); } 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Replaces the characters we consumed with the newly skipped ones. 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void replaceMatch() { 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: UnicodeString.replace() pins pos to at most length(). 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int oldLength = oldBuffer.length(); 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(pos > oldLength) { pos = oldLength; } 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch); 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = 0; 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void saveTrieState(CharsTrie trie) { trie.saveState(state); } 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void resetToTrieState(CharsTrie trie) { trie.resetToState(state); } 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Combining marks skipped in previous discontiguous-contraction matching. 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // After that discontiguous contraction was completed, we start reading them from here. 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final StringBuilder oldBuffer = new StringBuilder(); 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Combining marks newly skipped in current discontiguous-contraction matching. 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // These might have been read from the normal text or from the oldBuffer. 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final StringBuilder newBuffer = new StringBuilder(); 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reading index in oldBuffer, 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()). 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int pos; 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // newBuffer.length() at the time of the last matching character. 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // When a partial match fails, we back out skipped and partial-matching input characters. 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int skipLengthAtMatch; 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We save the trie state before we attempt to match a character, 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // so that we can skip it and try the next one. 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CharsTrie.State state = new CharsTrie.State(); 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Partially constructs the iterator. 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * In Java, we cache partially constructed iterators 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and finish their setup when starting to work on text 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (via reset(boolean) and the setText(numeric, ...) methods of subclasses). 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This avoids memory allocations for iterators that remain unused. 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p>In C++, there is only one constructor, and iterators are 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * stack-allocated as needed. 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CollationIterator(CollationData d) { 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert trie = d.trie; 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert data = d; 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numCpFwd = -1; 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isNumeric = false; 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer = null; 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CollationIterator(CollationData d, boolean numeric) { 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert trie = d.trie; 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert data = d; 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numCpFwd = -1; 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isNumeric = numeric; 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer = new CEBuffer(); 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean equals(Object other) { 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Subclasses: Call this method and then add more specific checks. 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compare the iterator state but not the collation data (trie & data fields): 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Assume that the caller compares the data. 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Ignore skipped since that should be unused between calls to nextCE(). 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (It only stays around to avoid another memory allocation.) 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(other == null) { return false; } 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!this.getClass().equals(other.getClass())) { return false; } 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationIterator o = (CollationIterator)other; 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!(ceBuffer.length == o.ceBuffer.length && 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesIndex == o.cesIndex && 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numCpFwd == o.numCpFwd && 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isNumeric == o.isNumeric)) { 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 0; i < ceBuffer.length; ++i) { 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; } 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2222d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 2232d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert public int hashCode() { 2242d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert // Dummy return to prevent compile warnings. 2252d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert return 0; 2262d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 2272d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Resets the iterator state and sets the position to the specified offset. 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Subclasses must implement, and must call the parent class method, 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or CollationIterator.reset(). 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public abstract void resetToOffset(int newOffset); 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public abstract int getOffset(); 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the next collation element. 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public final long nextCE() { 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cesIndex < ceBuffer.length) { 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Return the next buffered CE. 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.get(cesIndex++); 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert cesIndex == ceBuffer.length; 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.incLength(); 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long cAndCE32 = handleNextCE32(); 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = (int)(cAndCE32 >> 32); 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce32 = (int)cAndCE32; 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int t = ce32 & 0xff; 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(t < Collation.SPECIAL_CE32_LOW_BYTE) { // Forced-inline of isSpecialCE32(ce32). 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Normal CE from the main data. 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Forced-inline of ceFromSimpleCE32(ce32). 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.set(cesIndex++, 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationData d; 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The compiler should be able to optimize the previous and the following 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // comparisons of t with the same constant. 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(t == Collation.SPECIAL_CE32_LOW_BYTE) { 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c < 0) { 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.set(cesIndex++, Collation.NO_CE); 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = data.base; 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getCE32(c); 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert t = ce32 & 0xff; 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(t < Collation.SPECIAL_CE32_LOW_BYTE) { 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Normal CE from the base data. 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.set(cesIndex++, 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = data; 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) { 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Forced-inline of ceFromLongPrimaryCE32(ce32). 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.set(cesIndex++, 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE); 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return nextCEFromCE32(d, c, ce32); 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Fetches all CEs. 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return getCEsLength() 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public final int fetchCEs() { 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(nextCE() != Collation.NO_CE) { 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No need to loop for each expansion CE. 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesIndex = ceBuffer.length; 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.length; 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Overwrites the current CE (the last one returned by nextCE()). 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert final void setCurrentCE(long ce) { 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert cesIndex > 0; 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.set(cesIndex - 1, ce); 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the previous collation element. 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public final long previousCE(UVector32 offsets) { 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ceBuffer.length > 0) { 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Return the previous buffered CE. 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.get(--ceBuffer.length); 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offsets.removeAllElements(); 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limitOffset = getOffset(); 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = previousCodePoint(); 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c < 0) { return Collation.NO_CE; } 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(data.isUnsafeBackward(c, isNumeric)) { 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return previousCEUnsafe(c, offsets); 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Simple, safe-backwards iteration: 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Get a CE going backwards, handle prefixes but no contractions. 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce32 = data.getCE32(c); 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationData d; 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.FALLBACK_CE32) { 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = data.base; 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getCE32(c); 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = data; 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Collation.isSimpleOrLongCE32(ce32)) { 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.ceFromCE32(ce32); 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendCEsFromCE32(d, c, ce32, false); 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ceBuffer.length > 1) { 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offsets.addElement(getOffset()); 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For an expansion, the offset of each non-initial CE is the limit offset, 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // consistent with forward iteration. 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(offsets.size() <= ceBuffer.length) { 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offsets.addElement(limitOffset); 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.get(--ceBuffer.length); 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public final int getCEsLength() { 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.length; 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public final long getCE(int i) { 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.get(i); 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public final long[] getCEs() { 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.getCEs(); 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert final void clearCEs() { 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesIndex = ceBuffer.length = 0; 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public final void clearCEsIfNoneRemaining() { 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cesIndex == ceBuffer.length) { clearCEs(); } 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the next code point (with post-increment). 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Public for identical-level comparison and for testing. 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public abstract int nextCodePoint(); 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the previous code point (with pre-decrement). 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Public for identical-level comparison and for testing. 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public abstract int previousCodePoint(); 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected final void reset() { 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesIndex = ceBuffer.length = 0; 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped != null) { skipped.clear(); } 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Resets the state as well as the numeric setting, 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and completes the initialization. 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Only exists in Java where we reset cached CollationIterator instances 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rather than stack-allocating temporary ones. 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (See also the constructor comments.) 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected final void reset(boolean numeric) { 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ceBuffer == null) { 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer = new CEBuffer(); 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isNumeric = numeric; 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the next code point and its local CE32 value. 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns Collation.FALLBACK_CE32 at the end of the text (c<0) 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or when c's CE32 value is to be looked up in the base data (fallback). 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The code point is used for fallbacks, context and implicit weights. 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32). 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0. 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected long handleNextCE32() { 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = nextCodePoint(); 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c < 0) { return NO_CP_AND_CE32; } 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return makeCodePointAndCE32Pair(c, data.getCE32(c)); 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected long makeCodePointAndCE32Pair(int c, int ce32) { 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((long)c << 32) | (ce32 & 0xffffffffL); 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL); 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit. 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the trail surrogate in that case and advances past it, 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if a trail surrogate follows the lead surrogate. 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Otherwise returns any other code unit and does not advance. 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected char handleGetTrailSurrogate() { 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator. 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Not needed in Java.) 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /*protected boolean foundNULTerminator() { 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }*/ 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return false if surrogate code points U+D800..U+DFFF 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * map to their own implicit primary weights (for UTF-16), 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or true if they map to CE(U+FFFD) (for UTF-8) 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected boolean forbidSurrogateCodePoints() { 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected abstract void forwardNumCodePoints(int num); 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected abstract void backwardNumCodePoints(int num); 4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the CE32 from the data trie. 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normally the same as data.getCE32(), but overridden in the builder. 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Call this only when the faster data.getCE32() cannot be used. 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected int getDataCE32(int c) { 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return data.getCE32(c); 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected int getCE32FromBuilderData(int ce32) { 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUException("internal program error: should be unreachable"); 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected final void appendCEsFromCE32(CollationData d, int c, int ce32, 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean forward) { 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(Collation.isSpecialCE32(ce32)) { 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch(Collation.tagFromCE32(ce32)) { 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.FALLBACK_TAG: 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.RESERVED_TAG_3: 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUException("internal program error: should be unreachable"); 4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LONG_PRIMARY_TAG: 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32)); 4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LONG_SECONDARY_TAG: 4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32)); 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LATIN_EXPANSION_TAG: 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.ensureAppendCapacity(2); 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32)); 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32)); 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.length += 2; 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.EXPANSION32_TAG: { 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = Collation.indexFromCE32(ce32); 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Collation.lengthFromCE32(ce32); 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.ensureAppendCapacity(length); 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++])); 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(--length > 0); 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.EXPANSION_TAG: { 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = Collation.indexFromCE32(ce32); 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Collation.lengthFromCE32(ce32); 4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.ensureAppendCapacity(length); 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.appendUnsafe(d.ces[index++]); 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(--length > 0); 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.BUILDER_DATA_TAG: 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = getCE32FromBuilderData(ce32); 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.FALLBACK_CE32) { 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = data.base; 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getCE32(c); 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.PREFIX_TAG: 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(forward) { backwardNumCodePoints(1); } 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = getCE32FromPrefix(d, ce32); 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(forward) { forwardNumCodePoints(1); } 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.CONTRACTION_TAG: { 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = Collation.indexFromCE32(ce32); 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int defaultCE32 = d.getCE32FromContexts(index); // Default if no suffix match. 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!forward) { 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Backward contractions are handled by previousCEUnsafe(). 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c has contractions but they were not found. 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = defaultCE32; 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextCp; 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped == null && numCpFwd < 0) { 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path, 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // avoiding the function call and the nextSkippedCodePoint() overhead. 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextCp = nextCodePoint(); 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextCp < 0) { 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No more text. 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = defaultCE32; 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert !CollationFCD.mayHaveLccc(nextCp)) { 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All contraction suffixes start with characters with lccc!=0 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but the next code point has lccc==0. 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumCodePoints(1); 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = defaultCE32; 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextCp = nextSkippedCodePoint(); 5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextCp < 0) { 5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No more text. 5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = defaultCE32; 5387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert !CollationFCD.mayHaveLccc(nextCp)) { 5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All contraction suffixes start with characters with lccc!=0 5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but the next code point has lccc==0. 5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumSkipped(1); 5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = defaultCE32; 5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp); 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.NO_CE32) { 5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // CEs from a discontiguous contraction plus the skipped combining marks 5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // have been appended already. 5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.DIGIT_TAG: 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isNumeric) { 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendNumericCEs(ce32, forward); 5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the non-numeric-collation CE32 and continue. 5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 5637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.U0000_TAG: 5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(c == 0); 5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // NUL-terminated input not supported in Java. 5687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the normal ce32 for U+0000 and continue. 5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.ce32s[0]; 5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.HANGUL_TAG: { 5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] jamoCE32s = d.jamoCE32s; 5737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c -= Hangul.HANGUL_BASE; 5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int t = c % Hangul.JAMO_T_COUNT; 5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c /= Hangul.JAMO_T_COUNT; 5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int v = c % Hangul.JAMO_V_COUNT; 5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c /= Hangul.JAMO_V_COUNT; 5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) { 5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // None of the Jamo CE32s are isSpecialCE32(). 5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Avoid recursive function calls and per-Jamo tests. 5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3); 5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c])); 5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v])); 5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.length += 2; 5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(t != 0) { 5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t])); 5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 5897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We should not need to compute each Jamo code point. 5917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In particular, there should be no offset or implicit ce32. 5927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward); 5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward); 5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(t == 0) { return; } 5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // offset 39 = 19 + 21 - 1: 5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 19 = JAMO_L_COUNT 5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 21 = JAMO_T_COUNT 5987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // -1 = omit t==0 5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = jamoCE32s[39 + t]; 6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = Collation.SENTINEL_CP; 6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 6027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LEAD_SURROGATE_TAG: { 6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(forward); // Backward iteration should never see lead surrogate code _unit_ data. 6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(isLeadSurrogate(c)); 6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char trail; 6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) { 6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = Character.toCodePoint((char)c, trail); 6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 &= Collation.LEAD_TYPE_MASK; 6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.LEAD_ALL_UNASSIGNED) { 6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = Collation.UNASSIGNED_CE32; // unassigned-implicit 6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(ce32 == Collation.LEAD_ALL_FALLBACK || 6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) { 6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fall back to the base data 6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = d.base; 6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getCE32FromSupplementary(c); 6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c is an unpaired surrogate. 6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = Collation.UNASSIGNED_CE32; 6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.OFFSET_TAG: 6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(c >= 0); 6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(d.getCEFromOffsetCE32(c, ce32)); 6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.IMPLICIT_TAG: 6307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(c >= 0); 6317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isSurrogate(c) && forbidSurrogateCodePoints()) { 6327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = Collation.FFFD_CE32; 6337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 6347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.unassignedCEFromCodePoint(c)); 6367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 6377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.ceFromSimpleCE32(ce32)); 6417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Propose widening the UTF16 method. 6447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final boolean isSurrogate(int c) { 6457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (c & 0xfffff800) == 0xd800; 6467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Propose widening the UTF16 method. 6497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected static final boolean isLeadSurrogate(int c) { 6507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (c & 0xfffffc00) == 0xd800; 6517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Propose widening the UTF16 method. 6547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected static final boolean isTrailSurrogate(int c) { 6557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (c & 0xfffffc00) == 0xdc00; 6567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Main lookup trie of the data object. 6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected final Trie2_32 trie; 6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected final CollationData data; 6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final long nextCEFromCE32(CollationData d, int c, int ce32) { 6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --ceBuffer.length; // Undo ceBuffer.incLength(). 6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendCEsFromCE32(d, c, ce32, true); 6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.get(cesIndex++); 6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final int getCE32FromPrefix(CollationData d, int ce32) { 6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = Collation.indexFromCE32(ce32); 6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getCE32FromContexts(index); // Default if no prefix match. 6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index += 2; 6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Number of code points read before the original code point. 6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lookBehind = 0; 6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie prefixes = new CharsTrie(d.contexts, index); 6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = previousCodePoint(); 6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c < 0) { break; } 6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++lookBehind; 6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BytesTrie.Result match = prefixes.nextForCodePoint(c); 6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(match.hasValue()) { 6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = prefixes.getValue(); 6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!match.hasNext()) { break; } 6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert forwardNumCodePoints(lookBehind); 6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final int nextSkippedCodePoint() { 6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped != null && skipped.hasNext()) { return skipped.next(); } 6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numCpFwd == 0) { return Collation.SENTINEL_CP; } 6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = nextCodePoint(); 6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); } 6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numCpFwd > 0 && c >= 0) { --numCpFwd; } 6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final void backwardNumSkipped(int n) { 6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped != null && !skipped.isEmpty()) { 7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert n = skipped.backwardNumCodePoints(n); 7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumCodePoints(n); 7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numCpFwd >= 0) { numCpFwd += n; } 7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final int nextCE32FromContraction( 7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationData d, int contractionCE32, 7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharSequence trieChars, int trieOffset, int ce32, int c) { 7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c: next code point after the original one 7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Number of code points read beyond the original code point. 7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Needed for discontiguous contraction matching. 7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lookAhead = 1; 7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Number of code points read since the last match (initially only c). 7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sinceMatch = 1; 7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Normally we only need a contiguous match, 7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and therefore need not remember the suffixes state from before a mismatch for retrying. 7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we are already processing skipped combining marks, then we do track the state. 7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie suffixes = new CharsTrie(trieChars, trieOffset); 7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BytesTrie.Result match = suffixes.firstForCodePoint(c); 7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 7237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextCp; 7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(match.hasValue()) { 7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = suffixes.getValue(); 7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) { 7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sinceMatch = 1; 7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) { 7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text. 7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Back up if necessary, and try a discontiguous contraction. 7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 && 7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Discontiguous contraction matching extends an existing match. 7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If there is no match yet, then there is nothing to do. 7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 || 7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sinceMatch < lookAhead)) { 7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The last character of at least one suffix has lccc!=0, 7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // allowing for discontiguous contractions. 7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UCA S2.1.1 only processes non-starters immediately following 7427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // "a match in the table" (sinceMatch=1). 7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(sinceMatch > 1) { 7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Return to the state after the last match. 7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (Return to sinceMatch=0 and re-fetch the first partially-matched character.) 7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumSkipped(sinceMatch); 7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = nextSkippedCodePoint(); 7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lookAhead -= sinceMatch - 1; 7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sinceMatch = 1; 7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(d.getFCD16(c) > 0xff) { 7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return nextCE32FromDiscontiguousContraction( 7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d, suffixes, ce32, lookAhead, c); 7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Continue after partial match (BytesTrie.Result.NO_VALUE) for c. 7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // It does not have a result value, therefore it is not itself "a match in the table". 7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If a partially-matched c has ccc!=0 then 7617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // it might be skipped in discontiguous contraction. 7627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = nextCp; 7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++sinceMatch; 7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++lookAhead; 7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert match = suffixes.nextForCodePoint(c); 7677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumSkipped(sinceMatch); 7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final int nextCE32FromDiscontiguousContraction( 7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationData d, CharsTrie suffixes, int ce32, 7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lookAhead, int c) { 7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UCA section 3.3.2 Contractions: 7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Contractions that end with non-starter characters 7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // are known as discontiguous contractions. 7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ... discontiguous contractions must be detected in input text 7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // whenever the final sequence of non-starter characters could be rearranged 7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // so as to make a contiguous matching sequence that is canonically equivalent. 7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UCA: http://www.unicode.org/reports/tr10/#S2.1 7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // S2.1 Find the longest initial substring S at each point that has a match in the table. 7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // S2.1.1 If there are any non-starters following S, process each non-starter C. 7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // S2.1.2 If C is not blocked from S, find if S + C has a match in the table. 7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: A non-starter in a string is called blocked 7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if there is another non-starter of the same canonical combining class or zero 7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // between it and the last character of canonical combining class 0. 7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // S2.1.3 If there is a match, replace S by S + C, and remove C. 7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // First: Is a discontiguous contraction even possible? 7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16 = d.getFCD16(c); 7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(fcd16 > 0xff); // The caller checked this already, as a shortcut. 7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextCp = nextSkippedCodePoint(); 7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextCp < 0) { 7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No further text. 7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumSkipped(1); 7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++lookAhead; 8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevCC = fcd16 & 0xff; 8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fcd16 = d.getFCD16(nextCp); 8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fcd16 <= 0xff) { 8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The next code point after c is a starter (S2.1.1 "process each non-starter"). 8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumSkipped(2); 8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We have read and matched (lookAhead-2) code points, 8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // read non-matching c and peeked ahead at nextCp. 8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Return to the state before the mismatch and continue matching with nextCp. 8127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped == null || skipped.isEmpty()) { 8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(skipped == null) { 8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped = new SkippedState(); 8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert suffixes.reset(); 8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(lookAhead > 2) { 8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Replay the partial match so far. 8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumCodePoints(lookAhead); 8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert suffixes.firstForCodePoint(nextCodePoint()); 8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 3; i < lookAhead; ++i) { 8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert suffixes.nextForCodePoint(nextCodePoint()); 8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip c (which did not match) and nextCp (which we will try now). 8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert forwardNumCodePoints(2); 8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.saveTrieState(suffixes); 8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reset to the trie state before the failed match of c. 8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.resetToTrieState(suffixes); 8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.setFirstSkipped(c); 8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Number of code points read since the last match (at this point: c and nextCp). 8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sinceMatch = 2; 8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = nextCp; 8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 8387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BytesTrie.Result match; 8397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2) 8407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) { 8417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // "If there is a match, replace S by S + C, and remove C." (S2.1.3) 8427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Keep prevCC unchanged. 8437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = suffixes.getValue(); 8447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sinceMatch = 0; 8457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.recordMatch(); 8467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!match.hasNext()) { break; } 8477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.saveTrieState(suffixes); 8487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No match for "S + C", skip C. 8507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.skip(c); 8517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.resetToTrieState(suffixes); 8527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC = fcd16 & 0xff; 8537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((c = nextSkippedCodePoint()) < 0) { break; } 8557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++sinceMatch; 8567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fcd16 = d.getFCD16(c); 8577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fcd16 <= 0xff) { 8587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The next code point after c is a starter (S2.1.1 "process each non-starter"). 8597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 8607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumSkipped(sinceMatch); 8637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isTopDiscontiguous = skipped.isEmpty(); 8647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.replaceMatch(); 8657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTopDiscontiguous && !skipped.isEmpty()) { 8667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We did get a match after skipping one or more combining marks, 8677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and we are not in a recursive discontiguous contraction. 8687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Append CEs from the contraction ce32 8697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and then from the combining marks that we skipped before the match. 8707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = Collation.SENTINEL_CP; 8717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 8727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendCEsFromCE32(d, c, ce32, true); 8737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch CE32s for skipped combining marks from the normal data, with fallback, 8747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // rather than from the CollationData where we found the contraction. 8757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!skipped.hasNext()) { break; } 8767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = skipped.next(); 8777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = getDataCE32(c); 8787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.FALLBACK_CE32) { 8797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = data.base; 8807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getCE32(c); 8817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = data; 8837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: A nested discontiguous-contraction match 8857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // replaces consumed combining marks with newly skipped ones 8867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and resets the reading position to the beginning. 8877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert skipped.clear(); 8897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = Collation.NO_CE32; // Signal to the caller that the result is in the ceBuffer. 8907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 8927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the previous CE when data.isUnsafeBackward(c, isNumeric). 8967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final long previousCEUnsafe(int c, UVector32 offsets) { 8987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We just move through the input counting safe and unsafe code points 8997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // without collecting the unsafe-backward substring into a buffer and 9007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // switching to it. 9017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This is to keep the logic simple. Otherwise we would have to handle 9027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // prefix matching going before the backward buffer, switching 9037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to iteration and back, etc. 9047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In the most important case of iterating over a normal string, 9057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // reading from the string itself is already maximally fast. 9067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The only drawback there is that after getting the CEs we always 9077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // skip backward to the safe character rather than switching out 9087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // of a backwardBuffer. 9097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // But this should not be the common case for previousCE(), 9107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and correctness and maintainability are more important than 9117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // complex optimizations. 9127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the first safe character before c. 9137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numBackward = 1; 9147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((c = previousCodePoint()) >= 0) { 9157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++numBackward; 9167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!data.isUnsafeBackward(c, isNumeric)) { 9177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 9187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set the forward iteration limit. 9217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: This counts code points. 9227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We cannot enforce a limit in the middle of a surrogate pair or similar. 9237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numCpFwd = numBackward; 9247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reset the forward iterator. 9257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesIndex = 0; 9267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(ceBuffer.length == 0); 9277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Go forward and collect the CEs. 9287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int offset = getOffset(); 9297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(numCpFwd > 0) { 9307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // nextCE() normally reads one code point. 9317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Contraction matching and digit specials read more and check numCpFwd. 9327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --numCpFwd; 9337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Append one or more CEs to the ceBuffer. 9347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextCE(); 9357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE); 9367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No need to loop for getting each expansion CE from nextCE(). 9377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesIndex = ceBuffer.length; 9387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // However, we need to write an offset for each CE. 9397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This is for CollationElementIterator.getOffset() to return 9407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // intermediate offsets from the unsafe-backwards segment. 9417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(offsets.size() < ceBuffer.length); 9427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offsets.addElement(offset); 9437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For an expansion, the offset of each non-initial CE is the limit offset, 9447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // consistent with forward iteration. 9457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset = getOffset(); 9467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(offsets.size() < ceBuffer.length) { 9477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offsets.addElement(offset); 9487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 9497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(offsets.size() == ceBuffer.length); 9517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // End offset corresponding to just after the unsafe-backwards segment. 9527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offsets.addElement(offset); 9537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reset the forward iteration limit 9547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and move backward to before the segment for which we fetched CEs. 9557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numCpFwd = -1; 9567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumCodePoints(numBackward); 9577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use the collected CEs and return the last one. 9587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented. 9597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ceBuffer.get(--ceBuffer.length); 9607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Turns a string of digits (bytes 0..9) 9647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into a sequence of CEs that will sort in numeric order. 9657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 9667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Starts from this ce32's digit value and consumes the following/preceding digits. 9677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The digits string must not be empty and must not have leading zeros. 9687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final void appendNumericCEs(int ce32, boolean forward) { 9707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Collect digits. 9717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Use some kind of a byte buffer? We only store values 0..9. 9727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder digits = new StringBuilder(); 9737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(forward) { 9747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 9757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char digit = Collation.digitFromCE32(ce32); 9767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert digits.append(digit); 9777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numCpFwd == 0) { break; } 9787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = nextCodePoint(); 9797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c < 0) { break; } 9807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getCE32(c); 9817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.FALLBACK_CE32) { 9827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.base.getCE32(c); 9837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 9857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert backwardNumCodePoints(1); 9867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 9877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numCpFwd > 0) { --numCpFwd; } 9897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 9917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 9927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char digit = Collation.digitFromCE32(ce32); 9937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert digits.append(digit); 9947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = previousCodePoint(); 9957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c < 0) { break; } 9967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getCE32(c); 9977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.FALLBACK_CE32) { 9987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.base.getCE32(c); 9997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 10017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert forwardNumCodePoints(1); 10027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 10037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reverse the digit string. 10067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert digits.reverse(); 10077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pos = 0; 10097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 10107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip leading zeros. 10117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; } 10127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Write a sequence of CEs for at most 254 digits at a time. 10137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int segmentLength = digits.length() - pos; 10147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(segmentLength > 254) { segmentLength = 254; } 10157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength)); 10167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos += segmentLength; 10177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(pos < digits.length()); 10187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Turns 1..254 digits into a sequence of CEs. 10227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Called by appendNumericCEs() for each segment of at most 254 digits. 10237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final void appendNumericSegmentCEs(CharSequence digits) { 10257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = digits.length(); 10267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(1 <= length && length <= 254); 10277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(length == 1 || digits.charAt(0) != 0); 10287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long numericPrimary = data.numericPrimary; 10297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: We use primary byte values 2..255: digits are not compressible. 10307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length <= 7) { 10317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Very dense encoding for small numbers. 10327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int value = digits.charAt(0); 10337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 1; i < length; ++i) { 10347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert value = value * 10 + digits.charAt(i); 10357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Primary weight second byte values: 10377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 74 byte values 2.. 75 for small numbers in two-byte primary weights. 10387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 40 byte values 76..115 for medium numbers in three-byte primary weights. 10397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 16 byte values 116..131 for large numbers in four-byte primary weights. 10407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 124 byte values 132..255 for very large numbers with 4..127 digit pairs. 10417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstByte = 2; 10427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numBytes = 74; 10437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(value < numBytes) { 10447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Two-byte primary for 0..73, good for day & month numbers etc. 10457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long primary = numericPrimary | ((firstByte + value) << 16); 10467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.makeCE(primary)); 10477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 10487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert value -= numBytes; 10507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert firstByte += numBytes; 10517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numBytes = 40; 10527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(value < numBytes * 254) { 10537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more. 10547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long primary = numericPrimary | 10557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8); 10567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.makeCE(primary)); 10577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 10587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert value -= numBytes * 254; 10607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert firstByte += numBytes; 10617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert numBytes = 16; 10627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(value < numBytes * 254 * 254) { 10637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Four-byte primary for 10234..1042489=10234+16*254*254-1. 10647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long primary = numericPrimary | (2 + value % 254); 10657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert value /= 254; 10667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary |= (2 + value % 254) << 8; 10677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert value /= 254; 10687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary |= (firstByte + value % 254) << 16; 10697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.makeCE(primary)); 10707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 10717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // original value > 1042489 10737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(length >= 7); 10757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The second primary byte value 132..255 indicates the number of digit pairs (4..127), 10777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then we generate primary bytes with those pairs. 10787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Omit trailing 00 pairs. 10797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Decrement the value for the last pair. 10807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255. 10827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numPairs = (length + 1) / 2; 10837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long primary = numericPrimary | ((132 - 4 + numPairs) << 16); 10847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the length without trailing 00 pairs. 10857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) { 10867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert length -= 2; 10877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Read the first pair. 10897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pair; 10907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pos; 10917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((length & 1) != 0) { 10927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Only "half a pair" if we have an odd number of digits. 10937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pair = digits.charAt(0); 10947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = 1; 10957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 10967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pair = digits.charAt(0) * 10 + digits.charAt(1); 10977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos = 2; 10987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pair = 11 + 2 * pair; 11007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Add the pairs of digits between pos and length. 11017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int shift = 8; 11027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(pos < length) { 11037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(shift == 0) { 11047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Every three pairs/bytes we need to store a 4-byte-primary CE 11057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and start with a new CE with the '0' primary lead byte. 11067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary |= pair; 11077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.makeCE(primary)); 11087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary = numericPrimary; 11097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert shift = 16; 11107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary |= pair << shift; 11127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert shift -= 8; 11137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1)); 11157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pos += 2; 11167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primary |= (pair - 1) << shift; 11187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ceBuffer.append(Collation.makeCE(primary)); 11197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CEBuffer ceBuffer; 11227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int cesIndex; 11237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private SkippedState skipped; 11257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Number of code points to read forward, or -1. 11277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Used as a forward iteration limit in previousCEUnsafe(). 11287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int numCpFwd; 11297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Numeric collation (CollationSettings.NUMERIC). 11307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isNumeric; 11317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 1132