12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 5f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert* Copyright (C) 2010-2015, International Business Machines 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* CollationData.java, ported from collationdata.h/.cpp 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2010oct27 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Trie2_32; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UScript; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.Collator; 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet; 21f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubertimport com.ibm.icu.util.ICUException; 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Collation data container. 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Immutable data created by a CollationDataBuilder, or loaded from a file, 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or deserialized from API-provided binary data. 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Includes data for the collation base (root/default), aliased if this is not the base. 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class CollationData { 31f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Note: The ucadata.icu loader could discover the reserved ranges by setting an array 32f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // parallel with the ranges, and resetting ranges that are indexed. 33f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // The reordering builder code could clone the resulting template array. 34f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14; 35f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15; 36f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 37f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert static final int MAX_NUM_SPECIAL_REORDER_CODES = 8; 38f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationData(Normalizer2Impl nfc) { 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl = nfc; 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getCE32(int c) { 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return trie.get(c); 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int getCE32FromSupplementary(int c) { 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return trie.get(c); // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java? 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isDigit(int c) { 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c < 0x660 ? c <= 0x39 && 0x30 <= c : 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG); 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isUnsafeBackward(int c, boolean numeric) { 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return unsafeBackwardSet.contains(c) || (numeric && isDigit(c)); 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isCompressibleLeadByte(int b) { 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return compressibleBytes[b]; 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isCompressiblePrimary(long p) { 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return isCompressibleLeadByte((int)p >>> 24); 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the CE32 from two contexts words. 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Access to the defaultCE32 for contraction and prefix matching. 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int getCE32FromContexts(int index) { 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1); 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG). 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Requires that ce32 is special. 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int getIndirectCE32(int ce32) { 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(Collation.isSpecialCE32(ce32)); 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int tag = Collation.tagFromCE32(ce32); 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(tag == Collation.DIGIT_TAG) { 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the non-numeric-collation CE32. 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = ce32s[Collation.indexFromCE32(ce32)]; 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(tag == Collation.LEAD_SURROGATE_TAG) { 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = Collation.UNASSIGNED_CE32; 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(tag == Collation.U0000_TAG) { 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the normal ce32 for U+0000. 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = ce32s[0]; 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG), 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if ce32 is special. 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int getFinalCE32(int ce32) { 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Collation.isSpecialCE32(ce32)) { 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = getIndirectCE32(ce32); 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Computes a CE from c's ce32 which has the OFFSET_TAG. 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long getCEFromOffsetCE32(int c, int ce32) { 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long dataCE = ces[Collation.indexFromCE32(ce32)]; 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE)); 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the single CE that c maps to. 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Throws UnsupportedOperationException if c does not map to a single CE. 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long getSingleCE(int c) { 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationData d; 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce32 = getCE32(c); 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.FALLBACK_CE32) { 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = base; 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = base.getCE32(c); 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert d = this; 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(Collation.isSpecialCE32(ce32)) { 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch(Collation.tagFromCE32(ce32)) { 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LATIN_EXPANSION_TAG: 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.BUILDER_DATA_TAG: 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.PREFIX_TAG: 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.CONTRACTION_TAG: 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.HANGUL_TAG: 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LEAD_SURROGATE_TAG: 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException(String.format( 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c, ce32)); 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.FALLBACK_TAG: 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.RESERVED_TAG_3: 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new AssertionError(String.format( 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32)); 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LONG_PRIMARY_TAG: 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.ceFromLongPrimaryCE32(ce32); 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.LONG_SECONDARY_TAG: 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.ceFromLongSecondaryCE32(ce32); 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.EXPANSION32_TAG: 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Collation.lengthFromCE32(ce32) == 1) { 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException(String.format( 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c, ce32)); 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.EXPANSION_TAG: { 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Collation.lengthFromCE32(ce32) == 1) { 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return d.ces[Collation.indexFromCE32(ce32)]; 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException(String.format( 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c, ce32)); 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.DIGIT_TAG: 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the non-numeric-collation CE32 and continue. 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.U0000_TAG: 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(c == 0); 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetch the normal ce32 for U+0000 and continue. 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.ce32s[0]; 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.OFFSET_TAG: 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return d.getCEFromOffsetCE32(c, ce32); 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case Collation.IMPLICIT_TAG: 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.unassignedCEFromCodePoint(c); 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.ceFromSimpleCE32(ce32); 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the FCD16 value for code point c. c must be >= 0. 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int getFCD16(int c) { 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return nfcImpl.getFCD16(c); 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the first primary for the script's reordering group. 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the primary with only the first primary lead byte of the group 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (not necessarily an actual root collator primary weight), 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or 0 if the script is unknown 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long getFirstPrimaryForGroup(int script) { 196f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = getScriptIndex(script); 197f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return index == 0 ? 0 : (long)scriptStarts[index] << 16; 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the last primary for the script's reordering group. 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the last primary of the group 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (not an actual root collator primary weight), 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or 0 if the script is unknown 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public long getLastPrimaryForGroup(int script) { 207f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = getScriptIndex(script); 208f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(index == 0) { 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 211f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert long limit = scriptStarts[index + 1]; 212f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return (limit << 16) - 1; 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Finds the reordering group which contains the primary weight. 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the first script of the group, or -1 if the weight is beyond the last group 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getGroupForPrimary(long p) { 220f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert p >>= 16; 221f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) { 222f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return -1; 223f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 224f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = 1; 225f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert while(p >= scriptStarts[index + 1]) { ++index; } 226f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = 0; i < numScripts; ++i) { 227f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(scriptsIndex[i] == index) { 228f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return i; 229f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 230f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 231f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 232f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(scriptsIndex[numScripts + i] == index) { 233f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return Collator.ReorderCodes.FIRST + i; 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 239f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert private int getScriptIndex(int script) { 240f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(script < 0) { 241f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return 0; 242f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else if(script < numScripts) { 243f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return scriptsIndex[script]; 244f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else if(script < Collator.ReorderCodes.FIRST) { 245f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return 0; 246f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else { 247f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert script -= Collator.ReorderCodes.FIRST; 248f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(script < MAX_NUM_SPECIAL_REORDER_CODES) { 249f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return scriptsIndex[numScripts + script]; 250f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else { 251f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return 0; 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int[] getEquivalentScripts(int script) { 257f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = getScriptIndex(script); 258f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(index == 0) { return EMPTY_INT_ARRAY; } 259f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(script >= Collator.ReorderCodes.FIRST) { 260f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Special groups have no aliases. 261f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return new int[] { script }; 262f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 263f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 264f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int length = 0; 265f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = 0; i < numScripts; ++i) { 266f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(scriptsIndex[i] == index) { 267f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert ++length; 268f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 269f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 270f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int[] dest = new int[length]; 271f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(length == 1) { 272f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert dest[0] = script; 273f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return dest; 274f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 275f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert length = 0; 276f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = 0; i < numScripts; ++i) { 277f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(scriptsIndex[i] == index) { 278f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert dest[length++] = i; 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest; 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 285f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * Writes the permutation of primary-weight ranges 286f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * for the given reordering of scripts and groups. 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The caller checks for illegal arguments and 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * takes care of [DEFAULT] and memory allocation. 289f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * 290f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * <p>Each list element will be a (limit, offset) pair as described 291f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * for the CollationSettings.reorderRanges. 292f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * The list will be empty if no ranges are reordered. 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 294f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert void makeReorderRanges(int[] reorder, UVector32 ranges) { 295f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert makeReorderRanges(reorder, false, ranges); 296f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 297f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 298f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) { 299f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert ranges.removeAllElements(); 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = reorder.length; 301f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) { 302f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return; 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 305f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Maps each script-or-group range to a new lead byte. 306f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert short[] table = new short[scriptStarts.length - 1]; // C++: uint8_t[] 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 308f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert { 309f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Set "don't care" values for reserved ranges. 310f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = scriptsIndex[ 311f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST]; 312f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(index != 0) { 313f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert table[index] = 0xff; 314f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 315f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert index = scriptsIndex[ 316f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST]; 317f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(index != 0) { 318f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert table[index] = 0xff; 319f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 322f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Never reorder special low and high primary lead bytes. 323f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(scriptStarts.length >= 2); 324f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(scriptStarts[0] == 0); 325f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int lowStart = scriptStarts[1]; 326f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8)); 327f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int highLimit = scriptStarts[scriptStarts.length - 1]; 328f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8)); 329f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Get the set of special reorder codes in the input list. 331f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // This supports a fixed number of special reorder codes; 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // it works for data with codes beyond Collator.ReorderCodes.LIMIT. 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int specials = 0; 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 0; i < length; ++i) { 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST; 336f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert specials |= 1 << reorderCode; 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Start the reordering with the special low reorder codes that do not occur in the input. 342f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 343f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = scriptsIndex[numScripts + i]; 344f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(index != 0 && (specials & (1 << i)) == 0) { 345f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert lowStart = addLowScriptRange(table, index, lowStart); 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 349f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Skip the reserved range before Latin if Latin is the first script, 350f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // so that we do not move it unnecessarily. 351f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int skippedReserved = 0; 352f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) { 353f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = scriptsIndex[UScript.LATIN]; 354f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(index != 0); 355f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int start = scriptStarts[index]; 356f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(lowStart <= start); 357f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert skippedReserved = start - lowStart; 358f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert lowStart = start; 359f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 360f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 361f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Reorder according to the input scripts, continuing from the bottom of the primary range. 362f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert boolean hasReorderToEnd = false; 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 0; i < length;) { 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int script = reorder[i++]; 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(script == UScript.UNKNOWN) { 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Put the remaining scripts at the top. 367f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert hasReorderToEnd = true; 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(i < length) { 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert script = reorder[--length]; 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(script == UScript.UNKNOWN) { // Must occur at most once. 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "setReorderCodes(): duplicate UScript.UNKNOWN"); 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(script == Collator.ReorderCodes.DEFAULT) { 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "setReorderCodes(): UScript.DEFAULT together with other scripts"); 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 378f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = getScriptIndex(script); 379f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(index == 0) { continue; } 380f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(table[index] != 0) { // Duplicate or equivalent script. 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "setReorderCodes(): duplicate or equivalent script " + 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert scriptCodeString(script)); 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 385f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert highLimit = addHighScriptRange(table, index, highLimit); 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(script == Collator.ReorderCodes.DEFAULT) { 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The default code must be the only one in the list, and that is handled by the caller. 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Otherwise it must not be used. 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "setReorderCodes(): UScript.DEFAULT together with other scripts"); 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 395f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = getScriptIndex(script); 396f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(index == 0) { continue; } 397f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(table[index] != 0) { // Duplicate or equivalent script. 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "setReorderCodes(): duplicate or equivalent script " + 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert scriptCodeString(script)); 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 402f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert lowStart = addLowScriptRange(table, index, lowStart); 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Put all remaining scripts into the middle. 406f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = 1; i < scriptStarts.length - 1; ++i) { 407f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int leadByte = table[i]; 408f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(leadByte != 0) { continue; } 409f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int start = scriptStarts[i]; 410f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(!hasReorderToEnd && start > lowStart) { 411f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // No need to move this script. 412f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert lowStart = start; 413f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 414f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert lowStart = addLowScriptRange(table, i, lowStart); 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 416f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(lowStart > highLimit) { 417f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { 418f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Try not skipping the before-Latin reserved range. 419f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert makeReorderRanges(reorder, true, ranges); 420f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return; 421f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 422f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // We need more primary lead bytes than available, despite the reserved ranges. 423f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert throw new ICUException( 424f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert "setReorderCodes(): reordering too many partial-primary-lead-byte scripts"); 425f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 426f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 427f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Turn lead bytes into a list of (limit, offset) pairs. 428f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Encode each pair in one list element: 429f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Upper 16 bits = limit, lower 16 = signed lead byte offset. 430f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int offset = 0; 431f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = 1;; ++i) { 432f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int nextOffset = offset; 433f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert while(i < scriptStarts.length - 1) { 434f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int newLeadByte = table[i]; 435f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(newLeadByte == 0xff) { 436f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // "Don't care" lead byte for reserved range, continue with current offset. 437f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else { 438f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert nextOffset = newLeadByte - (scriptStarts[i] >> 8); 439f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(nextOffset != offset) { break; } 440f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 441f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert ++i; 442f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 443f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(offset != 0 || i < scriptStarts.length - 1) { 444f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff)); 445f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 446f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(i == scriptStarts.length - 1) { break; } 447f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert offset = nextOffset; 448f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 449f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 450f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 451f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert private int addLowScriptRange(short[] table, int index, int lowStart) { 452f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int start = scriptStarts[index]; 453f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if((start & 0xff) < (lowStart & 0xff)) { 454f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert lowStart += 0x100; 455f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 456f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert table[index] = (short)(lowStart >> 8); 457f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int limit = scriptStarts[index + 1]; 458f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff); 459f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return lowStart; 460f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 461f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 462f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert private int addHighScriptRange(short[] table, int index, int highLimit) { 463f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int limit = scriptStarts[index + 1]; 464f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if((limit & 0xff) > (highLimit & 0xff)) { 465f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert highLimit -= 0x100; 466f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 467f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int start = scriptStarts[index]; 468f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff); 469f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert table[index] = (short)(highLimit >> 8); 470f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return highLimit; 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static String scriptCodeString(int script) { 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not use the script name here: We do not want to depend on that data. 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (script < Collator.ReorderCodes.FIRST) ? 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Integer.toString(script) : "0x" + Integer.toHexString(script); 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int[] EMPTY_INT_ARRAY = new int[0]; 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** @see jamoCE32s */ 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static final int JAMO_CE32S_LENGTH = 19 + 21 + 27; 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Main lookup trie. */ 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2_32 trie; 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Array of CE32 values. 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * At index 0 there must be CE32(U+0000) 4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * to support U+0000's special-tag for NUL-termination handling. 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] ce32s; 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Array of CE values for expansions and OFFSET_TAG. */ 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] ces; 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Array of prefix and contraction-suffix matching data. */ 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String contexts; 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Base collation data, or null if this data itself is a base. */ 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CollationData base; 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T. 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * They are normally simple CE32s, rarely expansions. 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For fast handling of HANGUL_TAG. 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] jamoCE32s = new int[JAMO_CE32S_LENGTH]; 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Normalizer2Impl nfcImpl; 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** The single-byte primary weight (xx000000) for numeric collation. */ 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long numericPrimary = 0x12000000; 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 256 flags for which primary-weight lead bytes are compressible. */ 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean[] compressibleBytes; 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set of code points that are unsafe for starting string comparison after an identical prefix, 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or in backwards CE iteration. 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSet unsafeBackwardSet; 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Fast Latin table for common-Latin-text string comparisons. 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Data structure see class CollationFastLatin. 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public char[] fastLatinTable; 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Header portion of the fastLatinTable. 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * In C++, these are one array, and the header is skipped for mapping characters. 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * In Java, two arrays work better. 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char[] fastLatinTableHeader; 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Data for scripts and reordering groups. 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Uses include building a reordering permutation table and 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * providing script boundaries to AlphabeticIndex. 532f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert */ 533f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int numScripts; 534f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert /** 535f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * The length of scriptsIndex is numScripts+16. 536f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts. 537f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * 16 special reorder codes (not all used) are mapped starting at numScripts. 538f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit. 539f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * There are special codes at the end for reorder-reserved primary ranges. 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 541f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * <p>Multiple scripts may share a range and index, for example Hira & Kana. 542f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert */ 543f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert char[] scriptsIndex; 544f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert /** 545f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * Start primary weight (top 16 bits only) for a group/script/reserved range 546f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * indexed by scriptsIndex. 547f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * The first range (separators & terminators) and the last range (trailing weights) 548f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * are not reorderable, and no scriptsIndex entry points to them. 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 550f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert char[] scriptStarts; 5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Collation elements in the root collator. 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Used by the CollationRootElements class. The data structure is described there. 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * null in a tailoring. 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public long[] rootElements; 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 559