17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 2000-2014, International Business Machines Corporation and 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved. 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text; 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.nio.CharBuffer; 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.text.CharacterIterator; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Norm2AllModes; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.UCaseProps; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacter; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ICUCloneNotSupportedException; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unicode Normalization 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <h2>Unicode normalization API</h2> 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code>normalize</code> transforms Unicode text into an equivalent composed or 
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * decomposed form, allowing for easier sorting and searching of text. 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code>normalize</code> supports the standard normalization forms described in 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unicode Standard Annex #15 — Unicode Normalization Forms</a>. 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Characters with accents or other adornments can be encoded in 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * several different ways in Unicode. For example, take the character A-acute. 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * In Unicode, this can be encoded as a single character (the 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * "composed" form): 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre> 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 00C1 LATIN CAPITAL LETTER A WITH ACUTE 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </pre> 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or as two separate characters (the "decomposed" form): 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre> 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 0041 LATIN CAPITAL LETTER A 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 0301 COMBINING ACUTE ACCENT 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </pre> 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * To a 
user of your program, however, both of these sequences should be 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * treated as the same "user-level" character "A with acute accent". When you 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are searching or comparing text, you must ensure that these two sequences are 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * treated equivalently. In addition, you must handle characters with more than 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * one accent. Sometimes the order of a character's combining accents is 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * significant, while in other cases accent sequences in different orders are 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * really equivalent. 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Similarly, the string "ffi" can be encoded as three separate letters: 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre> 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 0066 LATIN SMALL LETTER F 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 0066 LATIN SMALL LETTER F 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 0069 LATIN SMALL LETTER I 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </pre> 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or as the single character 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre> 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * FB03 LATIN SMALL LIGATURE FFI 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </pre> 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert * The ffi ligature is not a distinct semantic character, and strictly speaking 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * it shouldn't be in Unicode at all, but it was included for compatibility 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * with existing character sets that already provided it. The Unicode standard 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * identifies such characters by giving them "compatibility" decompositions 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into the corresponding semantic characters. When sorting and searching, you 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * will often want to use these mappings. 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code>normalize</code> helps solve these problems by transforming text into 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the canonical composed and decomposed forms as shown in the first example 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * above. In addition, you can have it perform compatibility decompositions so 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * that you can treat compatibility characters the same as their equivalents. 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Finally, <code>normalize</code> rearranges accents into the proper canonical 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * order, so that you do not have to worry about accent rearrangement on your 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * own. 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Form FCD, "Fast C or D", is also designed for collation. 
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It allows to work on strings that are not necessarily normalized 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * with an algorithm (like in collation) that works under "canonical closure", 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * i.e., it treats precomposed characters and their decomposed equivalents the 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * same. 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is not a normalization form because it does not provide for uniqueness of 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * representation. Multiple strings may be canonically equivalent (their NFDs 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are identical) and may all conform to FCD without being identical themselves. 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The form is defined such that the "raw decomposition", the recursive 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * canonical decomposition of each character, results in a string that is 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * canonically ordered. This means that precomposed characters are allowed for 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as long as their decompositions do not need canonical reordering. 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Its advantage for a process like collation is that all NFD and most NFC texts 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - and many unnormalized texts - already conform to FCD and do not need to be 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalized (NFD) for such a process. 
The FCD quick check will return YES for 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * most strings in practice. 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalize(FCD) may be implemented with NFD. 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * http://www.unicode.org/notes/tn5/#FCD 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ICU collation performs either NFD or FCD normalization automatically if 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalization is turned on for the collator object. Beyond collation and 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * string search, normalized strings may be useful for string equivalence 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * comparisons, transliteration/transcription, unique representations, etc. 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The W3C generally recommends to exchange texts in NFC. 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Note also that most legacy character encodings use only precomposed forms and 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * often do not encode any combining marks by themselves. For conversion to such 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * character encodings the Unicode text needs to be normalized to NFC. 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For more usage examples, see the Unicode Standard Annex. 
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Note: The Normalizer class also provides API for iterative normalization. 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * While the setIndex() and getIndex() refer to indices in the 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * underlying Unicode input text, the next() and previous() methods 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * iterate through characters in the normalized output. 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This means that there is not necessarily a one-to-one correspondence 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * between characters returned by next() and previous() and the indices 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * passed to and returned from setIndex() and getIndex(). 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is for this reason that Normalizer does not implement the CharacterIterator interface. 
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class Normalizer implements Cloneable { 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The input text and our position in it 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UCharacterIterator text; 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Normalizer2 norm2; 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Mode mode; 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int options; 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The normalization buffer is the result of normalization 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // of the source in [currentIndex..nextIndex[ . 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int currentIndex; 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int nextIndex; 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // A buffer for holding intermediate results 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private StringBuilder buffer; 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int bufferPos; 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Helper classes to defer loading of normalization data. 
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class ModeImpl { 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private ModeImpl(Normalizer2 n2) { 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normalizer2 = n2; 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final Normalizer2 normalizer2; 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFDModeImpl { 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFKDModeImpl { 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFCModeImpl { 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFKCModeImpl { 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class FCDModeImpl { 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2()); 
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class Unicode32 { 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFD32ModeImpl { 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Unicode32.INSTANCE)); 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFKD32ModeImpl { 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Unicode32.INSTANCE)); 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFC32ModeImpl { 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Unicode32.INSTANCE)); 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFKC32ModeImpl { 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
private static final ModeImpl INSTANCE = 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Unicode32.INSTANCE)); 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class FCD32ModeImpl { 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final ModeImpl INSTANCE = 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(), 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Unicode32.INSTANCE)); 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Options bit set value to select Unicode 3.2 normalization 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (except NormalizationCorrections). 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * At most one Unicode version can be selected at a time. 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int UNICODE_3_2=0x20; 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Constant indicating that the end of the iteration has been reached. 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. 
2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int DONE = UCharacterIterator.DONE; 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Constants for normalization modes. 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The Mode class is not intended for public subclassing. 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Only the Mode constants provided by the Normalizer class should be used, 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and any fields or methods should not be called or overridden by users. 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static abstract class Mode { 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sole constructor 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @internal 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated This API is ICU internal only. 
2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Mode() { 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @internal 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated This API is ICU internal only. 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected abstract Normalizer2 getNormalizer2(int options); 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NONEMode extends Mode { 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFDMode extends Mode { 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Normalizer2 getNormalizer2(int options) { 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (options&UNICODE_3_2) != 0 ? 
2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2; 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFKDMode extends Mode { 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Normalizer2 getNormalizer2(int options) { 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (options&UNICODE_3_2) != 0 ? 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2; 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFCMode extends Mode { 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Normalizer2 getNormalizer2(int options) { 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (options&UNICODE_3_2) != 0 ? 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2; 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class NFKCMode extends Mode { 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Normalizer2 getNormalizer2(int options) { 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (options&UNICODE_3_2) != 0 ? 
2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2; 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class FCDMode extends Mode { 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected Normalizer2 getNormalizer2(int options) { 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (options&UNICODE_3_2) != 0 ? 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2; 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * No decomposition/composition. 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode NONE = new NONEMode(); 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Canonical decomposition. 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode NFD = new NFDMode(); 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compatibility decomposition. 
2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode NFKD = new NFKDMode(); 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Canonical decomposition followed by canonical composition. 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode NFC = new NFCMode(); 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Default normalization. 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode DEFAULT = NFC; 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compatibility decomposition followed by canonical composition. 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode NFKC =new NFKCMode(); 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * "Fast C or D" form. 
3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode FCD = new FCDMode(); 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors} 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and the static {@link #normalize normalize} method. This value tells 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the <tt>Normalizer</tt> to do nothing but return unprocessed characters 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * from the underlying String or CharacterIterator. If you have code which 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * requires raw text at some times and normalized text at others, you can 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * use <tt>NO_OP</tt> for the cases where you want raw text, rather 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * than having a separate code path that bypasses <tt>Normalizer</tt> 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * altogether. 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setMode 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.8. 
Use Normalizer.NONE 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #NONE 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode NO_OP = NONE; 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Canonical decomposition followed by canonical composition. Used with the 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link com.ibm.icu.text.Normalizer constructors} and the static 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #normalize normalize} method to determine the operation to be 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * performed. 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * off, this operation produces output that is in 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Form</a> 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>C</b>. 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setMode 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.8.
Use Normalier.NFC 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #NFC 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode COMPOSE = NFC; 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compatibility decomposition followed by canonical composition. 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #normalize normalize} method to determine the operation to be 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * performed. 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * off, this operation produces output that is in 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Form</a> 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>KC</b>. 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setMode 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.8. 
Use Normalizer.NFKC 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #NFKC 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode COMPOSE_COMPAT = NFKC; 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Canonical decomposition. This value is passed to the 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link com.ibm.icu.text.Normalizer constructors} and the static 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #normalize normalize} 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * method to determine the operation to be performed. 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * off, this operation produces output that is in 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Form</a> 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>D</b>. 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setMode 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.8. 
Use Normalizer.NFD 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #NFD 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode DECOMP = NFD; 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compatibility decomposition. This value is passed to the 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link com.ibm.icu.text.Normalizer constructors} and the static 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #normalize normalize} 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * method to determine the operation to be performed. 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * off, this operation produces output that is in 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Form</a> 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>KD</b>. 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setMode 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.8. 
Use Normalizer.NFKD 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #NFKD 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final Mode DECOMP_COMPAT = NFKD; 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Option to disable Hangul/Jamo composition and decomposition. 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This option applies to Korean text, 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * which can be represented either in the Jamo alphabet or in Hangul 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * characters, which are really just two or three Jamo combined 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into one visual glyph. Since Jamo takes up more storage space than 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Hangul, applications that process only Hangul text may wish to turn 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * this option on when decomposing text. 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The Unicode standard treates Hangul to Jamo conversion as a 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * canonical decomposition, so this option must be turned <b>off</b> if you 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * wish to transform strings into one of the standard 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unicode Normalization Forms</a>. 
4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setOption 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.8. This option is no longer supported. 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IGNORE_HANGUL = 0x0001; 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Result values for quickCheck(). 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For details see Unicode Technical Report 15. 4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final class QuickCheckResult{ 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //private int resultValue; 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private QuickCheckResult(int value) { 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //resultValue=value; 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Indicates that string is not in the normalized format 4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final QuickCheckResult NO = new QuickCheckResult(0); 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Indicates that string is in the normalized format 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final QuickCheckResult YES = new QuickCheckResult(1); 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Indicates it cannot be determined if string is in the normalized 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * format without further thorough checks. 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final QuickCheckResult MAYBE = new QuickCheckResult(2); 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Option bit for compare: 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Case sensitively compare the strings 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT; 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Option bit for compare: 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Both input strings are assumed to fulfill FCD conditions. 
4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int INPUT_IS_FCD = 0x20000; 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Option bit for compare: 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Perform case-insensitive comparison. 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMPARE_IGNORE_CASE = 0x10000; 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Option bit for compare: 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compare strings in code point order instead of code unit order. 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMPARE_CODE_POINT_ORDER = 0x8000; 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Option value for case folding: 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and dotless i appropriately for Turkic languages (tr, az). 
4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I; 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Lowest-order bit number of compare() options bits corresponding to 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalization options bits. 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The options parameter for compare() uses most bits for 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * itself and for various comparison and folding flags. 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The most significant bits, however, are shifted down and passed on 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * to the normalization implementation. 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (That is, from compare(..., options, ...), 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * internal normalization functions.) 
5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #compare 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMPARE_NORM_OPTIONS_SHIFT = 20; 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //------------------------------------------------------------------------- 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Iterator constructors 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //------------------------------------------------------------------------- 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Creates a new <tt>Normalizer</tt> object for iterating over the 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalized form of a given string. 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The <tt>options</tt> parameter specifies which optional 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>Normalizer</tt> features are to be enabled for this object. 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str The string to be normalized. The normalization 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * will start at the beginning of the string. 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode. 
5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param opt Any optional features to be enabled. 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Currently the only available option is {@link #UNICODE_3_2}. 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If you want the default behavior corresponding to one of the 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * standard Unicode Normalization Forms, use 0 for this argument. 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Normalizer(String str, Mode mode, int opt) { 5347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.text = UCharacterIterator.getInstance(str); 5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.mode = mode; 5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.options=opt; 5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm2 = mode.getNormalizer2(opt); 5387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer = new StringBuilder(); 5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Creates a new <tt>Normalizer</tt> object for iterating over the 5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalized form of the given text. 5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param iter The input text to be normalized. The normalization 5467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * will start at the beginning of the string. 
5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode. 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param opt Any optional features to be enabled. 5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Currently the only available option is {@link #UNICODE_3_2}. 5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If you want the default behavior corresponding to one of the 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * standard Unicode Normalization Forms, use 0 for this argument. 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Normalizer(CharacterIterator iter, Mode mode, int opt) { 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.mode = mode; 5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.options = opt; 5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm2 = mode.getNormalizer2(opt); 5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer = new StringBuilder(); 5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Creates a new <tt>Normalizer</tt> object for iterating over the 5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalized form of the given text. 
5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 5687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param iter The input text to be normalized. The normalization 5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * will start at the beginning of the string. 5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode. 5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 5737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Normalizer(UCharacterIterator iter, Mode mode, int options) { 5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.text = (UCharacterIterator)iter.clone(); 5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.mode = mode; 5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.options = options; 5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm2 = mode.getNormalizer2(options); 5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer = new StringBuilder(); 5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch (CloneNotSupportedException e) { 5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUCloneNotSupportedException(e); 5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Clones this <tt>Normalizer</tt> object. 
All properties of this 5897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * object are duplicated in the new object, including the cloning of any 5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link CharacterIterator} that was passed in to the constructor 5917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or to {@link #setText(CharacterIterator) setText}. 5927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * However, the text storage underlying 5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the <tt>CharacterIterator</tt> is not duplicated unless the 5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * iterator's <tt>clone</tt> method does so. 5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Object clone() { 5987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Normalizer copy = (Normalizer) super.clone(); 6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.text = (UCharacterIterator) text.clone(); 6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.mode = mode; 6027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.options = options; 6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.norm2 = norm2; 6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.buffer = new StringBuilder(buffer); 6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.bufferPos = bufferPos; 6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.currentIndex = currentIndex; 6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert copy.nextIndex = nextIndex; 6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return copy; 6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert catch (CloneNotSupportedException e) { 6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUCloneNotSupportedException(e); 6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //-------------------------------------------------------------------------- 6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Static Utility methods 6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //-------------------------------------------------------------------------- 6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) { 6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (compat ? NFKC : NFC).getNormalizer2(options); 6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) { 6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (compat ? NFKD : NFD).getNormalizer2(options); 6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compose a string. 6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be composed to according to the specified mode. 6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str The string to compose. 
6307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the string will be composed according to 6317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFKC rules and if false will be composed according to 6327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFC rules. 6337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String The composed string 6347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 6357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String compose(String str, boolean compat) { 6377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return compose(str,compat,0); 6387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compose a string. 6427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be composed to according to the specified mode. 6437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str The string to compose. 6447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the string will be composed according to 6457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFKC rules and if false will be composed according to 6467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFC rules. 
6477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The only recognized option is UNICODE_3_2 6487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String The composed string 6497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 6507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String compose(String str, boolean compat, int options) { 6527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return getComposeNormalizer2(compat, options).normalize(str); 6537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compose a string. 6577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be composed to according to the specified mode. 6587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source The char array to compose. 6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target A char buffer to receive the normalized text. 6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the char array will be composed according to 6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFKC rules and if false will be composed according to 6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFC rules. 6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return int The total buffer size needed;if greater than length of 6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * result, the output was truncated. 
6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException if target.length is less than the 6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * required length 6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int compose(char[] source,char[] target, boolean compat, int options) { 6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return compose(source, 0, source.length, target, 0, target.length, compat, options); 6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compose a string. 6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be composed to according to the specified mode. 6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param src The char array to compose. 6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param srcStart Start index of the source 6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param srcLimit Limit index of the source 6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param dest The char buffer to fill in 6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destStart Start index of the destination buffer 6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destLimit End index of the destination buffer 6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the char array will be composed according to 6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFKC rules and if false will be composed according to 6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFC rules. 
6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return int The total buffer size needed;if greater than length of 6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * result, the output was truncated. 6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException if target.length is less than the 6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * required length 6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int compose(char[] src,int srcStart, int srcLimit, 6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char[] dest,int destStart, int destLimit, 6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean compat, int options) { 6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 6977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getComposeNormalizer2(compat, options).normalize(srcBuffer, app); 6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return app.length(); 7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decompose a string. 7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be decomposed to according to the specified mode. 7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str The string to decompose. 
7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the string will be decomposed according to NFKD 7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules and if false will be decomposed according to NFD 7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules. 7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String The decomposed string 7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String decompose(String str, boolean compat) { 7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return decompose(str,compat,0); 7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decompose a string. 7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be decomposed to according to the specified mode. 7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str The string to decompose. 7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the string will be decomposed according to NFKD 7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules and if false will be decomposed according to NFD 7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules. 7237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 
7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String The decomposed string 7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String decompose(String str, boolean compat, int options) { 7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return getDecomposeNormalizer2(compat, options).normalize(str); 7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decompose a string. 7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be decomposed to according to the specified mode. 7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source The char array to decompose. 7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target A char buffer to receive the normalized text. 7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the char array will be decomposed according to NFKD 7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules and if false will be decomposed according to 7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFD rules. 7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return int The total buffer size needed;if greater than length of 7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * result,the output was truncated. 7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 
7427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException if the target capacity is less than 7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the required length 7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int decompose(char[] source,char[] target, boolean compat, int options) { 7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return decompose(source, 0, source.length, target, 0, target.length, compat, options); 7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decompose a string. 7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be decomposed to according to the specified mode. 7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param src The char array to compose. 7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param srcStart Start index of the source 7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param srcLimit Limit index of the source 7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param dest The char buffer to fill in 7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destStart Start index of the destination buffer 7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destLimit End index of the destination buffer 7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param compat If true the char array will be decomposed according to NFKD 7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules and if false will be decomposed according to 7617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFD rules. 
7627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return int The total buffer size needed;if greater than length of 7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * result,the output was truncated. 7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException if the target capacity is less than 7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the required length 7677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 7687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int decompose(char[] src,int srcStart, int srcLimit, 7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char[] dest,int destStart, int destLimit, 7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean compat, int options) { 7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app); 7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return app.length(); 7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizes a <tt>String</tt> using the given normalization operation. 
7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The <tt>options</tt> parameter specifies which optional 7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>Normalizer</tt> features are to be enabled for this operation. 7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Currently the only available option is {@link #UNICODE_3_2}. 7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If you want the default behavior corresponding to one of the standard 7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unicode Normalization Forms, use 0 for this argument. 7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str the input string to be normalized. 7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode the normalization mode 7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options the optional features to be enabled. 7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String the normalized string 7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String normalize(String str, Mode mode, int options) { 7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode.getNormalizer2(options).normalize(str); 7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalize a string. 
7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be normalized according to the specified normalization 8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * mode and options. 8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param src The string to normalize. 8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode; one of Normalizer.NONE, 8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFKD, Normalizer.DEFAULT 8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the normalized string 8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String normalize(String src,Mode mode) { 8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return normalize(src, mode, 0); 8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalize a string. 8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be normalized according to the specified normalization 8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * mode and options. 8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source The char array to normalize. 8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param target A char buffer to receive the normalized text. 
8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode; one of Normalizer.NONE, 8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFKD, Normalizer.DEFAULT 8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return int The total buffer size needed;if greater than length of 8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * result, the output was truncated. 8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException if the target capacity is less 8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * than the required length 8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int normalize(char[] source,char[] target, Mode mode, int options) { 8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return normalize(source,0,source.length,target,0,target.length,mode, options); 8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalize a string. 8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The string will be normalized according to the specified normalization 8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * mode and options. 8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param src The char array to compose. 
8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param srcStart Start index of the source 8387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param srcLimit Limit index of the source 8397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param dest The char buffer to fill in 8407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destStart Start index of the destination buffer 8417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destLimit End index of the destination buffer 8427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode; one of Normalizer.NONE, 8437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 8447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFKD, Normalizer.DEFAULT 8457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 8467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return int The total buffer size needed;if greater than length of 8477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * result, the output was truncated. 
8487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException if the target capacity is 8497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * less than the required length 8507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 8517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int normalize(char[] src,int srcStart, int srcLimit, 8537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char[] dest,int destStart, int destLimit, 8547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Mode mode, int options) { 8557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 8567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 8577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode.getNormalizer2(options).normalize(srcBuffer, app); 8587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return app.length(); 8597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalize a codepoint according to the given mode 8637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param char32 The input string to be normalized. 
8647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode 8657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options Options for use with exclusion set and tailored Normalization 8667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The only option that is currently recognized is UNICODE_3_2 8677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String The normalized string 8687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 8697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #UNICODE_3_2 8707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String normalize(int char32, Mode mode, int options) { 8727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(mode == NFD && options == 0) { 8737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32); 8747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(decomposition == null) { 8757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decomposition = UTF16.valueOf(char32); 8767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return decomposition; 8787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return normalize(UTF16.valueOf(char32), mode, options); 8807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Convenience method to normalize a codepoint according to the given mode 8847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param char32 The input string to be normalized. 
8857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode 8867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String The normalized string 8877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 8887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String normalize(int char32, Mode mode) { 8907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return normalize(char32, mode, 0); 8917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Convenience method. 8957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 8967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source string for determining if it is in a normalized format 8977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 8987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFKC,Normalizer.NFKD) 8997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return Return code to specify if the text is normalized or not 9007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 9017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 9027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static QuickCheckResult quickCheck(String source, Mode mode) { 9047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return quickCheck(source, mode, 0); 9057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
/** 9087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Performing quick check on a string, to quickly determine if the string is 9097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in a particular normalization format. 9107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Three types of result can be returned Normalizer.YES, Normalizer.NO or 9117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument 9127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * string is in the desired normalized format, Normalizer.NO determines that 9137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * argument string is not in the desired normalized format. A 9147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.MAYBE result indicates that a more thorough check is required, 9157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the user may have to put the string in its normalized form and compare 9167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the results. 
9177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 9187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source string for determining if it is in a normalized format 9197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 9207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFKC,Normalizer.NFKD) 9217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options Options for use with exclusion set and tailored Normalization 9227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The only option that is currently recognized is UNICODE_3_2 9237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return Return code to specify if the text is normalized or not 9247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 9257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 9267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static QuickCheckResult quickCheck(String source, Mode mode, int options) { 9287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode.getNormalizer2(options).quickCheck(source); 9297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Convenience method. 
9337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 9347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source Array of characters for determining if it is in a 9357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalized format 9367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 9377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFKC,Normalizer.NFKD) 9387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options Options for use with exclusion set and tailored Normalization 9397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The only option that is currently recognized is UNICODE_3_2 9407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return Return code to specify if the text is normalized or not 9417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 9427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 9437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) { 9457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return quickCheck(source, 0, source.length, mode, options); 9467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Performing quick check on a string, to quickly determine if the string is 9507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in a particular normalization format. 
9517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Three types of result can be returned Normalizer.YES, Normalizer.NO or 9527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument 9537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * string is in the desired normalized format, Normalizer.NO determines that 9547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * argument string is not in the desired normalized format. A 9557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.MAYBE result indicates that a more thorough check is required, 9567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the user may have to put the string in its normalized form and compare 9577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the results. 9587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 9597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param source string for determining if it is in a normalized format 9607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param start the start index of the source 9617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param limit the limit index of the source it is equal to the length 9627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 9637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.NFKC,Normalizer.NFKD) 9647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options Options for use with exclusion set and tailored Normalization 9657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The only option that is currently recognized is UNICODE_3_2 9667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return Return code to specify if the text is normalized or not 9677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Normalizer.YES, Normalizer.NO or 
9687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normalizer.MAYBE) 9697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 9707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static QuickCheckResult quickCheck(char[] source,int start, 9737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit, Mode mode,int options) { 9747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start); 9757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode.getNormalizer2(options).quickCheck(srcBuffer); 9767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Test if a string is in a given normalization form. 9807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is semantically equivalent to source.equals(normalize(source, mode)). 9817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 9827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unlike quickCheck(), this function returns a definitive result, 9837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * never a "maybe". 9847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For NFD, NFKD, and FCD, both functions work exactly the same. 9857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For NFC and NFKC where quickCheck may return "maybe", this function will 9867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * perform further tests to arrive at a true/false result. 
9877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param src The input array of characters to be checked to see if 9887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * it is normalized 9897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param start The strart index in the source 9907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param limit The limit index in the source 9917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode the normalization mode 9927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options Options for use with exclusion set and tailored Normalization 9937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The only option that is currently recognized is UNICODE_3_2 9947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return Boolean value indicating whether the source string is in the 9957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * "mode" normalization form 9967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 9977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isNormalized(char[] src,int start, 9997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit, Mode mode, 10007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int options) { 10017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start); 10027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode.getNormalizer2(options).isNormalized(srcBuffer); 10037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Test if a string is in a given normalization form. 
10077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is semantically equivalent to source.equals(normalize(source, mode)). 10087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unlike quickCheck(), this function returns a definitive result, 10107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * never a "maybe". 10117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For NFD, NFKD, and FCD, both functions work exactly the same. 10127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For NFC and NFKC where quickCheck may return "maybe", this function will 10137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * perform further tests to arrive at a true/false result. 10147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str the input string to be checked to see if it is 10157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalized 10167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode the normalization mode 10177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options Options for use with exclusion set and tailored Normalization 10187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The only option that is currently recognized is UNICODE_3_2 10197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #isNormalized 10207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 10217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isNormalized(String str, Mode mode, int options) { 10237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode.getNormalizer2(options).isNormalized(str); 10247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 
10277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Convenience Method 10287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param char32 the input code point to be checked to see if it is 10297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * normalized 10307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode the normalization mode 10317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options Options for use with exclusion set and tailored Normalization 10327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The only option that is currently recognized is UNICODE_3_2 10337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #isNormalized 10357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 10367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isNormalized(int char32, Mode mode,int options) { 10387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return isNormalized(UTF16.valueOf(char32), mode, options); 10397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compare two strings for canonical equivalence. 10437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Further options include case-insensitive comparison and 10447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * code point order (as opposed to code unit order). 10457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Canonical equivalence between two strings is defined as their normalized 10477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * forms (NFD or NFC) being identical. 
10487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This function compares strings incrementally instead of normalizing 10497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (and optionally case-folding) both strings entirely, 10507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * improving performance significantly. 10517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Bulk normalization is only necessary if the strings do not fulfill the 10537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * FCD conditions. Only in this case, and only if the strings are relatively 10547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * long, is memory allocated temporarily. 10557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For FCD strings and short non-FCD strings there is no memory allocation. 10567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Semantically, this is equivalent to 10587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 10597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * where code point order and foldCase are all optional. 10607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s1 First source character array. 10627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s1Start start index of source 10637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s1Limit limit of the source 10647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s2 Second source character array. 
10667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s2Start start index of the source 10677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s2Limit limit of the source 10687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options A bit set of options: 10707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - FOLD_CASE_DEFAULT or 0 is used for default options: 10717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Case-sensitive comparison in code unit order, and the input strings 10727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are quick-checked for FCD. 10737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - INPUT_IS_FCD 10757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set if the caller knows that both s1 and s2 fulfill the FCD 10767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * conditions.If not set, the function will quickCheck for FCD 10777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and normalize if necessary. 10787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - COMPARE_CODE_POINT_ORDER 10807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set to choose code point order instead of code unit order 10817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - COMPARE_IGNORE_CASE 10837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set to compare strings case-insensitively using case folding, 10847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * instead of case-sensitively. 10857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If set, then the following case folding options are used. 
10867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return <0 or 0 or >0 as usual for string comparisons 10897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #normalize 10917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #FCD 10927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 10937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int compare(char[] s1, int s1Start, int s1Limit, 10957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char[] s2, int s2Start, int s2Limit, 10967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int options) { 10977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( s1==null || s1Start<0 || s1Limit<0 || 10987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s2==null || s2Start<0 || s2Limit<0 || 10997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s1Limit<s1Start || s2Limit<s2Start 11007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 11017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException(); 11027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start), 11047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharBuffer.wrap(s2, s2Start, s2Limit-s2Start), 11057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert options); 11067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 11097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compare two strings for canonical 
equivalence. 11107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Further options include case-insensitive comparison and 11117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * code point order (as opposed to code unit order). 11127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Canonical equivalence between two strings is defined as their normalized 11147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * forms (NFD or NFC) being identical. 11157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This function compares strings incrementally instead of normalizing 11167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (and optionally case-folding) both strings entirely, 11177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * improving performance significantly. 11187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Bulk normalization is only necessary if the strings do not fulfill the 11207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * FCD conditions. Only in this case, and only if the strings are relatively 11217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * long, is memory allocated temporarily. 11227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For FCD strings and short non-FCD strings there is no memory allocation. 11237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Semantically, this is equivalent to 11257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 11267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * where code point order and foldCase are all optional. 
11277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s1 First source string. 11297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s2 Second source string. 11307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options A bit set of options: 11327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - FOLD_CASE_DEFAULT or 0 is used for default options: 11337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Case-sensitive comparison in code unit order, and the input strings 11347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are quick-checked for FCD. 11357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - INPUT_IS_FCD 11377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set if the caller knows that both s1 and s2 fulfill the FCD 11387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * conditions. If not set, the function will quickCheck for FCD 11397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and normalize if necessary. 11407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - COMPARE_CODE_POINT_ORDER 11427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set to choose code point order instead of code unit order 11437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - COMPARE_IGNORE_CASE 11457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set to compare strings case-insensitively using case folding, 11467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * instead of case-sensitively. 11477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If set, then the following case folding options are used. 
11487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return <0 or 0 or >0 as usual for string comparisons 11507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #normalize 11527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #FCD 11537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 11547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 11557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int compare(String s1, String s2, int options) { 11567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return internalCompare(s1, s2, options); 11577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 11607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compare two strings for canonical equivalence. 11617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Further options include case-insensitive comparison and 11627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * code point order (as opposed to code unit order). 11637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Convenience method. 11647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s1 First source string. 11667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s2 Second source string. 
11677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options A bit set of options: 11697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - FOLD_CASE_DEFAULT or 0 is used for default options: 11707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Case-sensitive comparison in code unit order, and the input strings 11717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are quick-checked for FCD. 11727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - INPUT_IS_FCD 11747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set if the caller knows that both s1 and s2 fulfill the FCD 11757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * conditions. If not set, the function will quickCheck for FCD 11767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and normalize if necessary. 11777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - COMPARE_CODE_POINT_ORDER 11797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set to choose code point order instead of code unit order 11807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - COMPARE_IGNORE_CASE 11827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set to compare strings case-insensitively using case folding, 11837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * instead of case-sensitively. 11847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If set, then the following case folding options are used. 
11857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return <0 or 0 or >0 as usual for string comparisons 11877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #normalize 11897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #FCD 11907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 11917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 11927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int compare(char[] s1, char[] s2, int options) { 11937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options); 11947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 11977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Convenience method that can have faster implementation 11987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * by not allocating buffers. 
11997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param char32a the first code point to be checked against the 12007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param char32b the second code point 12017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options A bit set of options 12027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 12037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int compare(int char32a, int char32b, int options) { 12057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD); 12067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 12097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Convenience method that can have faster implementation 12107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * by not allocating buffers. 
12117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param char32a the first code point to be checked against 12127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param str2 the second string 12137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options A bit set of options 12147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 12157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int compare(int char32a, String str2, int options) { 12177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return internalCompare(UTF16.valueOf(char32a), str2, options); 12187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* Concatenation of normalized strings --------------------------------- */ 12217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 12227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Concatenate normalized strings, making sure that the result is normalized 12237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as well. 
12247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If both the left and the right strings are in 12267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the normalization form according to "mode", 12277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then the result will be 12287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code> 12307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * dest=normalize(left+right, mode) 12317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </code> 12327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * With the input strings already being normalized, 12347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * this function will use next() and previous() 12357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * to find the adjacent end pieces of the input strings. 12367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Only the concatenation of these end pieces will be normalized and 12377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then concatenated with the remaining parts of the input strings. 12387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is allowed to have dest==left to avoid copying the entire left string. 12407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param left Left source array, may be same as dest. 12427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param leftStart start in the left array. 12437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param leftLimit limit in the left array (==length) 12447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param right Right source array. 
12457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param rightStart start in the right array. 12467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param rightLimit limit in the right array (==length) 12477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param dest The output buffer; can be null if destStart==destLimit==0 12487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * for pure preflighting. 12497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destStart start in the destination array 12507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param destLimit limit in the destination array (==length) 12517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode. 12527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 12537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return Length of output (number of chars) when successful or 12547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * IndexOutOfBoundsException 12557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException whose message has the string 12567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * representation of destination capacity required. 
12577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #normalize 12587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #next 12597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #previous 12607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IndexOutOfBoundsException if target capacity is less than the 12617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * required length 12627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 12637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int concatenate(char[] left, int leftStart, int leftLimit, 12657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char[] right, int rightStart, int rightLimit, 12667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char[] dest, int destStart, int destLimit, 12677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Normalizer.Mode mode, int options) { 12687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(dest == null) { 12697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException(); 12707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* check for overlapping right and destination */ 12737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (right == dest && rightStart < destLimit && destStart < rightLimit) { 12747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("overlapping right and dst ranges"); 12757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* allow left==dest */ 12787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder destBuilder=new 
StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16); 12797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destBuilder.append(left, leftStart, leftLimit-leftStart); 12807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart); 12817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode.getNormalizer2(options).append(destBuilder, rightBuffer); 12827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int destLength=destBuilder.length(); 12837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(destLength<=(destLimit-destStart)) { 12847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destBuilder.getChars(0, destLength, dest, destStart); 12857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return destLength; 12867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 12877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IndexOutOfBoundsException(Integer.toString(destLength)); 12887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 12927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Concatenate normalized strings, making sure that the result is normalized 12937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as well. 
12947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If both the left and the right strings are in 12967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the normalization form according to "mode", 12977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then the result will be 12987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code> 13007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * dest=normalize(left+right, mode) 13017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </code> 13027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For details see concatenate 13047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param left Left source string. 13067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param right Right source string. 13077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode. 13087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 
13097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return result 13107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #concatenate 13127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #normalize 13137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #next 13147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #previous 13157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #concatenate 13167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 13177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String concatenate(char[] left, char[] right,Mode mode, int options) { 13197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left); 13207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString(); 13217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 13247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Concatenate normalized strings, making sure that the result is normalized 13257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as well. 
13267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If both the left and the right strings are in 13287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the normalization form according to "mode", 13297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then the result will be 13307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code> 13327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * dest=normalize(left+right, mode) 13337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </code> 13347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * With the input strings already being normalized, 13367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * this function will use next() and previous() 13377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * to find the adjacent end pieces of the input strings. 13387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Only the concatenation of these end pieces will be normalized and 13397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then concatenated with the remaining parts of the input strings. 13407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param left Left source string. 13427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param right Right source string. 13437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param mode The normalization mode. 13447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param options The normalization options, ORed together (0 for no options). 
13457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return result 13467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #concatenate 13487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #normalize 13497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #next 13507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #previous 13517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #concatenate 13527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 13537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String concatenate(String left, String right, Mode mode, int options) { 13557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left); 13567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode.getNormalizer2(options).append(dest, right).toString(); 13577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 13607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the FC_NFKC closure value. 
13617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c The code point whose closure value is to be retrieved 13627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param dest The char array to receive the closure value 13637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the length of the closure value; 0 if there is none 13647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 3.8 13657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int getFC_NFKC_Closure(int c,char[] dest) { 13677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String closure=getFC_NFKC_Closure(c); 13687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length=closure.length(); 13697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length!=0 && dest!=null && length<=dest.length) { 13707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert closure.getChars(0, length, dest, 0); 13717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return length; 13737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 13757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the FC_NFKC closure value. 
13767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c The code point whose closure value is to be retrieved 13777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return String representation of the closure value; "" if there is none 13787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 3.8 13797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static String getFC_NFKC_Closure(int c) { 13817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compute the FC_NFKC_Closure on the fly: 13827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We have the API for complete coverage of Unicode properties, although 13837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // this value by itself is not useful via API. 13847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (What could be useful is a custom normalization table that combines 13857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // case folding and NFKC.) 13867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For the derivation, see Unicode's DerivedNormalizationProps.txt. 
13877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2; 13887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UCaseProps csp=UCaseProps.INSTANCE; 13897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // first: b = NFKC(Fold(a)) 13907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder folded=new StringBuilder(); 13917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int folded1Length=csp.toFullFolding(c, folded, 0); 13927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(folded1Length<0) { 13937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl; 13947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) { 13957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ""; // c does not change at all under CaseFolding+NFKC 13967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert folded.appendCodePoint(c); 13987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(folded1Length>UCaseProps.MAX_STRING_LENGTH) { 14007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert folded.appendCodePoint(folded1Length); 14017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String kc1=nfkc.normalize(folded); 14047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // second: c = NFKC(Fold(b)) 14057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0)); 14067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if (c != b) add the mapping from a to c 14077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(kc1.equals(kc2)) { 
14087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ""; 14097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return kc2; 14117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //------------------------------------------------------------------------- 14157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Iteration API 14167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //------------------------------------------------------------------------- 14177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 14197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return the current character in the normalized text. 14207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The codepoint as an int 14217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 14227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int current() { 14247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(bufferPos<buffer.length() || nextNormalize()) { 14257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return buffer.codePointAt(bufferPos); 14267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return DONE; 14287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 14327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return the next character in the 
normalized text and advance 14337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the iteration position by one. If the end 14347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * of the text has already been reached, {@link #DONE} is returned. 14357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The codepoint as an int 14367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 14377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int next() { 14397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(bufferPos<buffer.length() || nextNormalize()) { 14407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=buffer.codePointAt(bufferPos); 14417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bufferPos+=Character.charCount(c); 14427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 14437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return DONE; 14457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 14507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return the previous character in the normalized text and decrement 14517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the iteration position by one. If the beginning 14527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * of the text has already been reached, {@link #DONE} is returned. 
14537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The codepoint as an int 14547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 14557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int previous() { 14577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(bufferPos>0 || previousNormalize()) { 14587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=buffer.codePointBefore(bufferPos); 14597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bufferPos-=Character.charCount(c); 14607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c; 14617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return DONE; 14637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 14677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Reset the index to the beginning of the text. 14687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is equivalent to setIndexOnly(startIndex)). 
14697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 14707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void reset() { 14727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setToStart(); 14737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert currentIndex=nextIndex=0; 14747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert clearBuffer(); 14757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 14787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the iteration position in the input text that is being normalized, 14797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * without any immediate normalization. 14807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * After setIndexOnly(), getIndex() will return the same index that is 14817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * specified here. 14827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 14837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param index the desired index in the input text. 
14847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 14857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setIndexOnly(int index) { 14877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(index); // validates index 14887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert currentIndex=nextIndex=index; 14897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert clearBuffer(); 14907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 14937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the iteration position in the input text that is being normalized 14947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and return the first normalized character at that position. 14957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 14967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>Note:</b> This method sets the position in the <em>input</em> text, 14977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * while {@link #next} and {@link #previous} iterate through characters 14987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in the normalized <em>output</em>. This means that there is not 14997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * necessarily a one-to-one correspondence between characters returned 15007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and 15017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * returned from <tt>setIndex</tt> and {@link #getIndex}. 15027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 15037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param index the desired index in the input text. 
15047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the first normalized character that is the result of iterating 15067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * forward starting at the given index. 15077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IllegalArgumentException if the given index is less than 15097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #getBeginIndex} or greater than {@link #getEndIndex}. 15107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 3.2 15117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @obsolete ICU 3.2 15127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 15147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ///CLOVER:OFF 15157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int setIndex(int index) { 15167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setIndexOnly(index); 15177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return current(); 15187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ///CLOVER:ON 15207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Retrieve the index of the start of the input text. This is the begin 15227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the 15237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating 15247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.2. Use startIndex() instead. 
15257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The codepoint as an int 15267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #startIndex 15277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 15297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getBeginIndex() { 15307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 15317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Retrieve the index of the end of the input text. This is the end index 15357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 15367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * over which this <tt>Normalizer</tt> is iterating 15377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated ICU 2.2. Use endIndex() instead. 15387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The codepoint as an int 15397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #endIndex 15407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 15427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getEndIndex() { 15437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return endIndex(); 15447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return the first character in the normalized text. This resets 15477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the <tt>Normalizer's</tt> position to the beginning of the text. 
15487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The codepoint as an int 15497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 15507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int first() { 15527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 15537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return next(); 15547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return the last character in the normalized text. This resets 15587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the <tt>Normalizer's</tt> position to be just before the 15597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the input text corresponding to that normalized character. 
15607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The codepoint as an int 15617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 15627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int last() { 15647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setToLimit(); 15657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert currentIndex=nextIndex=text.getIndex(); 15667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert clearBuffer(); 15677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return previous(); 15687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Retrieve the current iteration position in the input text that is 15727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * being normalized. This method is useful in applications such as 15737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * searching, where you need to be able to determine the position in 15747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the input text that corresponds to a given normalized output character. 15757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 15767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>Note:</b> This method sets the position in the <em>input</em>, while 15777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #next} and {@link #previous} iterate through characters in the 15787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <em>output</em>. 
This means that there is not necessarily a one-to-one 15797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * correspondence between characters returned by <tt>next</tt> and 15807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>previous</tt> and the indices passed to and returned from 15817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>setIndex</tt> and {@link #getIndex}. 15827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The current iteration position 15837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 15847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getIndex() { 15867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(bufferPos<buffer.length()) { 15877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return currentIndex; 15887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return nextIndex; 15907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Retrieve the index of the start of the input text. This is the begin 15957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * index of the <tt>CharacterIterator</tt> or the start (i.e. 
0) of the 15967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating 15977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The current iteration position 15987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 15997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int startIndex() { 16017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 16027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Retrieve the index of the end of the input text. This is the end index 16067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 16077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * over which this <tt>Normalizer</tt> is iterating 16087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The current iteration position 16097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 16107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int endIndex() { 16127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return text.getLength(); 16137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //------------------------------------------------------------------------- 16167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Iterator attributes 16177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //------------------------------------------------------------------------- 
16187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the normalization mode for this object. 16207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 16217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>Note:</b>If the normalization mode is changed while iterating 16227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * over a string, calls to {@link #next} and {@link #previous} may 16237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * return previously buffers characters in the old normalization mode 16247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * until the iteration is able to re-sync at the next base character. 16257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is safest to call {@link #setText setText()}, {@link #first}, 16267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #last}, etc. after calling <tt>setMode</tt>. 16277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 16287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newMode the new mode for this <tt>Normalizer</tt>. 16297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The supported modes are: 16307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <ul> 16317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>{@link #NFC} - Unicode canonical decompositiion 16327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * followed by canonical composition. 16337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>{@link #NFKC} - Unicode compatibility decompositiion 16347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * follwed by canonical composition. 16357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>{@link #NFD} - Unicode canonical decomposition 16367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>{@link #NFKD} - Unicode compatibility decomposition. 
16377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>{@link #NONE} - Do nothing but return characters 16387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * from the underlying input text. 16397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </ul> 16407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 16417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #getMode 16427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 16437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setMode(Mode newMode) { 16457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode = newMode; 16467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm2 = mode.getNormalizer2(options); 16477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return the basic operation performed by this <tt>Normalizer</tt> 16507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 16517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setMode 16527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 16537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Mode getMode() { 16557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return mode; 16567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set options that affect this <tt>Normalizer</tt>'s operation. 
16597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Options do not change the basic composition or decomposition operation 16607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * that is being performed , but they control whether 16617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * certain optional portions of the operation are done. 16627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Currently the only available option is: 16637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 16647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <ul> 16657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2. 16667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </ul> 16677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 16687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param option the option whose value is to be set. 16697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param value the new setting for the option. Use <tt>true</tt> to 16707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * turn the option on and <tt>false</tt> to turn it off. 
16717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 16727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #getOption 16737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 16747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setOption(int option,boolean value) { 16767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (value) { 16777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert options |= option; 16787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 16797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert options &= (~option); 16807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm2 = mode.getNormalizer2(options); 16827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Determine whether an option is turned on or off. 
16867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 16877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #setOption 16887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.6 16897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getOption(int option) { 16917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((options & option)!=0) { 16927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 1 ; 16937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 16947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 16957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the underlying text storage 17007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param fillIn the char buffer to fill the UTF-16 units. 17017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The length of the buffer should be equal to the length of the 17027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * underlying text storage 17037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IndexOutOfBoundsException If the index passed for the array is invalid. 
17047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #getLength 17057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getText(char[] fillIn) { 17087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return text.getText(fillIn); 17097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the length of underlying text storage 17137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the length 17147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getLength() { 17177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return text.getLength(); 17187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the text under iteration as a string 17227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return a copy of the text under iteration. 
17237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String getText() { 17267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return text.getText(); 17277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the input text over which this <tt>Normalizer</tt> will iterate. 17317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The iteration position is set to the beginning of the input text. 17327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newText The new string to be normalized. 17337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(StringBuffer newText) { 17367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 17377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (newIter == null) { 17387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalStateException("Could not create a new UCharacterIterator"); 17397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text = newIter; 17417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 17427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the input text over which this <tt>Normalizer</tt> will iterate. 
17467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The iteration position is set to the beginning of the input text. 17477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newText The new string to be normalized. 17487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(char[] newText) { 17517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 17527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (newIter == null) { 17537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalStateException("Could not create a new UCharacterIterator"); 17547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text = newIter; 17567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 17577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the input text over which this <tt>Normalizer</tt> will iterate. 17617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The iteration position is set to the beginning of the input text. 17627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newText The new string to be normalized. 
17637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(String newText) { 17667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 17677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (newIter == null) { 17687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalStateException("Could not create a new UCharacterIterator"); 17697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text = newIter; 17717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 17727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the input text over which this <tt>Normalizer</tt> will iterate. 17767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The iteration position is set to the beginning of the input text. 17777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newText The new string to be normalized. 
17787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(CharacterIterator newText) { 17817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 17827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (newIter == null) { 17837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalStateException("Could not create a new UCharacterIterator"); 17847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text = newIter; 17867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 17877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the input text over which this <tt>Normalizer</tt> will iterate. 17917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The iteration position is set to the beginning of the string. 17927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newText The new string to be normalized. 
17937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.8 17947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(UCharacterIterator newText) { 17967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try{ 17977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UCharacterIterator newIter = (UCharacterIterator)newText.clone(); 17987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (newIter == null) { 17997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalStateException("Could not create a new UCharacterIterator"); 18007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text = newIter; 18027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 18037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }catch(CloneNotSupportedException e) { 18047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e); 18057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void clearBuffer() { 18097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.setLength(0); 18107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bufferPos=0; 18117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean nextNormalize() { 18147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert clearBuffer(); 18157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert currentIndex=nextIndex; 18167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
text.setIndex(nextIndex); 18177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip at least one character so we make progress. 18187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=text.nextCodePoint(); 18197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<0) { 18207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 18217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder segment=new StringBuilder().appendCodePoint(c); 18237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((c=text.nextCodePoint())>=0) { 18247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm2.hasBoundaryBefore(c)) { 18257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.moveCodePointIndex(-1); 18267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 18277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segment.appendCodePoint(c); 18297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextIndex=text.getIndex(); 18317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm2.normalize(segment, buffer); 18327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return buffer.length()!=0; 18337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean previousNormalize() { 18367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert clearBuffer(); 18377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextIndex=currentIndex; 18387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(currentIndex); 18397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder segment=new StringBuilder(); 
18407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c; 18417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((c=text.previousCodePoint())>=0) { 18427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<=0xffff) { 18437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segment.insert(0, (char)c); 18447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert segment.insert(0, Character.toChars(c)); 18467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm2.hasBoundaryBefore(c)) { 18487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 18497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert currentIndex=text.getIndex(); 18527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm2.normalize(segment, buffer); 18537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bufferPos=buffer.length(); 18547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return buffer.length()!=0; 18557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* compare canonically equivalent ------------------------------------------- */ 18587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Broaden the public compare(String, String, options) API like this. 
Ticket #7407 18607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int internalCompare(CharSequence s1, CharSequence s2, int options) { 18617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT; 18627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert options|= COMPARE_EQUIV; 18637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 18657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * UAX #21 Case Mappings, as fixed for Unicode version 4 18667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (see Jitterbug 2021), defines a canonical caseless match as 18677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A string X is a canonical caseless match 18697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * for a string Y if and only if 18707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 18717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For better performance, we check for FCD (or let the caller tell us that 18737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * both strings are in FCD) for the inner normalization. 18747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that 18757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * case-folding preserves the FCD-ness of a string. 18767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold() 18777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * when there is a difference. 
18787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Exception: When using the Turkic case-folding option, we do perform 18807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * full NFD first. This is because in the Turkic case precomposed characters 18817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * with 0049 capital I or 0069 small i fold differently whether they 18827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are first decomposed or not, so an FCD check - a check only for 18837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * canonical order - is not sufficient. 18847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 18857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 18867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Normalizer2 n2; 18877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 18887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert n2=NFD.getNormalizer2(normOptions); 18897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert n2=FCD.getNormalizer2(normOptions); 18917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // check if s1 and/or s2 fulfill the FCD conditions 18947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int spanQCYes1=n2.spanQuickCheckYes(s1); 18957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int spanQCYes2=n2.spanQuickCheckYes(s2); 18967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 18977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 18987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ICU 2.4 had a further optimization: 
18997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If both strings were not in FCD, then they were both NFD'ed, 19007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and the COMPARE_EQUIV option was turned off. 19017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is not entirely clear that this is valid with the current 19027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * definition of the canonical caseless match. 19037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Therefore, ICU 2.6 removes that optimization. 19047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 19057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(spanQCYes1<s1.length()) { 19077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1); 19087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length())); 19097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(spanQCYes2<s2.length()) { 19117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2); 19127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length())); 19137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return cmpEquivFold(s1, s2, options); 19177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 
     * Compare two strings for canonical equivalence.
     * Further options include case-insensitive comparison and
     * code point order (as opposed to code unit order).
     *
     * In this function, canonical equivalence is optional as well.
     * If canonical equivalence is tested, then both strings must fulfill
     * the FCD check.
     *
     * Semantically, this is equivalent to
     *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
     * where code point order, NFD and foldCase are all optional.
     *
     * String comparisons almost always yield results before processing both strings
     * completely.
     * They are generally more efficient working incrementally instead of
     * performing the sub-processing (strlen, normalization, case-folding)
     * on the entire strings first.
     *
     * It is also unnecessary to normalize identical characters.
19397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This function works in principle as follows: 19417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * loop { 19437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * get one code unit c1 from s1 (-1 if end of source) 19447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * get one code unit c2 from s2 (-1 if end of source) 19457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if(either string finished) { 19477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * return result; 19487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * } 19497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if(c1==c2) { 19507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * continue; 19517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * } 19527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * // c1!=c2 19547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * try to decompose/case-fold c1/c2, and continue if one does; 19557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * // still c1!=c2 and neither decomposes/case-folds, return result 19577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * return c1-c2; 19587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * } 19597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * When a character decomposes, then the pointer for that source changes to 19617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the decomposition, pushing the previous pointer onto a stack. 
19627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * When the end of the decomposition is reached, then the code unit reader 19637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * pops the previous source from the stack. 19647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Same for case-folding.) 19657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is complicated further by operating on variable-width UTF-16. 19677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The top part of the loop works on code units, while lookups for decomposition 19687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and case-folding need code points. 19697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Code points are assembled after the equality/end-of-source part. 19707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The source pointer is only advanced beyond all code units when the code point 19717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * actually decomposes/case-folds. 19727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If we were on a trail surrogate unit when assembling a code point, 19747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and the code point decomposes/case-folds, then the decomposition/folding 19757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * result must be compared with the part of the other string that corresponds to 19767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * this string's lead surrogate. 
19777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Since we only assemble a code point when hitting a trail unit when the 19787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * preceding lead units were identical, we back up the other string by one unit 19797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in such a case. 19807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The optional code point order comparison at the end works with 19827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the same fix-up as the other code point order comparison functions. 19837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * See ustring.c and the comment near the end of this function. 19847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Assumption: A decomposition or case-folding result string never contains 19867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a single surrogate. This is a safe assumption in the Unicode Standard. 19877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Therefore, we do not need to check for surrogate pairs across 19887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * decomposition/case-folding boundaries. 19897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Further assumptions (see verifications tstnorm.cpp): 19917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The API function checks for FCD first, while the core function 19927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * first case-folds and then decomposes. This requires that case-folding does not 19937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * un-FCD any strings. 
19947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The API function may also NFD the input and turn off decomposition. 19967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This requires that case-folding does not un-NFD strings either. 19977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 19987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * TODO If any of the above two assumptions is violated, 19997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then this entire code must be re-thought. 20007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If this happens, then a simple solution is to case-fold both strings up front 20017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and to turn off UNORM_INPUT_IS_FCD. 20027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * We already do this when not both strings are in FCD because makeFCD 20037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * would be a partial NFD before the case folding, which does not work. 20047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Note that all of this is only a problem when case-folding _and_ 20057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * canonical equivalence come together. 20067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Comments in unorm_compare() are more up to date than this TODO.) 
20077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 20087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* stack element for previous-level source/decomposition pointers */ 20107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class CmpEquivLevel { 20117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharSequence cs; 20127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int s; 20137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 20147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final CmpEquivLevel[] createCmpEquivLevelStack() { 20157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return new CmpEquivLevel[] { 20167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert new CmpEquivLevel(), new CmpEquivLevel() 20177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 20187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 20217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Internal option for unorm_cmpEquivFold() for decomposing. 20227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If not set, just do strcasecmp(). 
20237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 20247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int COMPARE_EQUIV=0x80000; 20257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* internal function; package visibility for use by UTF16.StringComparator */ 20277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) { 20287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Normalizer2Impl nfcImpl; 20297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UCaseProps csp; 20307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* current-level start/limit - s1/s2 as current */ 20327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int s1, s2, limit1, limit2; 20337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* decomposition and case folding variables */ 20357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length; 20367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* stacks of previous-level start/current/limit */ 20387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CmpEquivLevel[] stack1=null, stack2=null; 20397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* buffers for algorithmic decompositions */ 20417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String decomp1, decomp2; 20427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* case folding buffers, only use current-level start/limit */ 20447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder fold1, fold2; 
20457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* track which is the current level per string */ 20477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int level1, level2; 20487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* current code units, and code points for lookups */ 20507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c1, c2, cp1, cp2; 20517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* no argument error checking because this itself is not an API */ 20537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 20557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set 20567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * otherwise this function must behave exactly as uprv_strCompare() 20577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * not checking for that here makes testing this function easier 20587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 20597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* normalization/properties data loaded? 
*/ 20617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((options&COMPARE_EQUIV)!=0) { 20627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl=Norm2AllModes.getNFCInstance().impl; 20637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl=null; 20657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((options&COMPARE_IGNORE_CASE)!=0) { 20677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert csp=UCaseProps.INSTANCE; 20687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold1=new StringBuilder(); 20697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold2=new StringBuilder(); 20707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert csp=null; 20727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold1=fold2=null; 20737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* initialize */ 20767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s1=0; 20777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit1=cs1.length(); 20787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s2=0; 20797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit2=cs2.length(); 20807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert level1=level2=0; 20827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=c2=-1; 20837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* comparison loop */ 20857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 20867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 
20877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * here a code unit value of -1 means "get another code unit" 20887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * below it will mean "this source is finished" 20897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 20907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c1<0) { 20927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* get next code unit from string 1, post-increment */ 20937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 20947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s1==limit1) { 20957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(level1==0) { 20967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=-1; 20977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 20987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 21007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=cs1.charAt(s1++); 21017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 21027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* reached end of level buffer, pop one level */ 21057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 21067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --level1; 21077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cs1=stack1[level1].cs; 21087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(cs1==null); 21097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s1=stack1[level1].s; 21107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit1=cs1.length(); 21117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
21137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c2<0) { 21157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* get next code unit from string 2, post-increment */ 21167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 21177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s2==limit2) { 21187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(level2==0) { 21197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=-1; 21207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 21217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 21237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=cs2.charAt(s2++); 21247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 21257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* reached end of level buffer, pop one level */ 21287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 21297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --level2; 21307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cs2=stack2[level2].cs; 21317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(cs2==null); 21327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s2=stack2[level2].s; 21337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit2=cs2.length(); 21347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 21387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * compare c1 and c2 21397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * either variable c1, c2 is -1 only if 
the corresponding string is finished 21407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 21417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c1==c2) { 21427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c1<0) { 21437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; /* c1==c2==-1 indicating end of strings */ 21447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=c2=-1; /* make us fetch new code units */ 21467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 21477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c1<0) { 21487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; /* string 1 ends before string 2 */ 21497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c2<0) { 21507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 1; /* string 2 ends before string 1 */ 21517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* c1!=c2 && c1>=0 && c2>=0 */ 21537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* get complete code points for c1, c2 for lookups if either is a surrogate */ 21557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp1=c1; 21567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16.isSurrogate((char)c1)) { 21577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 21587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) { 21607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) { 21617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* advance ++s1; only below if cp1 decomposes/case-folds */ 
21627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp1=Character.toCodePoint((char)c1, c); 21637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* isTrail(c1) */ { 21657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) { 21667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp1=Character.toCodePoint(c, (char)c1); 21677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp2=c2; 21727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16.isSurrogate((char)c2)) { 21737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c; 21747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) { 21767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) { 21777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* advance ++s2; only below if cp2 decomposes/case-folds */ 21787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp2=Character.toCodePoint((char)c2, c); 21797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* isTrail(c2) */ { 21817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) { 21827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cp2=Character.toCodePoint(c, (char)c2); 21837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
21857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 21887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * go down one level for each string 21897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * continue with the main loop as soon as there is a real change 21907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 21917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 && 21937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (length=csp.toFullFolding(cp1, fold1, options))>=0 21947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 21957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* cp1 case-folds to the code point "length" or to p[length] */ 21967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16.isSurrogate((char)c1)) { 21977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) { 21987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* advance beyond source surrogate pair if it case-folds */ 21997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++s1; 22007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* isTrail(c1) */ { 22017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 22027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * we got a supplementary code point when hitting its trail surrogate, 22037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * therefore the lead surrogate must have been the same as in the other string; 22047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * compare this decomposition with the lead surrogate in the other string 22057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * remember that this simulates bulk text replacement: 
22067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the decomposition would replace the entire code point 22077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 22087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --s2; 22097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=cs2.charAt(s2-1); 22107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* push current level pointers */ 22147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(stack1==null) { 22157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack1=createCmpEquivLevelStack(); 22167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack1[0].cs=cs1; 22187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack1[0].s=s1; 22197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++level1; 22207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* copy the folding result to fold1[] */ 22227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* Java: the buffer was probably not empty, remove the old contents */ 22237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length<=UCaseProps.MAX_STRING_LENGTH) { 22247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold1.delete(0, fold1.length()-length); 22257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 22267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold1.setLength(0); 22277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold1.appendCodePoint(length); 22287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* set next level 
pointers to case folding */ 22317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cs1=fold1; 22327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s1=0; 22337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit1=fold1.length(); 22347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* get ready to read from decomposition, continue with loop */ 22367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=-1; 22377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 22387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 && 22417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (length=csp.toFullFolding(cp2, fold2, options))>=0 22427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 22437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* cp2 case-folds to the code point "length" or to p[length] */ 22447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16.isSurrogate((char)c2)) { 22457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) { 22467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* advance beyond source surrogate pair if it case-folds */ 22477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++s2; 22487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* isTrail(c2) */ { 22497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 22507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * we got a supplementary code point when hitting its trail surrogate, 22517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * therefore the lead surrogate must have been the same as in the other string; 22527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * compare this 
decomposition with the lead surrogate in the other string 22537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * remember that this simulates bulk text replacement: 22547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the decomposition would replace the entire code point 22557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 22567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --s1; 22577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=cs1.charAt(s1-1); 22587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* push current level pointers */ 22627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(stack2==null) { 22637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack2=createCmpEquivLevelStack(); 22647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack2[0].cs=cs2; 22667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack2[0].s=s2; 22677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++level2; 22687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* copy the folding result to fold2[] */ 22707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* Java: the buffer was probably not empty, remove the old contents */ 22717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length<=UCaseProps.MAX_STRING_LENGTH) { 22727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold2.delete(0, fold2.length()-length); 22737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 22747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold2.setLength(0); 22757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fold2.appendCodePoint(length); 
22767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* set next level pointers to case folding */ 22797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cs2=fold2; 22807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s2=0; 22817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit2=fold2.length(); 22827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* get ready to read from decomposition, continue with loop */ 22847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=-1; 22857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 22867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 22877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( level1<2 && (options&COMPARE_EQUIV)!=0 && 22897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (decomp1=nfcImpl.getDecomposition(cp1))!=null 22907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 22917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* cp1 decomposes into p[length] */ 22927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16.isSurrogate((char)c1)) { 22937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) { 22947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* advance beyond source surrogate pair if it decomposes */ 22957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++s1; 22967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* isTrail(c1) */ { 22977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 22987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * we got a supplementary code point when hitting its trail surrogate, 22997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 
therefore the lead surrogate must have been the same as in the other string; 23007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * compare this decomposition with the lead surrogate in the other string 23017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * remember that this simulates bulk text replacement: 23027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the decomposition would replace the entire code point 23037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 23047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --s2; 23057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=cs2.charAt(s2-1); 23067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* push current level pointers */ 23107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(stack1==null) { 23117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack1=createCmpEquivLevelStack(); 23127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack1[level1].cs=cs1; 23147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack1[level1].s=s1; 23157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++level1; 23167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* set empty intermediate level if skipped */ 23187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(level1<2) { 23197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack1[level1++].cs=null; 23207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* set next level pointers to decomposition */ 
23237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cs1=decomp1; 23247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s1=0; 23257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit1=decomp1.length(); 23267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* get ready to read from decomposition, continue with loop */ 23287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=-1; 23297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 23307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( level2<2 && (options&COMPARE_EQUIV)!=0 && 23337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (decomp2=nfcImpl.getDecomposition(cp2))!=null 23347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 23357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* cp2 decomposes into p[length] */ 23367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16.isSurrogate((char)c2)) { 23377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) { 23387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* advance beyond source surrogate pair if it decomposes */ 23397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++s2; 23407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* isTrail(c2) */ { 23417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 23427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * we got a supplementary code point when hitting its trail surrogate, 23437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * therefore the lead surrogate must have been the same as in the other string; 23447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * compare this decomposition with the lead surrogate in the other string 
23457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * remember that this simulates bulk text replacement: 23467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the decomposition would replace the entire code point 23477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 23487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --s1; 23497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1=cs1.charAt(s1-1); 23507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* push current level pointers */ 23547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(stack2==null) { 23557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack2=createCmpEquivLevelStack(); 23567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack2[level2].cs=cs2; 23587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack2[level2].s=s2; 23597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++level2; 23607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* set empty intermediate level if skipped */ 23627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(level2<2) { 23637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stack2[level2++].cs=null; 23647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* set next level pointers to decomposition */ 23677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cs2=decomp2; 23687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s2=0; 23697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit2=decomp2.length(); 
23707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* get ready to read from decomposition, continue with loop */ 23727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=-1; 23737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 23747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 23757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 23777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * no decomposition/case folding, max level for both sides: 23787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * return difference result 23797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 23807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * code point order comparison must not just return cp1-cp2 23817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * because when single surrogates are present then the surrogate pairs 23827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * that formed cp1 and cp2 may be from different string indexes 23837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 23847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * example: { d800 d800 dc01 } vs. 
{ d800 dc00 }, compare at second code units 23857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * c1=d800 cp1=10001 c2=dc00 cp2=10000 23867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 23877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 23887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * therefore, use same fix-up as in ustring.c/uprv_strCompare() 23897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 23907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * so we have slightly different pointer/start/limit comparisons here 23917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 23927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 23937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) { 23947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 23957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( 23967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) || 23977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2))) 23987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 23997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* part of a surrogate pair, leave >=d800 */ 24007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 24017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* BMP code point - may be surrogate code point - make <d800 */ 24027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c1-=0x2800; 24037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
24047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 24057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( 24067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) || 24077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2))) 24087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 24097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* part of a surrogate pair, leave >=d800 */ 24107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 24117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* BMP code point - may be surrogate code point - make <d800 */ 24127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2-=0x2800; 24137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 24167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c1-c2; 24177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 24207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 24217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * An Appendable that writes into a char array with a capacity that may be 24227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * less than array.length. 24237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.) 
24247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 24257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * An overflow is only reported at the end, for the old Normalizer API functions that write 24267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * to char arrays. 24277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 24287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class CharsAppendable implements Appendable { 24297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CharsAppendable(char[] dest, int destStart, int destLimit) { 24307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert chars=dest; 24317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start=offset=destStart; 24327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit=destLimit; 24337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int length() { 24357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int len=offset-start; 24367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(offset<=limit) { 24377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return len; 24387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 24397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IndexOutOfBoundsException(Integer.toString(len)); 24407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Appendable append(char c) { 24437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(offset<limit) { 24447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert chars[offset]=c; 24457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++offset; 24477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 
24487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Appendable append(CharSequence s) { 24507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return append(s, 0, s.length()); 24517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Appendable append(CharSequence s, int sStart, int sLimit) { 24537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int len=sLimit-sStart; 24547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(len<=(limit-offset)) { 24557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(sStart<sLimit) { // TODO: Is there a better way to copy the characters? 24567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert chars[offset++]=s.charAt(sStart++); 24577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 24597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset+=len; 24607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 24627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 24647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final char[] chars; 24657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final int start, limit; 24667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int offset; 24677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 24687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 2469