17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 3f8a0c400bbd62a2ea4ee9b77641f79cb443d2187Neil Fuller * Copyright (C) 2009-2015, International Business Machines 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Corporation and others. All Rights Reserved. 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl; 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.IOException; 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.nio.ByteBuffer; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.ArrayList; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Iterator; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UTF16; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ICUUncheckedIOException; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.VersionInfo; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class Normalizer2Impl { 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final class Hangul { 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* Korean Hangul and Jamo constants */ 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_L_END=0x1112; 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_V_END=0x1175; 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_T_END=0x11c2; 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int HANGUL_BASE=0xac00; 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int HANGUL_END=0xd7a3; 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_L_COUNT=19; 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_V_COUNT=21; 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_T_COUNT=28; 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isHangul(int c) { 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return HANGUL_BASE<=c && c<HANGUL_LIMIT; 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isHangulWithoutJamoT(char c) { 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c-=HANGUL_BASE; 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isJamoL(int c) { 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return JAMO_L_BASE<=c && c<JAMO_L_LIMIT; 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isJamoV(int c) { 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return JAMO_V_BASE<=c && c<JAMO_V_LIMIT; 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decomposes c, which must be a Hangul syllable, into buffer 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and returns the length of the decomposition (2 or 3). 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int decompose(int c, Appendable buffer) { 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c-=HANGUL_BASE; 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c2=c%JAMO_T_COUNT; 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c/=JAMO_T_COUNT; 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c2==0) { 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 2; 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)(JAMO_T_BASE+c2)); 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 3; 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch(IOException e) { 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Will not occur because we do not write to I/O. 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUUncheckedIOException(e); 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decomposes c, which must be a Hangul syllable, into buffer. 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is the raw, not recursive, decomposition. Its length is always 2. 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static void getRawDecomposition(int c, Appendable buffer) { 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int orig=c; 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c-=HANGUL_BASE; 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c2=c%JAMO_T_COUNT; 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c2==0) { 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c/=JAMO_T_COUNT; 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)(orig-c2)); // LV syllable 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)(JAMO_T_BASE+c2)); 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch(IOException e) { 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Will not occur because we do not write to I/O. 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUUncheckedIOException(e); 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Writable buffer that takes care of canonical ordering. 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Its Appendable methods behave like the C++ implementation's 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * appendZeroCC() methods. 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If dest is a StringBuilder, then the buffer writes directly to it. 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * until no further changes are necessary and whole segments are appended. 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * append() methods that take combining-class values always write to the StringBuilder. 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Other append() methods flush and append to the Appendable. 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final class ReorderingBuffer implements Appendable { 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert impl=ni; 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert app=dest; 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(app instanceof StringBuilder) { 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appIsStringBuilder=true; 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str=(StringBuilder)dest; 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, the constructor subsumes public void init(int destCapacity) { 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.ensureCapacity(destCapacity); 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=0; 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(str.length()==0) { 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setIterator(); 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=previousCC(); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set reorderStart after the last code point with cc<=1 if there is one. 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(lastCC>1) { 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(previousCC()>1) {} 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=codePointLimit; 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert appIsStringBuilder=false; 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str=new StringBuilder(); 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=0; 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isEmpty() { return str.length()==0; } 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int length() { return str.length(); } 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getLastCC() { return lastCC; } 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public StringBuilder getStringBuilder() { return str; } 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean equals(CharSequence s, int start, int limit) { 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setLastChar(char c) { 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.setCharAt(str.length()-1, c); 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void append(int c, int cc) { 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(lastCC<=cc || cc==0) { 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.appendCodePoint(c); 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=cc; 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cc<=1) { 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert insert(c, cc); 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // s must be in NFD, otherwise change the implementation. 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void append(CharSequence s, int start, int limit, 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int leadCC, int trailCC) { 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(start==limit) { 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(lastCC<=leadCC || leadCC==0) { 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(trailCC<=1) { 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length()+(limit-start); 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(leadCC<=1) { 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length()+1; // Ok if not a code point boundary. 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.append(s, start, limit); 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=trailCC; 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointAt(s, start); 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start+=Character.charCount(c); 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert insert(c, leadCC); // insert first code point 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(start<limit) { 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointAt(s, start); 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start+=Character.charCount(c); 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(start<limit) { 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // s must be in NFD, otherwise we need to use getCC(). 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert leadCC=trailCC; 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert append(c, leadCC); 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The following append() methods work like C++ appendZeroCC(). 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // They assume that the cc or trailCC of their input is 0. 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Most of them implement Appendable interface methods. 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // @Override when we switch to Java 6 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public ReorderingBuffer append(char c) { 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.append(c); 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void appendZeroCC(int c) { 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.appendCodePoint(c); 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // @Override when we switch to Java 6 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public ReorderingBuffer append(CharSequence s) { 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s.length()!=0) { 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.append(s); 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // @Override when we switch to Java 6 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public ReorderingBuffer append(CharSequence s, int start, int limit) { 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(start!=limit) { 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.append(s, start, limit); 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Flushes from the intermediate StringBuilder to the Appendable, 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if they are different objects. 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Used after recomposition. 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Must be called at the end when writing to a non-StringBuilder Appendable. 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void flush() { 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(appIsStringBuilder) { 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert app.append(str); 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.setLength(0); 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=0; 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch(IOException e) { 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Flushes from the intermediate StringBuilder to the Appendable, 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * if they are different objects. 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Then appends the new text to the Appendable or StringBuilder. 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Normally used after quick check loops find a non-empty sequence. 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(appIsStringBuilder) { 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.append(s, start, limit); 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert app.append(str).append(s, start, limit); 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.setLength(0); 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=0; 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch(IOException e) { 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void remove() { 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.setLength(0); 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=0; 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void removeSuffix(int suffixLength) { 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int oldLength=str.length(); 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.delete(oldLength-suffixLength, oldLength); 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCC=0; 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=str.length(); 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * TODO: Revisit whether it makes sense to track reorderStart. 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is set to after the last known character with cc<=1, 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * which stops previousCC() before it reads that character and looks up its cc. 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * previousCC() is normally only called from insert(). 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * In other words, reorderStart speeds up the insertion of a combining mark 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into a multi-combining mark sequence where it does not belong at the end. 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This might not be worth the trouble. 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * On the other hand, it's not a huge amount of trouble. 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * We probably need it for UNORM_SIMPLE_APPEND. 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Inserts c somewhere before the last character. 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Requires 0<cc<lastCC which implies reorderStart<limit. 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void insert(int c, int cc) { 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(setIterator(), skipPrevious(); previousCC()>cc;) {} 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // insert c at codePointLimit, after the character with prevCC<=cc 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<=0xffff) { 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.insert(codePointLimit, (char)c); 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cc<=1) { 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=codePointLimit+1; 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert str.insert(codePointLimit, Character.toChars(c)); 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cc<=1) { 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reorderStart=codePointLimit+2; 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final Normalizer2Impl impl; 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final Appendable app; 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final StringBuilder str; 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private final boolean appIsStringBuilder; 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int reorderStart; 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int lastCC; 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private backward iterator 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void setIterator() { codePointStart=str.length(); } 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void skipPrevious() { // Requires 0<codePointStart. 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert codePointLimit=codePointStart; 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert codePointStart=str.offsetByCodePoints(codePointStart, -1); 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int previousCC() { // Returns 0 if there is no previous character. 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert codePointLimit=codePointStart; 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(reorderStart>=codePointStart) { 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=str.codePointBefore(codePointStart); 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert codePointStart-=Character.charCount(c); 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<MIN_CCC_LCCC_CP) { 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return getCCFromYesOrMaybe(impl.getNorm16(c)); 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int codePointStart, codePointLimit; 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Propose as public API on the UTF16 class. 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Propose widening UTF16 methods that take char to take int. 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Propose widening UTF16 methods that take String to take CharSequence. 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final class UTF16Plus { 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * is it a lead surrogate? 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c code unit or code point 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return true or false 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compares two CharSequence objects for binary equality. 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s1 first sequence 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s2 second sequence 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return true if s1 contains the same text as s2 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean equal(CharSequence s1, CharSequence s2) { 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s1==s2) { 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length=s1.length(); 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length!=s2.length()) { 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=0; i<length; ++i) { 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s1.charAt(i)!=s2.charAt(i)) { 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compares two CharSequence subsequences for binary equality. 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s1 first sequence 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param start1 start offset in first sequence 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param limit1 limit offset in first sequence 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param s2 second sequence 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param start2 start offset in second sequence 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param limit2 limit offset in second sequence 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return true if s1.subSequence(start1, limit1) contains the same text 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as s2.subSequence(start2, limit2) 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static boolean equal(CharSequence s1, int start1, int limit1, 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharSequence s2, int start2, int limit2) { 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((limit1-start1)!=(limit2-start2)) { 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s1==s2 && start1==start2) { 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(start1<limit1) { 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s1.charAt(start1++)!=s2.charAt(start2++)) { 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Normalizer2Impl() {} 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class IsAcceptable implements ICUBinary.Authenticate { 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // @Override when we switch to Java 6 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isDataVersionAcceptable(byte version[]) { 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return version[0]==2; 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Normalizer2Impl load(ByteBuffer bytes) { 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(indexesLength<=IX_MIN_MAYBE_YES) { 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes"); 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] inIndexes=new int[indexesLength]; 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert inIndexes[0]=indexesLength*4; 4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=1; i<indexesLength; ++i) { 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert inIndexes[i]=bytes.getInt(); 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minYesNo=inIndexes[IX_MIN_YES_NO]; 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minNoNo=inIndexes[IX_MIN_NO_NO]; 4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Read the normTrie. 4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert normTrie=Trie2_16.createFromSerialized(bytes); 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int trieLength=normTrie.getSerializedLength(); 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(trieLength>(nextOffset-offset)) { 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Read the composition and mapping data. 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset=nextOffset; 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numChars=(nextOffset-offset)/2; 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numChars!=0) { 458f8a0c400bbd62a2ea4ee9b77641f79cb443d2187Neil Fuller maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // smallFCD: new in formatVersion 2 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset=nextOffset; 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert smallFCD=new byte[0x100]; 465f8a0c400bbd62a2ea4ee9b77641f79cb443d2187Neil Fuller bytes.get(smallFCD); 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Build tccc180[]. 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tccc180=new int[0x180]; 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int bits=0; 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int c=0; c<0x180; bits>>=1) { 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((c&0xff)==0) { 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bits=smallFCD[c>>8]; // one byte per 0x100 code points 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((bits&1)!=0) { 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i=0; i<0x20; ++i, ++c) { 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tccc180[c]=getFCD16FromNormData(c)&0xff; 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c+=0x20; 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch(IOException e) { 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new ICUUncheckedIOException(e); 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Normalizer2Impl load(String name) { 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return load(ICUBinary.getRequiredData(name)); 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) { 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isAlgorithmicNoNo(norm16)) { 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Range of code points with same-norm16-value algorithmic decompositions. 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // They might have different non-zero FCD16 values. 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16=getFCD16(start); 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fcd16>0xff) { set.add(start); } 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while(++start<=end); 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16=getFCD16(start); 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fcd16>0xff) { set.add(start, end); } 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) { 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* add the start code point to the USet */ 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(start); 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(start!=end && isAlgorithmicNoNo(value)) { 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Range of code points with same-norm16-value algorithmic decompositions. 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // They might have different non-zero FCD16 values. 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevFCD16=getFCD16(start); 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(++start<=end) { 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16=getFCD16(start); 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fcd16!=prevFCD16) { 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(start); 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16=fcd16; 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void addLcccChars(UnicodeSet set) { 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* add the start code point of each same-value range of each trie */ 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2.Range range; 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set); 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void addPropertyStarts(UnicodeSet set) { 5347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* add the start code point of each same-value range of each trie */ 5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2.Range range; 5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 5387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set); 5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* add Hangul LV syllables and LV+1 because of skippables */ 5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) { 5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(c); 5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(c+1); 5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void addCanonIterPropertyStarts(UnicodeSet set) { 5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* add the start code point of each same-value range of the canonical iterator data trie */ 5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ensureCanonIterData(); 5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // currently only used for the SEGMENT_STARTER property 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper); 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2.Range range; 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* add the start code point to the USet */ 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(range.startCodePoint); 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() { 5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int map(int in) { 5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return in&CANON_NOT_SEGMENT_STARTER; 5637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 5657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // low-level properties ------------------------------------------------ *** 5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Trie2_16 getNormTrie() { return normTrie; } 5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: Normalizer2Impl.java r30983 (2011-nov-27) 5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // still had getFCDTrie() which built and cached an FCD trie. 5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // That provided faster access to FCD data than getFCD16FromNormData() 5737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but required synchronization and consumed some 10kB of heap memory 5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // in any process that uses FCD (e.g., via collation). 5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // tccc180[] and smallFCD[] are intended to help with any loss of performance, 5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // at least for Latin & CJK. 5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Builds the canonical-iterator data for this instance. 5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is required before any of {@link #isCanonSegmentStarter(int)} or 5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * {@link #getCanonStartSet(int, UnicodeSet)} are called, 5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or else they crash. 5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return this 5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public synchronized Normalizer2Impl ensureCanonIterData() { 5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(canonIterData==null) { 5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2Writable newData=new Trie2Writable(0, 0); 5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert canonStartSets=new ArrayList<UnicodeSet>(); 5897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2.Range range; 5917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 5927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert final int norm16=range.value; 5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Inert, or 2-way mapping (including Hangul syllable). 5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We do not write a canonStartSet for any yesNo character. 5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Composites from 2-way mappings are added at runtime from the 5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // starter's compositions list, and the other characters in 5987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // "maybe" characters. 6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) { 6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert final int oldValue=newData.get(c); 6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newValue=oldValue; 6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16>=minMaybeYes) { 6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // not a segment starter if it occurs in a decomposition or has cc!=0 6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newValue|=CANON_NOT_SEGMENT_STARTER; 6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16<MIN_NORMAL_MAYBE_YES) { 6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newValue|=CANON_HAS_COMPOSITIONS; 6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16<minYesNo) { 6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newValue|=CANON_HAS_COMPOSITIONS; 6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c has a one-way decomposition 6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c2=c; 6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16_2=norm16; 6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=this.mapAlgorithmic(c2, norm16_2); 6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16_2=getNorm16(c2); 6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data 6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16_2); 6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length=firstUnit&MAPPING_LENGTH_MASK; 6257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) { 6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip empty mappings (no characters in the decomposition). 6317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length!=0) { 6327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++norm16_2; // skip over the firstUnit 6337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // add c to first code point's start set 6347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit=norm16_2+length; 6357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=extraData.codePointAt(norm16_2); 6367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addToStartSet(newData, c, c2); 6377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 6387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // one-way mapping. A 2-way mapping is possible here after 6397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // intermediate algorithmic mapping. 6407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16_2>=minNoNo) { 6417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((norm16_2+=Character.charCount(c2))<limit) { 6427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c2=extraData.codePointAt(norm16_2); 6437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c2Value=newData.get(c2); 6447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 6457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER); 6467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposed to c2 algorithmically; c has cc==0 6527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addToStartSet(newData, c, c2); 6537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(newValue!=oldValue) { 6567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newData.set(c, newValue); 6577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert canonIterData=newData.toTrie2_32(); 6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return this; 6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getNorm16(int c) { return normTrie.get(c); } 6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getCompQuickCheck(int norm16) { 6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 1; // yes 6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(minMaybeYes<=norm16) { 6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 2; // maybe 6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; // no 6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getCC(int norm16) { 6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16>=MIN_NORMAL_MAYBE_YES) { 6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return norm16&0xff; 6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16<minNoNo || limitNoNo<=norm16) { 6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return getCCFromNoNo(norm16); 6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static int getCCFromYesOrMaybe(int norm16) { 6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; 6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the FCD data for code point c. 6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c A Unicode code point. 6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 6977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getFCD16(int c) { 6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<0) { 7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c<0x180) { 7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return tccc180[c]; 7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c<=0xffff) { 7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return getFCD16FromNormData(c); 7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Returns the FCD data for U+0000<=c<U+0180. */ 7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getFCD16FromBelow180(int c) { return tccc180[c]; } 7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 0<=lead<=0xffff 7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert byte bits=smallFCD[lead>>8]; 7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(bits==0) { return false; } 7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((bits>>((lead>>5)&7))&1)!=0; 7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Gets the FCD value from the regular normalization data. */ 7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getFCD16FromNormData(int c) { 7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Only loops for 1:1 algorithmic mappings. 7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=getNorm16(c); 7237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16<=minYesNo) { 7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // no decomposition or Hangul syllable, all zeros 7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16>=MIN_NORMAL_MAYBE_YES) { 7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // combining mark 7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16&=0xff; 7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return norm16|(norm16<<8); 7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16>=minMaybeYes) { 7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isDecompNoAlgorithmic(norm16)) { 7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=mapAlgorithmic(c, norm16); 7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data 7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16); 7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_LENGTH_MASK)==0) { 7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // A character that is deleted (maps to an empty string) must 7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // get the worst-case lccc and tccc values because arbitrary 7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // characters on both sides will become adjacent. 7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0x1ff; 7427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16=firstUnit>>8; // tccc 7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc 7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return fcd16; 7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the decomposition for one code point. 7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c code point 7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return c's decomposition, if it has one; returns null if it does not have a decomposition 7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String getDecomposition(int c) { 7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int decomp=-1; 7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16; 7617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 7627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c does not decompose 7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isHangul(norm16)) { 7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul syllable: decompose algorithmically 7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder buffer=new StringBuilder(); 7677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Hangul.decompose(c, buffer); 7687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return buffer.toString(); 7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isDecompNoAlgorithmic(norm16)) { 7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decomp=c=mapAlgorithmic(c, norm16); 7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data 7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; 7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return extraData.substring(norm16, norm16+length); 7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(decomp<0) { 7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return null; 7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return UTF16.valueOf(decomp); 7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the raw decomposition for one code point. 7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c code point 7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition 7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String getRawDecomposition(int c) { 7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We do not loop in this method because an algorithmic mapping itself 7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // becomes a final result rather than having to be decomposed recursively. 7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16; 7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c does not decompose 7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return null; 7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isHangul(norm16)) { 7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul syllable: decompose algorithmically 7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder buffer=new StringBuilder(); 8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Hangul.getRawDecomposition(c, buffer); 8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return buffer.toString(); 8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isDecompNoAlgorithmic(norm16)) { 8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return UTF16.valueOf(mapAlgorithmic(c, norm16)); 8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data 8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16); 8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { 8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int rawMapping=norm16-((firstUnit>>7)&1)-1; 8127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char rm0=extraData.charAt(rawMapping); 8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(rm0<=MAPPING_LENGTH_MASK) { 8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return extraData.substring(rawMapping-rm0, rawMapping); 8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Copy the normal mapping and replace its first two code units with rm0. 8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); 8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16+=1+2; // skip over the firstUnit and the first two mapping code units 8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return buffer.append(extraData, norm16, norm16+mLength-2).toString(); 8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16+=1; // skip over the firstUnit 8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return extraData.substring(norm16, norm16+mLength); 8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns true if code point c starts a canonical-iterator string segment. 8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>{@link #ensureCanonIterData()} must have been called before this method, 8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or else this method will crash.</b> 8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c A Unicode code point. 8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return true if c starts a canonical-iterator string segment. 8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isCanonSegmentStarter(int c) { 8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return canonIterData.get(c)>=0; 8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns true if there are characters whose decomposition starts with c. 8407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If so, then the set is cleared and then filled with those characters. 8417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <b>{@link #ensureCanonIterData()} must have been called before this method, 8427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or else this method will crash.</b> 8437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c A Unicode code point. 8447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param set A UnicodeSet to receive the characters whose decompositions 8457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * start with c, if there are any. 8467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return true if there are characters whose decomposition starts with c. 8477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean getCanonStartSet(int c, UnicodeSet set) { 8497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; 8507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(canonValue==0) { 8517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 8527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.clear(); 8547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int value=canonValue&CANON_VALUE_MASK; 8557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((canonValue&CANON_HAS_SET)!=0) { 8567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.addAll(canonStartSets.get(value)); 8577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(value!=0) { 8587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(value); 8597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 8617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=getNorm16(c); 8627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16==JAMO_L) { 8637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; 8647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); 8657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addComposites(getCompositionsList(norm16), set); 8677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 8707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MIN_CCC_LCCC_CP=0x300; 8737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MIN_YES_YES_WITH_CC=0xff01; 8757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_VT=0xff00; 8767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MIN_NORMAL_MAYBE_YES=0xfe00; 8777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int JAMO_L=1; 8787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MAX_DELTA=0x40; 8797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Byte offsets from the start of the data, after the generic header. 8817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_NORM_TRIE_OFFSET=0; 8827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_EXTRA_DATA_OFFSET=1; 8837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_SMALL_FCD_OFFSET=2; 8847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_RESERVED3_OFFSET=3; 8857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_TOTAL_SIZE=7; 8867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Code point thresholds for quick check codes. 8887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_MIN_DECOMP_NO_CP=8; 8897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 8907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Norm16 value thresholds for quick check combinations and types of extra data. 8927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 8937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_MIN_YES_NO=10; 8947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_MIN_NO_NO=11; 8957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_LIMIT_NO_NO=12; 8967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_MIN_MAYBE_YES=13; 8977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Mappings only in [minYesNoMappingsOnly..minNoNo[. 8997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 9007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int IX_COUNT=16; 9027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 9047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MAPPING_HAS_RAW_MAPPING=0x40; 9057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20; 9067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int MAPPING_LENGTH_MASK=0x1f; 9077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMP_1_LAST_TUPLE=0x8000; 9097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMP_1_TRIPLE=1; 9107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMP_1_TRAIL_LIMIT=0x3400; 9117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMP_1_TRAIL_MASK=0x7ffe; 9127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 9137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMP_2_TRAIL_SHIFT=6; 9147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int COMP_2_TRAIL_MASK=0xffc0; 9157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // higher-level functionality ------------------------------------------ *** 9177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // NFD without an NFD Normalizer2 instance. 9197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Appendable decompose(CharSequence s, StringBuilder dest) { 9207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompose(s, 0, s.length(), dest, s.length()); 9217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return dest; 9227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decomposes s[src, limit[ and writes the result to dest. 9257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * limit can be NULL if src is NUL-terminated. 9267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * destLengthEstimate is the initial dest buffer capacity and can be -1. 9277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 9297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int destLengthEstimate) { 9307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(destLengthEstimate<0) { 9317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert destLengthEstimate=limit-src; 9327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dest.setLength(0); 9347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 9357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompose(s, src, limit, buffer); 9367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Dual functionality: 9397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // buffer!=NULL: normalize 9407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 9417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int decompose(CharSequence s, int src, int limit, 9427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ReorderingBuffer buffer) { 9437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int minNoCP=minDecompNoCP; 9447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevSrc; 9467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=0; 9477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=0; 9487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // only for quick check 9507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevBoundary=src; 9517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevCC=0; 9527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 9547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // count code units below the minimum or with irrelevant data for the quick check 9557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(prevSrc=src; src!=limit;) { 9567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( (c=s.charAt(src))<minNoCP || 9577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 9587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 9597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++src; 9607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(!UTF16.isSurrogate((char)c)) { 9617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 9627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 9637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c2; 9647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16Plus.isSurrogateLead(c)) { 9657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 9667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint((char)c, c2); 9677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* trail surrogate */ { 9697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 9707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --src; 9717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint(c2, (char)c); 9727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 9757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 9767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 9777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 9787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // copy these code units all at once 9827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src!=prevSrc) { 9837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(buffer!=null) { 9847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.flushAndAppendZeroCC(s, prevSrc, src); 9857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 9867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=0; 9877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 9887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src==limit) { 9917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 9927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check one above-minimum, relevant code point. 9957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 9967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(buffer!=null) { 9977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompose(c, norm16, buffer); 9987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 9997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isDecompYes(norm16)) { 10007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cc=getCCFromYesOrMaybe(norm16); 10017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevCC<=cc || cc==0) { 10027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=cc; 10037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cc<=1) { 10047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 10057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 10077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return prevBoundary; // "no" or cc out of order 10107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return src; 10137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 10157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit=s.length(); 10167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(limit==0) { 10177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 10187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(doDecompose) { 10207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompose(s, 0, limit, buffer); 10217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 10227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Just merge the strings at the boundary. 10247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointAt(s, 0); 10257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int src=0; 10267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstCC, prevCC, cc; 10277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert firstCC=prevCC=cc=getCC(getNorm16(c)); 10287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(cc!=0) { 10297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=cc; 10307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 10317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src>=limit) { 10327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 10337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointAt(s, src); 10357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cc=getCC(getNorm16(c)); 10367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 10377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(s, 0, src, firstCC, prevCC); 10387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(s, src, limit); 10397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 10417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // doCompose: normalize 10427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // !doCompose: isNormalized (buffer must be empty and initialized) 10437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean compose(CharSequence s, int src, int limit, 10447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean onlyContiguous, 10457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean doCompose, 10467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ReorderingBuffer buffer) { 10477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int minNoMaybeCP=minCompNoMaybeCP; 10487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 10507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * prevBoundary points to the last character before the current one 10517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * that has a composition boundary before it with ccc==0 and quick check "yes". 10527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Keeping track of prevBoundary saves us looking for a composition boundary 10537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * when we find a "no" or "maybe". 10547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * When we back out from prevSrc back to prevBoundary, 10567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then we also remove those same characters (which had been simply copied 10577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or canonically-order-inserted) from the ReorderingBuffer. 10587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Therefore, at all times, the [prevBoundary..prevSrc[ source units 10597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * must correspond 1:1 to destination units at the end of the destination buffer. 10607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevBoundary=src; 10627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevSrc; 10637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=0; 10647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=0; 10657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // only for isNormalized 10677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevCC=0; 10687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 10707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // count code units below the minimum or with irrelevant data for the quick check 10717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(prevSrc=src; src!=limit;) { 10727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( (c=s.charAt(src))<minNoMaybeCP || 10737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 10747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 10757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++src; 10767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(!UTF16.isSurrogate((char)c)) { 10777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 10787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 10797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c2; 10807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16Plus.isSurrogateLead(c)) { 10817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 10827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint((char)c, c2); 10837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* trail surrogate */ { 10857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 10867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --src; 10877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint(c2, (char)c); 10887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 10917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 10927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 10937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 10947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // copy these code units all at once 10987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src!=prevSrc) { 10997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src==limit) { 11007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(doCompose) { 11017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.flushAndAppendZeroCC(s, prevSrc, src); 11027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set prevBoundary to the last character in the quick check loop. 11067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src-1; 11077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 11087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.isHighSurrogate(s.charAt(prevBoundary-1)) 11097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 11107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --prevBoundary; 11117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(doCompose) { 11137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The last "quick check yes" character is excluded from the 11147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // flush-and-append call in case it needs to be modified. 11157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 11167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(s, prevBoundary, src); 11177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=0; 11197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The start of the current character (c). 11217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevSrc=src; 11227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(src==limit) { 11237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 11277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 11287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 11297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 11307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or has ccc!=0. 11317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Check for Jamo V/T, then for regular characters. 11327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * c is not a Hangul syllable or Jamo L because those have "yes" properties. 11337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 11347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 11357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char prev=s.charAt(prevSrc-1); 11367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean needToDecompose=false; 11377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<Hangul.JAMO_T_BASE) { 11387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 11397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prev-=Hangul.JAMO_L_BASE; 11407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prev<Hangul.JAMO_L_COUNT) { 11417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!doCompose) { 11427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 11437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char syllable=(char) 11457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (Hangul.HANGUL_BASE+ 11467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 11477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Hangul.JAMO_T_COUNT); 11487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char t; 11497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 11507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++src; 11517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert syllable+=t; // The next character was a Jamo T. 11527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 11537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.setLastChar(syllable); 11547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 11557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we see L+V+x where x!=T then we drop to the slow path, 11577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // decompose and recompose. 11587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This is to deal with NFKC finding normal L and V but a 11597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // compatibility variant of a T. We need to either fully compose that 11607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // combination here (which would complicate the code and may not work 11617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // with strange custom data) or use the slow path -- or else our replacing 11627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // two input characters (L+V) with one output character (LV syllable) 11637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // would violate the invariant that [prevBoundary..prevSrc[ has the same 11647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // length as what we appended to the buffer since prevBoundary. 11657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert needToDecompose=true; 11667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(Hangul.isHangulWithoutJamoT(prev)) { 11687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c is a Jamo Trailing consonant, 11697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // compose with previous Hangul LV that does not contain a Jamo T. 11707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!doCompose) { 11717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 11727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); 11747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 11757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 11767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!needToDecompose) { 11787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The Jamo V/T did not compose into a Hangul syllable. 11797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(doCompose) { 11807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append((char)c); 11817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=0; 11837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 11857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 11887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Source buffer pointers: 11897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * all done quick check current char not yet 11917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * "yes" but (c) processed 11927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * may combine 11937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * forward 11947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * [-------------[-------------[-------------[-------------[ 11957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * | | | | | 11967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * orig. src prevBoundary prevSrc src limit 11977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 11997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Destination buffer pointers inside the ReorderingBuffer: 12007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * all done might take not filled yet 12027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * characters for 12037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * reordering 12047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * [-------------[-------------[-------------[ 12057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * | | | | 12067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * start reorderStart limit | 12077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * +remainingCap.+ 12087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16>=MIN_YES_YES_WITH_CC) { 12107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cc=norm16&0xff; // cc!=0 12117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( onlyContiguous && // FCC 12127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (doCompose ? buffer.getLastCC() : prevCC)==0 && 12137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary<prevSrc && 12147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 12157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 12167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // passed the quick check "yes && ccc==0" test. 12177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check whether the last character was a "yesYes" or a "yesNo". 12187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If a "yesNo", then we get its trailing ccc from its 12197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // mapping and check for canonical order. 12207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All other cases are ok. 12217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 12227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 12237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fails FCD test, need to decompose and contiguously recompose. 12247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!doCompose) { 12257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 12267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(doCompose) { 12287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(c, cc); 12297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 12307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(prevCC<=cc) { 12317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=cc; 12327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 12337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 12347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 12357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 12377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 12387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 12417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Find appropriate boundaries around this character, 12427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * decompose the source text from between the boundaries, 12437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and recompose it. 12447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 12457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * We may need to remove the last few characters from the ReorderingBuffer 12467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * to account for source text that was copied or appended 12477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * but needs to take part in the recomposition. 12487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 12517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Find the last composition boundary in [prevBoundary..src[. 12527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It is either the decomposition of the current character (at prevSrc), 12537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or prevBoundary. 12547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(hasCompBoundaryBefore(c, norm16)) { 12567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=prevSrc; 12577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(doCompose) { 12587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.removeSuffix(prevSrc-prevBoundary); 12597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the next composition boundary in [src..limit[ - 12627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // modifies src to point to the next starter. 12637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src=findNextCompBoundary(s, src, limit); 12647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 12667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int recomposeStartIndex=buffer.length(); 12677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decomposeShort(s, prevBoundary, src, buffer); 12687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert recompose(buffer, recomposeStartIndex, onlyContiguous); 12697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!doCompose) { 12707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!buffer.equals(s, prevBoundary, src)) { 12717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 12727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.remove(); 12747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=0; 12757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Move to the next starter. We never need to look back before this point again. 12787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 12797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 12817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 12837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Very similar to compose(): Make the same changes in both places if relevant. 12847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 12857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * !doSpan: quickCheck 12867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 12877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * bit 0: set if "maybe"; otherwise, if the span length<s.length() 12887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then the quick check result is "no" 12897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int composeQuickCheck(CharSequence s, int src, int limit, 12917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean onlyContiguous, boolean doSpan) { 12927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int qcResult=0; 12937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int minNoMaybeCP=minCompNoMaybeCP; 12947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 12967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * prevBoundary points to the last character before the current one 12977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * that has a composition boundary before it with ccc==0 and quick check "yes". 12987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevBoundary=src; 13007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevSrc; 13017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=0; 13027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=0; 13037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevCC=0; 13047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 13067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // count code units below the minimum or with irrelevant data for the quick check 13077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(prevSrc=src;;) { 13087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src==limit) { 13097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (src<<1)|qcResult; // "yes" or "maybe" 13107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( (c=s.charAt(src))<minNoMaybeCP || 13127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 13137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 13147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++src; 13157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(!UTF16.isSurrogate((char)c)) { 13167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 13177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c2; 13197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16Plus.isSurrogateLead(c)) { 13207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 13217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint((char)c, c2); 13227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* trail surrogate */ { 13247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 13257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --src; 13267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint(c2, (char)c); 13277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 13307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 13317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 13337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src!=prevSrc) { 13377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set prevBoundary to the last character in the quick check loop. 13387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src-1; 13397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 13407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.isHighSurrogate(s.charAt(prevBoundary-1)) 13417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 13427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --prevBoundary; 13437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=0; 13457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The start of the current character (c). 13467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevSrc=src; 13477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 13507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 13517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 13527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 13537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or has ccc!=0. 13547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isMaybeOrNonZeroCC(norm16)) { 13567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cc=getCCFromYesOrMaybe(norm16); 13577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( onlyContiguous && // FCC 13587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cc!=0 && 13597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC==0 && 13607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary<prevSrc && 13617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // prevCC==0 && prevBoundary<prevSrc tell us that 13627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 13637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // passed the quick check "yes && ccc==0" test. 13647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check whether the last character was a "yesYes" or a "yesNo". 13657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If a "yesNo", then we get its trailing ccc from its 13667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // mapping and check for canonical order. 13677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // All other cases are ok. 13687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 13697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 13707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fails FCD test. 13717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(prevCC<=cc || cc==0) { 13727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=cc; 13737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16<MIN_YES_YES_WITH_CC) { 13747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!doSpan) { 13757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert qcResult=1; 13767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return prevBoundary<<1; // spanYes does not care to know it's "maybe" 13787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 13817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return prevBoundary<<1; // "no" 13847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void composeAndAppend(CharSequence s, 13877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean doCompose, 13887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean onlyContiguous, 13897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ReorderingBuffer buffer) { 13907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int src=0, limit=s.length(); 13917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!buffer.isEmpty()) { 13927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstStarterInSrc=findNextCompBoundary(s, 0, limit); 13937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(0!=firstStarterInSrc) { 13947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 13957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.length()); 13967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 13977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert firstStarterInSrc+16); 13987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 13997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.removeSuffix(buffer.length()-lastStarterInDest); 14007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert middle.append(s, 0, firstStarterInSrc); 14017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 14027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src=firstStarterInSrc; 14037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(doCompose) { 14067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compose(s, src, limit, onlyContiguous, true, buffer); 14077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(s, src, limit); 14097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Dual functionality: 14127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // buffer!=NULL: normalize 14137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 14147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 14157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: In this function we use buffer->appendZeroCC() because we track 14167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the lead and trail combining classes here, rather than leaving it to 14177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the ReorderingBuffer. 14187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The exception is the call to decomposeShort() which uses the buffer 14197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // in the normal way. 14207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 14227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Similar to the prevBoundary in the compose() implementation. 14237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevBoundary=src; 14247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevSrc; 14257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=0; 14267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevFCD16=0; 14277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16=0; 14287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 14307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // count code units with lccc==0 14317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(prevSrc=src; src!=limit;) { 14327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { 14337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16=~c; 14347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++src; 14357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 14367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16=0; 14377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++src; 14387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16.isSurrogate((char)c)) { 14407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c2; 14417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(UTF16Plus.isSurrogateLead(c)) { 14427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 14437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint((char)c, c2); 14447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* trail surrogate */ { 14467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 14477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --src; 14487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.toCodePoint(c2, (char)c); 14497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((fcd16=getFCD16FromNormData(c))<=0xff) { 14537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16=fcd16; 14547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 14557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 14577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // copy these code units all at once 14617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src!=prevSrc) { 14627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(src==limit) { 14637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(buffer!=null) { 14647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.flushAndAppendZeroCC(s, prevSrc, src); 14657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 14677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 14697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We know that the previous character's lccc==0. 14707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevFCD16<0) { 14717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fetching the fcd16 value was deferred for this below-U+0300 code point. 14727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prev=~prevFCD16; 14737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); 14747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevFCD16>1) { 14757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --prevBoundary; 14767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 14787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int p=src-1; 14797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 14807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.isHighSurrogate(s.charAt(p-1)) 14817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 14827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --p; 14837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Need to fetch the previous character's FCD value because 14847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // prevFCD16 was just for the trail surrogate code point. 14857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 14867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 14877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevFCD16>1) { 14897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=p; 14907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(buffer!=null) { 14937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The last lccc==0 character is excluded from the 14947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // flush-and-append call in case it needs to be modified. 14957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 14967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(s, prevBoundary, src); 14977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The start of the current character (c). 14997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevSrc=src; 15007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(src==limit) { 15017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 15027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 15057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 15067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check for proper order, and decompose locally if necessary. 15077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((prevFCD16&0xff)<=(fcd16>>8)) { 15087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // proper order: prev tccc <= current lccc 15097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((fcd16&0xff)<=1) { 15107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 15117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(buffer!=null) { 15137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.appendZeroCC(c); 15147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16=fcd16; 15167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 15177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(buffer==null) { 15187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return prevBoundary; // quick check "no" 15197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 15217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Back out the part of the source that we copied or appended 15227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * already but is now going to be decomposed. 15237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * prevSrc is set to after what was copied/appended. 15247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.removeSuffix(prevSrc-prevBoundary); 15267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 15277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Find the part of the source that needs to be decomposed, 15287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * up to the next safe boundary. 15297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src=findNextFCDBoundary(s, src, limit); 15317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 15327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The source text does not fulfill the conditions for FCD. 15337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Decompose and reorder a limited piece of the text. 15347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decomposeShort(s, prevBoundary, src, buffer); 15367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevBoundary=src; 15377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevFCD16=0; 15387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return src; 15417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { 15437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int src=0, limit=s.length(); 15447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!buffer.isEmpty()) { 15457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); 15467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(0!=firstBoundaryInSrc) { 15477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), 15487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.length()); 15497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ 15507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert firstBoundaryInSrc+16); 15517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); 15527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.removeSuffix(buffer.length()-lastBoundaryInDest); 15537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert middle.append(s, 0, firstBoundaryInSrc); 15547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert makeFCD(middle, 0, middle.length(), buffer); 15557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src=firstBoundaryInSrc; 15567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(doMakeFCD) { 15597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert makeFCD(s, src, limit, buffer); 15607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(s, src, limit); 15627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: hasDecompBoundary() could be implemented as aliases to 15667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 15677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // at the cost of building the FCD trie for a decomposition normalizer. 15687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean hasDecompBoundary(int c, boolean before) { 15697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 15707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<minDecompNoCP) { 15717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 15727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=getNorm16(c); 15747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 15757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 15767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16>MIN_NORMAL_MAYBE_YES) { 15777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; // ccc!=0 15787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isDecompNoAlgorithmic(norm16)) { 15797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=mapAlgorithmic(c, norm16); 15807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data 15827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16); 15837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_LENGTH_MASK)==0) { 15847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 15857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!before) { 15877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // decomp after-boundary: same as hasFCDBoundaryAfter(), 15887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fcd16<=1 || trailCC==0 15897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(firstUnit>0x1ff) { 15907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; // trailCC>1 15917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(firstUnit<=0xff) { 15937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; // trailCC==0 15947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if(trailCC==1) test leadCC==0, same as checking for before-boundary 15967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // true if leadCC==0 (hasFCDBoundaryBefore()) 15987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; 15997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 16037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean hasCompBoundaryBefore(int c) { 16057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 16067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) { 16087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 16097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=getNorm16(c); 16107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isInert(norm16)) { 16117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 16127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16<=minYesNo) { 16137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul: norm16==minYesNo 16147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul LVT has a boundary after it. 16157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul LV and non-inert yesYes characters combine forward. 16167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c); 16177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 16187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 16197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isDecompNoAlgorithmic(norm16)) { 16207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=mapAlgorithmic(c, norm16); 16217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 16227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data. 16237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If testInert, then c must be a yesNo character which has lccc=0, 16247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise it could be a noNo. 16257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16); 16267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // true if 16277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // not MAPPING_NO_COMP_BOUNDARY_AFTER 16287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (which is set if 16297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c is not deleted, and 16307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // it and its decomposition do not combine forward, and it has a starter) 16317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and if FCC then trailCC<=1 16327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 16337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && 16347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (!onlyContiguous || firstUnit<=0x1ff); 16357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; } 16407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean hasFCDBoundaryAfter(int c) { 16417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int fcd16=getFCD16(c); 16427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return fcd16<=1 || (fcd16&0xff)==0; 16437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isFCDInert(int c) { return getFCD16(c)<=1; } 16457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 16477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 16487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean isInert(int norm16) { return norm16==0; } 16497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean isJamoL(int norm16) { return norm16==1; } 16507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 16517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isHangul(int norm16) { return norm16==minYesNo; } 16527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 16537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UBool isCompYes(uint16_t norm16) const { 16547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 16557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // } 16567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UBool isCompYesOrMaybe(uint16_t norm16) const { 16577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // return norm16<minNoNo || minMaybeYes<=norm16; 16587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // } 16597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private boolean hasZeroCCFromDecompYes(int norm16) { 16607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 16617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // } 16627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isDecompYesAndZeroCC(int norm16) { 16637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return norm16<minYesNo || 16647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16==JAMO_VT || 16657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 16667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 16687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A little faster and simpler than isDecompYesAndZeroCC() but does not include 16697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the MaybeYes which combine-forward and have ccc=0. 16707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (Standard Unicode 5.2 normalization does not have such characters.) 16717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 16727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isMostDecompYesAndZeroCC(int norm16) { 16737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 16747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 16767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 16777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For use with isCompYes(). 16787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 16797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // static uint8_t getCCFromYes(uint16_t norm16) { 16807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 16817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // } 16827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int getCCFromNoNo(int norm16) { 16837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 16847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return extraData.charAt(norm16-1)&0xff; 16857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 16867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 16877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 16907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { 16917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c; 16927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cpStart==(cpLimit-1)) { 16937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=s.charAt(cpStart); 16947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 16957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=Character.codePointAt(s, cpStart); 16967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 16977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int prevNorm16=getNorm16(c); 16987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prevNorm16<=minYesNo) { 16997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 17007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 17017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return extraData.charAt(prevNorm16)>>8; // tccc from yesNo 17027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Requires algorithmic-NoNo. 17067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int mapAlgorithmic(int c, int norm16) { 17077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return c+norm16-(minMaybeYes-MAX_DELTA-1); 17087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Requires minYesNo<norm16<limitNoNo. 17117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private int getMapping(int norm16) { return /*extraData+*/norm16; } 17127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return index into maybeYesCompositions, or -1 17157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int getCompositionsListForDecompYes(int norm16) { 17177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 17187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 17197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 17207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((norm16-=minMaybeYes)<0) { 17217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // norm16<minMaybeYes: index into extraData which is a substring at 17227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 17237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 17247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 17257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return norm16; 17277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return index into maybeYesCompositions 17317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int getCompositionsListForComposite(int norm16) { 17337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // composite has both mapping & compositions list 17347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16); 17357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions 17367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1+ // +1 to skip the first unit with the mapping lenth 17377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 17387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param c code point must have compositions 17417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return index into maybeYesCompositions 17427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 17437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int getCompositionsList(int norm16) { 17447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return isDecompYes(norm16) ? 17457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getCompositionsListForDecompYes(norm16) : 17467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getCompositionsListForComposite(norm16); 17477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Decompose a short piece of text which is likely to contain characters that 17507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fail the quick check loop and/or where the quick check loop's overhead 17517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // is unlikely to be amortized. 17527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Called by the compose() and makeFCD() implementations. 17537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Public in Java for collation implementation code. 17547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void decomposeShort(CharSequence s, int src, int limit, 17557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ReorderingBuffer buffer) { 17567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(src<limit) { 17577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointAt(s, src); 17587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert src+=Character.charCount(c); 17597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompose(c, getNorm16(c), buffer); 17607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void decompose(int c, int norm16, 17637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ReorderingBuffer buffer) { 17647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Only loops for 1:1 algorithmic mappings. 17657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 17667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // get the decomposition and the lead and trail cc's 17677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isDecompYes(norm16)) { 17687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c does not decompose 17697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(c, getCCFromYesOrMaybe(norm16)); 17707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isHangul(norm16)) { 17717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul syllable: decompose algorithmically 17727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Hangul.decompose(c, buffer); 17737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isDecompNoAlgorithmic(norm16)) { 17747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=mapAlgorithmic(c, norm16); 17757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16=getNorm16(c); 17767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 17777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 17787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data 17797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16); 17807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length=firstUnit&MAPPING_LENGTH_MASK; 17817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int leadCC, trailCC; 17827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert trailCC=firstUnit>>8; 17837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 17847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert leadCC=extraData.charAt(norm16-1)>>8; 17857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 17867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert leadCC=0; 17877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++norm16; // skip over the firstUnit 17897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); 17907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 17927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 17947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 17957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 17967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Finds the recomposition result for 17977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a forward-combining "lead" character, 17987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * specified with a pointer to its compositions list, 17997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and a backward-combining "trail" character. 18007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p>If the lead and trail characters combine, then this function returns 18027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the following "compositeAndFwd" value: 18037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <pre> 18047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Bits 21..1 composite character 18057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Bit 0 set if the composite is a forward-combining starter 18067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * </pre> 18077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * otherwise it returns -1. 18087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p>The compositions list has (trail, compositeAndFwd) pair entries, 18107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * encoded as either pairs or triples of 16-bit units. 18117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The last entry has the high bit of its first unit set. 18127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p>The list is sorted by ascending trail characters (there are no duplicates). 18147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A linear search is used. 18157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p>See normalizer2impl.h for a more detailed description 18177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * of the compositions list format. 18187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 18197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int combine(String compositions, int list, int trail) { 18207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int key1, firstUnit; 18217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(trail<COMP_1_TRAIL_LIMIT) { 18227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // trail character is 0..33FF 18237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // result entry may have 2 or 3 units 18247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert key1=(trail<<1); 18257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(key1>(firstUnit=compositions.charAt(list))) { 18267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list+=2+(firstUnit&COMP_1_TRIPLE); 18277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 18297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&COMP_1_TRIPLE)!=0) { 18307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 18317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return compositions.charAt(list+1); 18337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // trail character is 3400..10FFFF 18377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // result entry has 3 units 18387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 18397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 18407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int secondUnit; 18417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 18427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(key1>(firstUnit=compositions.charAt(list))) { 18437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list+=2+(firstUnit&COMP_1_TRIPLE); 18447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 18457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(key2>(secondUnit=compositions.charAt(list+1))) { 18467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 18477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 18487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list+=3; 18507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 18527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 18537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 18557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 18587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 18627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 18647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param list some character's compositions list 18657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param set recursively receives the composites from these compositions 18667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 18677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addComposites(int list, UnicodeSet set) { 18687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit, compositeAndFwd; 18697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 18707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert firstUnit=maybeYesCompositions.charAt(list); 18717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&COMP_1_TRIPLE)==0) { 18727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositeAndFwd=maybeYesCompositions.charAt(list+1); 18737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list+=2; 18747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 18757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| 18767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert maybeYesCompositions.charAt(list+2); 18777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list+=3; 18787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int composite=compositeAndFwd>>1; 18807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((compositeAndFwd&1)!=0) { 18817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 18827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(composite); 18847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while((firstUnit&COMP_1_LAST_TUPLE)==0); 18857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 18867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 18877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Recomposes the buffer text starting at recomposeStartIndex 18887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (which is in NFD - decomposed and canonically ordered), 18897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and truncates the buffer contents. 18907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 18917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Note that recomposition never lengthens the text: 18927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Any character consists of either one or two code units; 18937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a composition may contain at most one more code unit than the original starter, 18947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * while the combining mark that is removed has at least one code unit. 18957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 18967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 18977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean onlyContiguous) { 18987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder sb=buffer.getStringBuilder(); 18997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int p=recomposeStartIndex; 19007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p==sb.length()) { 19017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 19027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int starter, pRemove; 19057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int compositionsList; 19067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c, compositeAndFwd; 19077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16; 19087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cc, prevCC; 19097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean starterIsSupplementary; 19107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Some of the following variables are not used until we have a forward-combining starter 19127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and are only initialized now to avoid compiler warnings. 19137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositionsList=-1; // used as indicator for whether we have a forward-combining starter 19147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starter=-1; 19157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starterIsSupplementary=false; 19167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=0; 19177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 19197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=sb.codePointAt(p); 19207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p+=Character.charCount(c); 19217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16=getNorm16(c); 19227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cc=getCCFromYesOrMaybe(norm16); 19237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if( // this character combines backward and 19247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isMaybe(norm16) && 19257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we have seen a starter that combines forward and 19267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositionsList>=0 && 19277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the backward-combining character is not blocked 19287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (prevCC<cc || prevCC==0) 19297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ) { 19307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isJamoVT(norm16)) { 19317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c is a Jamo V/T, see if we can compose it with the previous character. 19327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<Hangul.JAMO_T_BASE) { 19337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 19347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 19357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prev<Hangul.JAMO_L_COUNT) { 19367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pRemove=p-1; 19377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char syllable=(char) 19387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (Hangul.HANGUL_BASE+ 19397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 19407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Hangul.JAMO_T_COUNT); 19417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char t; 19427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 19437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++p; 19447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert syllable+=t; // The next character was a Jamo T. 19457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.setCharAt(starter, syllable); 19477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // remove the Jamo V/T 19487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.delete(pRemove, p); 19497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p=pRemove; 19507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 19537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * No "else" for Jamo T: 19547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Since the input is in NFD, there are no Hangul LV syllables that 19557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a Jamo T could combine with. 19567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * All Jamo Ts are combined above when handling Jamo Vs. 19577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 19587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p==sb.length()) { 19597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 19607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositionsList=-1; 19627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 19637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 19647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The starter and the combining mark (c) do combine. 19657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int composite=compositeAndFwd>>1; 19667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Remove the combining mark. 19687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 19697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.delete(pRemove, p); 19707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p=pRemove; 19717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Replace the starter with the composite. 19727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(starterIsSupplementary) { 19737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(composite>0xffff) { 19747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // both are supplementary 19757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 19767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 19777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 19787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.setCharAt(starter, (char)c); 19797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.deleteCharAt(starter+1); 19807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The composite is shorter than the starter, 19817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // move the intermediate characters forward one. 19827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starterIsSupplementary=false; 19837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --p; 19847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(composite>0xffff) { 19867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The composite is longer than the starter, 19877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // move the intermediate characters back one. 19887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starterIsSupplementary=true; 19897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 19907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 19917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++p; 19927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 19937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // both are on the BMP 19947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sb.setCharAt(starter, (char)composite); 19957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 19967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Keep prevCC because we removed the combining mark. 19987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 19997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p==sb.length()) { 20007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 20017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Is the composite a starter that combines forward? 20037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((compositeAndFwd&1)!=0) { 20047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositionsList= 20057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getCompositionsListForComposite(getNorm16(composite)); 20067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositionsList=-1; 20087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We combined; continue with looking for compositions. 20117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 20127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // no combination this time 20167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert prevCC=cc; 20177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p==sb.length()) { 20187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 20197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If c did not combine, then check if it is a starter. 20227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cc==0) { 20237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Found a new starter. 20247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 20257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // It may combine with something, prepare for it. 20267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<=0xffff) { 20277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starterIsSupplementary=false; 20287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starter=p-1; 20297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starterIsSupplementary=true; 20317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert starter=p-2; 20327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(onlyContiguous) { 20357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // FCC: no discontiguous compositions; any intervening character blocks. 20367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compositionsList=-1; 20377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert buffer.flush(); 20407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int composePair(int a, int b) { 20437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 20447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int list; 20457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isInert(norm16)) { 20467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 20477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16<minYesNoMappingsOnly) { 20487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isJamoL(norm16)) { 20497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert b-=Hangul.JAMO_V_BASE; 20507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(0<=b && b<Hangul.JAMO_V_COUNT) { 20517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 20527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (Hangul.HANGUL_BASE+ 20537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 20547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Hangul.JAMO_T_COUNT); 20557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 20577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isHangul(norm16)) { 20597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert b-=Hangul.JAMO_T_BASE; 20607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 20617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return a+b; 20627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 20647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 'a' has a compositions list in extraData 20677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list=norm16; 20687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 20697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list+= // mapping pointer 20707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1+ // +1 to skip the first unit with the mapping lenth 20717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 20727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Turn the offset-into-extraData into an offset-into-maybeYesCompositions. 20747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list+=MIN_NORMAL_MAYBE_YES-minMaybeYes; 20757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 20777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 20787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 20797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert list=norm16-minMaybeYes; // offset into maybeYesCompositions 20807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 20827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return -1; 20837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return combine(maybeYesCompositions, list, b)>>1; 20857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 20867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 20877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 20887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Does c have a composition boundary before it? 20897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * True if its decomposition begins with a character that has 20907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 20917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 20927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (isCompYesAndZeroCC()) so we need not decompose. 20937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 20947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean hasCompBoundaryBefore(int c, int norm16) { 20957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 20967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isCompYesAndZeroCC(norm16)) { 20977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 20987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isMaybeOrNonZeroCC(norm16)) { 20997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 21007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(isDecompNoAlgorithmic(norm16)) { 21017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c=mapAlgorithmic(c, norm16); 21027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert norm16=getNorm16(c); 21037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 21047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c decomposes, get everything from the variable-length extra data 21057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstUnit=extraData.charAt(norm16); 21067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_LENGTH_MASK)==0) { 21077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 21087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { 21107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; // non-zero leadCC 21117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); 21137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findPreviousCompBoundary(CharSequence s, int p) { 21177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(p>0) { 21187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointBefore(s, p); 21197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p-=Character.charCount(c); 21207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(hasCompBoundaryBefore(c)) { 21217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 21227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 21247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but that's probably not worth the extra cost. 21257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return p; 21277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findNextCompBoundary(CharSequence s, int p, int limit) { 21297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(p<limit) { 21307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointAt(s, p); 21317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int norm16=normTrie.get(c); 21327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(hasCompBoundaryBefore(c, norm16)) { 21337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 21347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p+=Character.charCount(c); 21367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return p; 21387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findPreviousFCDBoundary(CharSequence s, int p) { 21417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(p>0) { 21427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointBefore(s, p); 21437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p-=Character.charCount(c); 21447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { 21457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 21467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return p; 21497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findNextFCDBoundary(CharSequence s, int p, int limit) { 21517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(p<limit) { 21527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c=Character.codePointAt(s, p); 21537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { 21547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 21557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p+=Character.charCount(c); 21577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return p; 21597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addToStartSet(Trie2Writable newData, int origin, int decompLead) { 21627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int canonValue=newData.get(decompLead); 21637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 21647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // origin is the first character whose decomposition starts with 21657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the character for which we are setting the value. 21667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newData.set(decompLead, canonValue|origin); 21677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 21687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // origin is not the first character, or it is U+0000. 21697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSet set; 21707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((canonValue&CANON_HAS_SET)==0) { 21717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int firstOrigin=canonValue&CANON_VALUE_MASK; 21727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size(); 21737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newData.set(decompLead, canonValue); 21747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert canonStartSets.add(set=new UnicodeSet()); 21757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(firstOrigin!=0) { 21767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(firstOrigin); 21777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 21797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set=canonStartSets.get(canonValue&CANON_VALUE_MASK); 21807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert set.add(origin); 21827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 21847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @SuppressWarnings("unused") 21867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private VersionInfo dataVersion; 21877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Code point thresholds for quick check codes. 21897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int minDecompNoCP; 21907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int minCompNoMaybeCP; 21917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Norm16 value thresholds for quick check combinations and types of extra data. 21937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int minYesNo; 21947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int minYesNoMappingsOnly; 21957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int minNoNo; 21967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int limitNoNo; 21977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int minMaybeYes; 21987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 21997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Trie2_16 normTrie; 22007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String maybeYesCompositions; 22017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 22027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 22037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int[] tccc180; // [0x180] tccc values for U+0000..U+017F 22047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Trie2_32 canonIterData; 22067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private ArrayList<UnicodeSet> canonStartSets; 22077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 22087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // bits in canonIterData 22097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; 22107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CANON_HAS_COMPOSITIONS = 0x40000000; 22117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CANON_HAS_SET = 0x200000; 22127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int CANON_VALUE_MASK = 0x1fffff; 22137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 2214