1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 ******************************************************************************* 6 * Copyright (C) 2009-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11package android.icu.impl; 12 13import java.io.IOException; 14import java.nio.ByteBuffer; 15import java.util.ArrayList; 16import java.util.Iterator; 17 18import android.icu.text.UTF16; 19import android.icu.text.UnicodeSet; 20import android.icu.util.ICUUncheckedIOException; 21import android.icu.util.VersionInfo; 22 23/** 24 * Low-level implementation of the Unicode Normalization Algorithm. 25 * For the data structure and details see the documentation at the end of 26 * C++ normalizer2impl.h and in the design doc at 27 * http://site.icu-project.org/design/normalization/custom 28 * @hide Only a subset of ICU is exposed in Android 29 */ 30public final class Normalizer2Impl { 31 public static final class Hangul { 32 /* Korean Hangul and Jamo constants */ 33 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 34 public static final int JAMO_L_END=0x1112; 35 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 36 public static final int JAMO_V_END=0x1175; 37 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 38 public static final int JAMO_T_END=0x11c2; 39 40 public static final int HANGUL_BASE=0xac00; 41 public static final int HANGUL_END=0xd7a3; 42 43 public static final int JAMO_L_COUNT=19; 44 public static final int JAMO_V_COUNT=21; 45 public static final int JAMO_T_COUNT=28; 46 47 public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; 48 public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; 49 50 public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; 51 52 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 53 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 54 55 public static boolean isHangul(int c) { 56 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 57 } 58 public static boolean isHangulLV(int c) { 59 c-=HANGUL_BASE; 60 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 61 } 62 public static boolean isJamoL(int c) { 63 return JAMO_L_BASE<=c && c<JAMO_L_LIMIT; 64 } 65 public static boolean isJamoV(int c) { 66 return JAMO_V_BASE<=c && c<JAMO_V_LIMIT; 67 } 68 public static boolean isJamoT(int c) { 69 int t=c-JAMO_T_BASE; 70 return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself 71 } 72 public static boolean isJamo(int c) { 73 return JAMO_L_BASE<=c && c<=JAMO_T_END && 74 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c); 75 } 76 77 /** 78 * Decomposes c, which must be a Hangul syllable, into buffer 79 * and returns the length of the decomposition (2 or 3). 80 */ 81 public static int decompose(int c, Appendable buffer) { 82 try { 83 c-=HANGUL_BASE; 84 int c2=c%JAMO_T_COUNT; 85 c/=JAMO_T_COUNT; 86 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 87 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 88 if(c2==0) { 89 return 2; 90 } else { 91 buffer.append((char)(JAMO_T_BASE+c2)); 92 return 3; 93 } 94 } catch(IOException e) { 95 // Will not occur because we do not write to I/O. 96 throw new ICUUncheckedIOException(e); 97 } 98 } 99 100 /** 101 * Decomposes c, which must be a Hangul syllable, into buffer. 102 * This is the raw, not recursive, decomposition. Its length is always 2. 103 */ 104 public static void getRawDecomposition(int c, Appendable buffer) { 105 try { 106 int orig=c; 107 c-=HANGUL_BASE; 108 int c2=c%JAMO_T_COUNT; 109 if(c2==0) { 110 c/=JAMO_T_COUNT; 111 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 112 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 113 } else { 114 buffer.append((char)(orig-c2)); // LV syllable 115 buffer.append((char)(JAMO_T_BASE+c2)); 116 } 117 } catch(IOException e) { 118 // Will not occur because we do not write to I/O. 119 throw new ICUUncheckedIOException(e); 120 } 121 } 122 } 123 124 /** 125 * Writable buffer that takes care of canonical ordering. 126 * Its Appendable methods behave like the C++ implementation's 127 * appendZeroCC() methods. 128 * <p> 129 * If dest is a StringBuilder, then the buffer writes directly to it. 130 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 131 * until no further changes are necessary and whole segments are appended. 132 * append() methods that take combining-class values always write to the StringBuilder. 133 * Other append() methods flush and append to the Appendable. 134 */ 135 public static final class ReorderingBuffer implements Appendable { 136 public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { 137 impl=ni; 138 app=dest; 139 if(app instanceof StringBuilder) { 140 appIsStringBuilder=true; 141 str=(StringBuilder)dest; 142 // In Java, the constructor subsumes public void init(int destCapacity) { 143 str.ensureCapacity(destCapacity); 144 reorderStart=0; 145 if(str.length()==0) { 146 lastCC=0; 147 } else { 148 setIterator(); 149 lastCC=previousCC(); 150 // Set reorderStart after the last code point with cc<=1 if there is one. 151 if(lastCC>1) { 152 while(previousCC()>1) {} 153 } 154 reorderStart=codePointLimit; 155 } 156 } else { 157 appIsStringBuilder=false; 158 str=new StringBuilder(); 159 reorderStart=0; 160 lastCC=0; 161 } 162 } 163 164 public boolean isEmpty() { return str.length()==0; } 165 public int length() { return str.length(); } 166 public int getLastCC() { return lastCC; } 167 168 public StringBuilder getStringBuilder() { return str; } 169 170 public boolean equals(CharSequence s, int start, int limit) { 171 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 172 } 173 174 public void append(int c, int cc) { 175 if(lastCC<=cc || cc==0) { 176 str.appendCodePoint(c); 177 lastCC=cc; 178 if(cc<=1) { 179 reorderStart=str.length(); 180 } 181 } else { 182 insert(c, cc); 183 } 184 } 185 // s must be in NFD, otherwise change the implementation. 186 public void append(CharSequence s, int start, int limit, 187 int leadCC, int trailCC) { 188 if(start==limit) { 189 return; 190 } 191 if(lastCC<=leadCC || leadCC==0) { 192 if(trailCC<=1) { 193 reorderStart=str.length()+(limit-start); 194 } else if(leadCC<=1) { 195 reorderStart=str.length()+1; // Ok if not a code point boundary. 196 } 197 str.append(s, start, limit); 198 lastCC=trailCC; 199 } else { 200 int c=Character.codePointAt(s, start); 201 start+=Character.charCount(c); 202 insert(c, leadCC); // insert first code point 203 while(start<limit) { 204 c=Character.codePointAt(s, start); 205 start+=Character.charCount(c); 206 if(start<limit) { 207 // s must be in NFD, otherwise we need to use getCC(). 208 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 209 } else { 210 leadCC=trailCC; 211 } 212 append(c, leadCC); 213 } 214 } 215 } 216 // The following append() methods work like C++ appendZeroCC(). 217 // They assume that the cc or trailCC of their input is 0. 218 // Most of them implement Appendable interface methods. 219 @Override 220 public ReorderingBuffer append(char c) { 221 str.append(c); 222 lastCC=0; 223 reorderStart=str.length(); 224 return this; 225 } 226 public void appendZeroCC(int c) { 227 str.appendCodePoint(c); 228 lastCC=0; 229 reorderStart=str.length(); 230 } 231 @Override 232 public ReorderingBuffer append(CharSequence s) { 233 if(s.length()!=0) { 234 str.append(s); 235 lastCC=0; 236 reorderStart=str.length(); 237 } 238 return this; 239 } 240 @Override 241 public ReorderingBuffer append(CharSequence s, int start, int limit) { 242 if(start!=limit) { 243 str.append(s, start, limit); 244 lastCC=0; 245 reorderStart=str.length(); 246 } 247 return this; 248 } 249 /** 250 * Flushes from the intermediate StringBuilder to the Appendable, 251 * if they are different objects. 252 * Used after recomposition. 253 * Must be called at the end when writing to a non-StringBuilder Appendable. 254 */ 255 public void flush() { 256 if(appIsStringBuilder) { 257 reorderStart=str.length(); 258 } else { 259 try { 260 app.append(str); 261 str.setLength(0); 262 reorderStart=0; 263 } catch(IOException e) { 264 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 265 } 266 } 267 lastCC=0; 268 } 269 /** 270 * Flushes from the intermediate StringBuilder to the Appendable, 271 * if they are different objects. 272 * Then appends the new text to the Appendable or StringBuilder. 273 * Normally used after quick check loops find a non-empty sequence. 274 */ 275 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 276 if(appIsStringBuilder) { 277 str.append(s, start, limit); 278 reorderStart=str.length(); 279 } else { 280 try { 281 app.append(str).append(s, start, limit); 282 str.setLength(0); 283 reorderStart=0; 284 } catch(IOException e) { 285 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 286 } 287 } 288 lastCC=0; 289 return this; 290 } 291 public void remove() { 292 str.setLength(0); 293 lastCC=0; 294 reorderStart=0; 295 } 296 public void removeSuffix(int suffixLength) { 297 int oldLength=str.length(); 298 str.delete(oldLength-suffixLength, oldLength); 299 lastCC=0; 300 reorderStart=str.length(); 301 } 302 303 /* 304 * TODO: Revisit whether it makes sense to track reorderStart. 305 * It is set to after the last known character with cc<=1, 306 * which stops previousCC() before it reads that character and looks up its cc. 307 * previousCC() is normally only called from insert(). 308 * In other words, reorderStart speeds up the insertion of a combining mark 309 * into a multi-combining mark sequence where it does not belong at the end. 310 * This might not be worth the trouble. 311 * On the other hand, it's not a huge amount of trouble. 312 * 313 * We probably need it for UNORM_SIMPLE_APPEND. 314 */ 315 316 // Inserts c somewhere before the last character. 317 // Requires 0<cc<lastCC which implies reorderStart<limit. 318 private void insert(int c, int cc) { 319 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 320 // insert c at codePointLimit, after the character with prevCC<=cc 321 if(c<=0xffff) { 322 str.insert(codePointLimit, (char)c); 323 if(cc<=1) { 324 reorderStart=codePointLimit+1; 325 } 326 } else { 327 str.insert(codePointLimit, Character.toChars(c)); 328 if(cc<=1) { 329 reorderStart=codePointLimit+2; 330 } 331 } 332 } 333 334 private final Normalizer2Impl impl; 335 private final Appendable app; 336 private final StringBuilder str; 337 private final boolean appIsStringBuilder; 338 private int reorderStart; 339 private int lastCC; 340 341 // private backward iterator 342 private void setIterator() { codePointStart=str.length(); } 343 private void skipPrevious() { // Requires 0<codePointStart. 344 codePointLimit=codePointStart; 345 codePointStart=str.offsetByCodePoints(codePointStart, -1); 346 } 347 private int previousCC() { // Returns 0 if there is no previous character. 348 codePointLimit=codePointStart; 349 if(reorderStart>=codePointStart) { 350 return 0; 351 } 352 int c=str.codePointBefore(codePointStart); 353 codePointStart-=Character.charCount(c); 354 return impl.getCCFromYesOrMaybeCP(c); 355 } 356 357 private int codePointStart, codePointLimit; 358 } 359 360 // TODO: Propose as public API on the UTF16 class. 361 // TODO: Propose widening UTF16 methods that take char to take int. 362 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 363 public static final class UTF16Plus { 364 /** 365 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 366 * is it a lead surrogate? 367 * @param c code unit or code point 368 * @return true or false 369 */ 370 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 371 /** 372 * Compares two CharSequence objects for binary equality. 373 * @param s1 first sequence 374 * @param s2 second sequence 375 * @return true if s1 contains the same text as s2 376 */ 377 public static boolean equal(CharSequence s1, CharSequence s2) { 378 if(s1==s2) { 379 return true; 380 } 381 int length=s1.length(); 382 if(length!=s2.length()) { 383 return false; 384 } 385 for(int i=0; i<length; ++i) { 386 if(s1.charAt(i)!=s2.charAt(i)) { 387 return false; 388 } 389 } 390 return true; 391 } 392 /** 393 * Compares two CharSequence subsequences for binary equality. 394 * @param s1 first sequence 395 * @param start1 start offset in first sequence 396 * @param limit1 limit offset in first sequence 397 * @param s2 second sequence 398 * @param start2 start offset in second sequence 399 * @param limit2 limit offset in second sequence 400 * @return true if s1.subSequence(start1, limit1) contains the same text 401 * as s2.subSequence(start2, limit2) 402 */ 403 public static boolean equal(CharSequence s1, int start1, int limit1, 404 CharSequence s2, int start2, int limit2) { 405 if((limit1-start1)!=(limit2-start2)) { 406 return false; 407 } 408 if(s1==s2 && start1==start2) { 409 return true; 410 } 411 while(start1<limit1) { 412 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 413 return false; 414 } 415 } 416 return true; 417 } 418 } 419 420 public Normalizer2Impl() {} 421 422 private static final class IsAcceptable implements ICUBinary.Authenticate { 423 @Override 424 public boolean isDataVersionAcceptable(byte version[]) { 425 return version[0]==3; 426 } 427 } 428 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 429 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 430 431 public Normalizer2Impl load(ByteBuffer bytes) { 432 try { 433 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 434 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 435 if(indexesLength<=IX_MIN_LCCC_CP) { 436 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes"); 437 } 438 int[] inIndexes=new int[indexesLength]; 439 inIndexes[0]=indexesLength*4; 440 for(int i=1; i<indexesLength; ++i) { 441 inIndexes[i]=bytes.getInt(); 442 } 443 444 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 445 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 446 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 447 448 minYesNo=inIndexes[IX_MIN_YES_NO]; 449 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 450 minNoNo=inIndexes[IX_MIN_NO_NO]; 451 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 452 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 453 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 454 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 455 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 456 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 457 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 458 459 // Read the normTrie. 460 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 461 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 462 normTrie=Trie2_16.createFromSerialized(bytes); 463 int trieLength=normTrie.getSerializedLength(); 464 if(trieLength>(nextOffset-offset)) { 465 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); 466 } 467 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 468 469 // Read the composition and mapping data. 470 offset=nextOffset; 471 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 472 int numChars=(nextOffset-offset)/2; 473 if(numChars!=0) { 474 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 475 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 476 } 477 478 // smallFCD: new in formatVersion 2 479 offset=nextOffset; 480 smallFCD=new byte[0x100]; 481 bytes.get(smallFCD); 482 483 return this; 484 } catch(IOException e) { 485 throw new ICUUncheckedIOException(e); 486 } 487 } 488 public Normalizer2Impl load(String name) { 489 return load(ICUBinary.getRequiredData(name)); 490 } 491 492 private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) { 493 if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) { 494 set.add(start, end); 495 } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { 496 int fcd16=getFCD16(start); 497 if(fcd16>0xff) { set.add(start, end); } 498 } 499 } 500 501 private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) { 502 /* add the start code point to the USet */ 503 set.add(start); 504 if(start!=end && isAlgorithmicNoNo(value) && (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) { 505 // Range of code points with same-norm16-value algorithmic decompositions. 506 // They might have different non-zero FCD16 values. 507 int prevFCD16=getFCD16(start); 508 while(++start<=end) { 509 int fcd16=getFCD16(start); 510 if(fcd16!=prevFCD16) { 511 set.add(start); 512 prevFCD16=fcd16; 513 } 514 } 515 } 516 } 517 518 public void addLcccChars(UnicodeSet set) { 519 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 520 Trie2.Range range; 521 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 522 enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set); 523 } 524 } 525 526 public void addPropertyStarts(UnicodeSet set) { 527 /* add the start code point of each same-value range of each trie */ 528 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 529 Trie2.Range range; 530 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 531 enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set); 532 } 533 534 /* add Hangul LV syllables and LV+1 because of skippables */ 535 for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) { 536 set.add(c); 537 set.add(c+1); 538 } 539 set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 540 } 541 542 public void addCanonIterPropertyStarts(UnicodeSet set) { 543 /* add the start code point of each same-value range of the canonical iterator data trie */ 544 ensureCanonIterData(); 545 // currently only used for the SEGMENT_STARTER property 546 Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper); 547 Trie2.Range range; 548 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 549 /* add the start code point to the USet */ 550 set.add(range.startCodePoint); 551 } 552 } 553 private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() { 554 @Override 555 public int map(int in) { 556 return in&CANON_NOT_SEGMENT_STARTER; 557 } 558 }; 559 560 // low-level properties ------------------------------------------------ *** 561 562 // Note: Normalizer2Impl.java r30983 (2011-nov-27) 563 // still had getFCDTrie() which built and cached an FCD trie. 564 // That provided faster access to FCD data than getFCD16FromNormData() 565 // but required synchronization and consumed some 10kB of heap memory 566 // in any process that uses FCD (e.g., via collation). 567 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance, 568 // at least for ASCII & CJK. 569 570 /** 571 * Builds the canonical-iterator data for this instance. 572 * This is required before any of {@link #isCanonSegmentStarter(int)} or 573 * {@link #getCanonStartSet(int, UnicodeSet)} are called, 574 * or else they crash. 575 * @return this 576 */ 577 public synchronized Normalizer2Impl ensureCanonIterData() { 578 if(canonIterData==null) { 579 Trie2Writable newData=new Trie2Writable(0, 0); 580 canonStartSets=new ArrayList<UnicodeSet>(); 581 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 582 Trie2.Range range; 583 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 584 final int norm16=range.value; 585 if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) { 586 // Inert, or 2-way mapping (including Hangul syllable). 587 // We do not write a canonStartSet for any yesNo character. 588 // Composites from 2-way mappings are added at runtime from the 589 // starter's compositions list, and the other characters in 590 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 591 // "maybe" characters. 592 continue; 593 } 594 for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) { 595 final int oldValue=newData.get(c); 596 int newValue=oldValue; 597 if(isMaybeOrNonZeroCC(norm16)) { 598 // not a segment starter if it occurs in a decomposition or has cc!=0 599 newValue|=CANON_NOT_SEGMENT_STARTER; 600 if(norm16<MIN_NORMAL_MAYBE_YES) { 601 newValue|=CANON_HAS_COMPOSITIONS; 602 } 603 } else if(norm16<minYesNo) { 604 newValue|=CANON_HAS_COMPOSITIONS; 605 } else { 606 // c has a one-way decomposition 607 int c2=c; 608 // Do not modify the whole-range norm16 value. 609 int norm16_2=norm16; 610 if (isDecompNoAlgorithmic(norm16_2)) { 611 // Maps to an isCompYesAndZeroCC. 612 c2 = mapAlgorithmic(c2, norm16_2); 613 norm16_2 = getNorm16(c2); 614 // No compatibility mappings for the CanonicalIterator. 615 assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2))); 616 } 617 if (norm16_2 > minYesNo) { 618 // c decomposes, get everything from the variable-length extra data 619 int mapping=norm16_2>>OFFSET_SHIFT; 620 int firstUnit=extraData.charAt(mapping); 621 int length=firstUnit&MAPPING_LENGTH_MASK; 622 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 623 if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) { 624 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 625 } 626 } 627 // Skip empty mappings (no characters in the decomposition). 628 if(length!=0) { 629 ++mapping; // skip over the firstUnit 630 // add c to first code point's start set 631 int limit=mapping+length; 632 c2=extraData.codePointAt(mapping); 633 addToStartSet(newData, c, c2); 634 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 635 // one-way mapping. A 2-way mapping is possible here after 636 // intermediate algorithmic mapping. 637 if(norm16_2>=minNoNo) { 638 while((mapping+=Character.charCount(c2))<limit) { 639 c2=extraData.codePointAt(mapping); 640 int c2Value=newData.get(c2); 641 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 642 newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER); 643 } 644 } 645 } 646 } 647 } else { 648 // c decomposed to c2 algorithmically; c has cc==0 649 addToStartSet(newData, c, c2); 650 } 651 } 652 if(newValue!=oldValue) { 653 newData.set(c, newValue); 654 } 655 } 656 } 657 canonIterData=newData.toTrie2_32(); 658 } 659 return this; 660 } 661 662 public int getNorm16(int c) { return normTrie.get(c); } 663 664 public int getCompQuickCheck(int norm16) { 665 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 666 return 1; // yes 667 } else if(minMaybeYes<=norm16) { 668 return 2; // maybe 669 } else { 670 return 0; // no 671 } 672 } 673 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 674 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 675 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 676 677 public int getCC(int norm16) { 678 if(norm16>=MIN_NORMAL_MAYBE_YES) { 679 return getCCFromNormalYesOrMaybe(norm16); 680 } 681 if(norm16<minNoNo || limitNoNo<=norm16) { 682 return 0; 683 } 684 return getCCFromNoNo(norm16); 685 } 686 public static int getCCFromNormalYesOrMaybe(int norm16) { 687 return (norm16 >> OFFSET_SHIFT) & 0xff; 688 } 689 public static int getCCFromYesOrMaybe(int norm16) { 690 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 691 } 692 public int getCCFromYesOrMaybeCP(int c) { 693 if (c < minCompNoMaybeCP) { return 0; } 694 return getCCFromYesOrMaybe(getNorm16(c)); 695 } 696 697 /** 698 * Returns the FCD data for code point c. 699 * @param c A Unicode code point. 700 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 701 */ 702 public int getFCD16(int c) { 703 if(c<minDecompNoCP) { 704 return 0; 705 } else if(c<=0xffff) { 706 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 707 } 708 return getFCD16FromNormData(c); 709 } 710 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 711 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 712 // 0<=lead<=0xffff 713 byte bits=smallFCD[lead>>8]; 714 if(bits==0) { return false; } 715 return ((bits>>((lead>>5)&7))&1)!=0; 716 } 717 718 /** Gets the FCD value from the regular normalization data. */ 719 public int getFCD16FromNormData(int c) { 720 int norm16=getNorm16(c); 721 if (norm16 >= limitNoNo) { 722 if(norm16>=MIN_NORMAL_MAYBE_YES) { 723 // combining mark 724 norm16=getCCFromNormalYesOrMaybe(norm16); 725 return norm16|(norm16<<8); 726 } else if(norm16>=minMaybeYes) { 727 return 0; 728 } else { // isDecompNoAlgorithmic(norm16) 729 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 730 if (deltaTrailCC <= DELTA_TCCC_1) { 731 return deltaTrailCC >> OFFSET_SHIFT; 732 } 733 // Maps to an isCompYesAndZeroCC. 734 c=mapAlgorithmic(c, norm16); 735 norm16=getNorm16(c); 736 } 737 } 738 if(norm16<=minYesNo || isHangulLVT(norm16)) { 739 // no decomposition or Hangul syllable, all zeros 740 return 0; 741 } 742 // c decomposes, get everything from the variable-length extra data 743 int mapping=norm16>>OFFSET_SHIFT; 744 int firstUnit=extraData.charAt(mapping); 745 int fcd16=firstUnit>>8; // tccc 746 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 747 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 748 } 749 return fcd16; 750 } 751 752 /** 753 * Gets the decomposition for one code point. 754 * @param c code point 755 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 756 */ 757 public String getDecomposition(int c) { 758 int norm16; 759 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 760 // c does not decompose 761 return null; 762 } 763 int decomp = -1; 764 if(isDecompNoAlgorithmic(norm16)) { 765 // Maps to an isCompYesAndZeroCC. 766 decomp=c=mapAlgorithmic(c, norm16); 767 // The mapping might decompose further. 768 norm16 = getNorm16(c); 769 } 770 if (norm16 < minYesNo) { 771 if(decomp<0) { 772 return null; 773 } else { 774 return UTF16.valueOf(decomp); 775 } 776 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 777 // Hangul syllable: decompose algorithmically 778 StringBuilder buffer=new StringBuilder(); 779 Hangul.decompose(c, buffer); 780 return buffer.toString(); 781 } 782 // c decomposes, get everything from the variable-length extra data 783 int mapping=norm16>>OFFSET_SHIFT; 784 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 785 return extraData.substring(mapping, mapping+length); 786 } 787 788 /** 789 * Gets the raw decomposition for one code point. 790 * @param c code point 791 * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition 792 */ 793 public String getRawDecomposition(int c) { 794 int norm16; 795 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 796 // c does not decompose 797 return null; 798 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 799 // Hangul syllable: decompose algorithmically 800 StringBuilder buffer=new StringBuilder(); 801 Hangul.getRawDecomposition(c, buffer); 802 return buffer.toString(); 803 } else if(isDecompNoAlgorithmic(norm16)) { 804 return UTF16.valueOf(mapAlgorithmic(c, norm16)); 805 } 806 // c decomposes, get everything from the variable-length extra data 807 int mapping=norm16>>OFFSET_SHIFT; 808 int firstUnit=extraData.charAt(mapping); 809 int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 810 if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { 811 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 812 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 813 int rawMapping=mapping-((firstUnit>>7)&1)-1; 814 char rm0=extraData.charAt(rawMapping); 815 if(rm0<=MAPPING_LENGTH_MASK) { 816 return extraData.substring(rawMapping-rm0, rawMapping); 817 } else { 818 // Copy the normal mapping and replace its first two code units with rm0. 819 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); 820 mapping+=1+2; // skip over the firstUnit and the first two mapping code units 821 return buffer.append(extraData, mapping, mapping+mLength-2).toString(); 822 } 823 } else { 824 mapping+=1; // skip over the firstUnit 825 return extraData.substring(mapping, mapping+mLength); 826 } 827 } 828 829 /** 830 * Returns true if code point c starts a canonical-iterator string segment. 831 * <b>{@link #ensureCanonIterData()} must have been called before this method, 832 * or else this method will crash.</b> 833 * @param c A Unicode code point. 834 * @return true if c starts a canonical-iterator string segment. 835 */ 836 public boolean isCanonSegmentStarter(int c) { 837 return canonIterData.get(c)>=0; 838 } 839 /** 840 * Returns true if there are characters whose decomposition starts with c. 841 * If so, then the set is cleared and then filled with those characters. 842 * <b>{@link #ensureCanonIterData()} must have been called before this method, 843 * or else this method will crash.</b> 844 * @param c A Unicode code point. 845 * @param set A UnicodeSet to receive the characters whose decompositions 846 * start with c, if there are any. 847 * @return true if there are characters whose decomposition starts with c. 848 */ 849 public boolean getCanonStartSet(int c, UnicodeSet set) { 850 int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; 851 if(canonValue==0) { 852 return false; 853 } 854 set.clear(); 855 int value=canonValue&CANON_VALUE_MASK; 856 if((canonValue&CANON_HAS_SET)!=0) { 857 set.addAll(canonStartSets.get(value)); 858 } else if(value!=0) { 859 set.add(value); 860 } 861 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 862 int norm16=getNorm16(c); 863 if(norm16==JAMO_L) { 864 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; 865 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); 866 } else { 867 addComposites(getCompositionsList(norm16), set); 868 } 869 } 870 return true; 871 } 872 873 // Fixed norm16 values. 874 public static final int MIN_YES_YES_WITH_CC=0xfe02; 875 public static final int JAMO_VT=0xfe00; 876 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 877 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 878 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 879 880 // norm16 bit 0 is comp-boundary-after. 881 public static final int HAS_COMP_BOUNDARY_AFTER=1; 882 public static final int OFFSET_SHIFT=1; 883 884 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 885 // tccc (0, 1, >1) for quick FCC boundary-after tests. 886 public static final int DELTA_TCCC_0=0; 887 public static final int DELTA_TCCC_1=2; 888 public static final int DELTA_TCCC_GT_1=4; 889 public static final int DELTA_TCCC_MASK=6; 890 public static final int DELTA_SHIFT=3; 891 892 public static final int MAX_DELTA=0x40; 893 894 // Byte offsets from the start of the data, after the generic header. 895 public static final int IX_NORM_TRIE_OFFSET=0; 896 public static final int IX_EXTRA_DATA_OFFSET=1; 897 public static final int IX_SMALL_FCD_OFFSET=2; 898 public static final int IX_RESERVED3_OFFSET=3; 899 public static final int IX_TOTAL_SIZE=7; 900 901 // Code point thresholds for quick check codes. 902 public static final int IX_MIN_DECOMP_NO_CP=8; 903 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 904 905 // Norm16 value thresholds for quick check combinations and types of extra data. 906 907 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 908 public static final int IX_MIN_YES_NO=10; 909 /** Mappings are comp-normalized. */ 910 public static final int IX_MIN_NO_NO=11; 911 public static final int IX_LIMIT_NO_NO=12; 912 public static final int IX_MIN_MAYBE_YES=13; 913 914 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 915 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 916 /** Mappings are not comp-normalized but have a comp boundary before. */ 917 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 918 /** Mappings do not have a comp boundary before. */ 919 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 920 /** Mappings to the empty string. */ 921 public static final int IX_MIN_NO_NO_EMPTY=17; 922 923 public static final int IX_MIN_LCCC_CP=18; 924 public static final int IX_COUNT=20; 925 926 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 927 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 928 // unused bit 0x20; 929 public static final int MAPPING_LENGTH_MASK=0x1f; 930 931 public static final int COMP_1_LAST_TUPLE=0x8000; 932 public static final int COMP_1_TRIPLE=1; 933 public static final int COMP_1_TRAIL_LIMIT=0x3400; 934 public static final int COMP_1_TRAIL_MASK=0x7ffe; 935 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 936 public static final int COMP_2_TRAIL_SHIFT=6; 937 public static final int COMP_2_TRAIL_MASK=0xffc0; 938 939 // higher-level functionality ------------------------------------------ *** 940 941 // NFD without an NFD Normalizer2 instance. 942 public Appendable decompose(CharSequence s, StringBuilder dest) { 943 decompose(s, 0, s.length(), dest, s.length()); 944 return dest; 945 } 946 /** 947 * Decomposes s[src, limit[ and writes the result to dest. 948 * limit can be NULL if src is NUL-terminated. 949 * destLengthEstimate is the initial dest buffer capacity and can be -1. 950 */ 951 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 952 int destLengthEstimate) { 953 if(destLengthEstimate<0) { 954 destLengthEstimate=limit-src; 955 } 956 dest.setLength(0); 957 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 958 decompose(s, src, limit, buffer); 959 } 960 961 // Dual functionality: 962 // buffer!=NULL: normalize 963 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 964 public int decompose(CharSequence s, int src, int limit, 965 ReorderingBuffer buffer) { 966 int minNoCP=minDecompNoCP; 967 968 int prevSrc; 969 int c=0; 970 int norm16=0; 971 972 // only for quick check 973 int prevBoundary=src; 974 int prevCC=0; 975 976 for(;;) { 977 // count code units below the minimum or with irrelevant data for the quick check 978 for(prevSrc=src; src!=limit;) { 979 if( (c=s.charAt(src))<minNoCP || 980 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 981 ) { 982 ++src; 983 } else if(!UTF16.isSurrogate((char)c)) { 984 break; 985 } else { 986 char c2; 987 if(UTF16Plus.isSurrogateLead(c)) { 988 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 989 c=Character.toCodePoint((char)c, c2); 990 } 991 } else /* trail surrogate */ { 992 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 993 --src; 994 c=Character.toCodePoint(c2, (char)c); 995 } 996 } 997 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 998 src+=Character.charCount(c); 999 } else { 1000 break; 1001 } 1002 } 1003 } 1004 // copy these code units all at once 1005 if(src!=prevSrc) { 1006 if(buffer!=null) { 1007 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1008 } else { 1009 prevCC=0; 1010 prevBoundary=src; 1011 } 1012 } 1013 if(src==limit) { 1014 break; 1015 } 1016 1017 // Check one above-minimum, relevant code point. 1018 src+=Character.charCount(c); 1019 if(buffer!=null) { 1020 decompose(c, norm16, buffer); 1021 } else { 1022 if(isDecompYes(norm16)) { 1023 int cc=getCCFromYesOrMaybe(norm16); 1024 if(prevCC<=cc || cc==0) { 1025 prevCC=cc; 1026 if(cc<=1) { 1027 prevBoundary=src; 1028 } 1029 continue; 1030 } 1031 } 1032 return prevBoundary; // "no" or cc out of order 1033 } 1034 } 1035 return src; 1036 } 1037 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 1038 int limit=s.length(); 1039 if(limit==0) { 1040 return; 1041 } 1042 if(doDecompose) { 1043 decompose(s, 0, limit, buffer); 1044 return; 1045 } 1046 // Just merge the strings at the boundary. 1047 int c=Character.codePointAt(s, 0); 1048 int src=0; 1049 int firstCC, prevCC, cc; 1050 firstCC=prevCC=cc=getCC(getNorm16(c)); 1051 while(cc!=0) { 1052 prevCC=cc; 1053 src+=Character.charCount(c); 1054 if(src>=limit) { 1055 break; 1056 } 1057 c=Character.codePointAt(s, src); 1058 cc=getCC(getNorm16(c)); 1059 }; 1060 buffer.append(s, 0, src, firstCC, prevCC); 1061 buffer.append(s, src, limit); 1062 } 1063 1064 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 1065 // doCompose: normalize 1066 // !doCompose: isNormalized (buffer must be empty and initialized) 1067 public boolean compose(CharSequence s, int src, int limit, 1068 boolean onlyContiguous, 1069 boolean doCompose, 1070 ReorderingBuffer buffer) { 1071 int prevBoundary=src; 1072 int minNoMaybeCP=minCompNoMaybeCP; 1073 1074 for (;;) { 1075 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1076 // or with (compYes && ccc==0) properties. 1077 int prevSrc; 1078 int c = 0; 1079 int norm16 = 0; 1080 for (;;) { 1081 if (src == limit) { 1082 if (prevBoundary != limit && doCompose) { 1083 buffer.append(s, prevBoundary, limit); 1084 } 1085 return true; 1086 } 1087 if( (c=s.charAt(src))<minNoMaybeCP || 1088 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1089 ) { 1090 ++src; 1091 } else { 1092 prevSrc = src++; 1093 if(!UTF16.isSurrogate((char)c)) { 1094 break; 1095 } else { 1096 char c2; 1097 if(UTF16Plus.isSurrogateLead(c)) { 1098 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 1099 ++src; 1100 c=Character.toCodePoint((char)c, c2); 1101 } 1102 } else /* trail surrogate */ { 1103 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 1104 --prevSrc; 1105 c=Character.toCodePoint(c2, (char)c); 1106 } 1107 } 1108 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 1109 break; 1110 } 1111 } 1112 } 1113 } 1114 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1115 // The current character is either a "noNo" (has a mapping) 1116 // or a "maybeYes" (combines backward) 1117 // or a "yesYes" with ccc!=0. 1118 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1119 1120 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 1121 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 1122 if (!doCompose) { 1123 return false; 1124 } 1125 // Fast path for mapping a character that is immediately surrounded by boundaries. 1126 // In this case, we need not decompose around the current character. 1127 if (isDecompNoAlgorithmic(norm16)) { 1128 // Maps to a single isCompYesAndZeroCC character 1129 // which also implies hasCompBoundaryBefore. 1130 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1131 hasCompBoundaryBefore(s, src, limit)) { 1132 if (prevBoundary != prevSrc) { 1133 buffer.append(s, prevBoundary, prevSrc); 1134 } 1135 buffer.append(mapAlgorithmic(c, norm16), 0); 1136 prevBoundary = src; 1137 continue; 1138 } 1139 } else if (norm16 < minNoNoCompBoundaryBefore) { 1140 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 1141 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1142 hasCompBoundaryBefore(s, src, limit)) { 1143 if (prevBoundary != prevSrc) { 1144 buffer.append(s, prevBoundary, prevSrc); 1145 } 1146 int mapping = norm16 >> OFFSET_SHIFT; 1147 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 1148 buffer.append(extraData, mapping, mapping + length); 1149 prevBoundary = src; 1150 continue; 1151 } 1152 } else if (norm16 >= minNoNoEmpty) { 1153 // The current character maps to nothing. 1154 // Simply omit it from the output if there is a boundary before _or_ after it. 1155 // The character itself implies no boundaries. 1156 if (hasCompBoundaryBefore(s, src, limit) || 1157 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 1158 if (prevBoundary != prevSrc) { 1159 buffer.append(s, prevBoundary, prevSrc); 1160 } 1161 prevBoundary = src; 1162 continue; 1163 } 1164 } 1165 // Other "noNo" type, or need to examine more text around this character: 1166 // Fall through to the slow path. 1167 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 1168 char prev=s.charAt(prevSrc-1); 1169 if(c<Hangul.JAMO_T_BASE) { 1170 // The current character is a Jamo Vowel, 1171 // compose with previous Jamo L and following Jamo T. 1172 char l = (char)(prev-Hangul.JAMO_L_BASE); 1173 if(l<Hangul.JAMO_L_COUNT) { 1174 if (!doCompose) { 1175 return false; 1176 } 1177 int t; 1178 if (src != limit && 1179 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 1180 t < Hangul.JAMO_T_COUNT) { 1181 // The next character is a Jamo T. 1182 ++src; 1183 } else if (hasCompBoundaryBefore(s, src, limit)) { 1184 // No Jamo T follows, not even via decomposition. 1185 t = 0; 1186 } else { 1187 t = -1; 1188 } 1189 if (t >= 0) { 1190 int syllable = Hangul.HANGUL_BASE + 1191 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 1192 Hangul.JAMO_T_COUNT + t; 1193 --prevSrc; // Replace the Jamo L as well. 1194 if (prevBoundary != prevSrc) { 1195 buffer.append(s, prevBoundary, prevSrc); 1196 } 1197 buffer.append((char)syllable); 1198 prevBoundary = src; 1199 continue; 1200 } 1201 // If we see L+V+x where x!=T then we drop to the slow path, 1202 // decompose and recompose. 1203 // This is to deal with NFKC finding normal L and V but a 1204 // compatibility variant of a T. 1205 // We need to either fully compose that combination here 1206 // (which would complicate the code and may not work with strange custom data) 1207 // or use the slow path. 1208 } 1209 } else if (Hangul.isHangulLV(prev)) { 1210 // The current character is a Jamo Trailing consonant, 1211 // compose with previous Hangul LV that does not contain a Jamo T. 1212 if (!doCompose) { 1213 return false; 1214 } 1215 int syllable = prev + c - Hangul.JAMO_T_BASE; 1216 --prevSrc; // Replace the Hangul LV as well. 1217 if (prevBoundary != prevSrc) { 1218 buffer.append(s, prevBoundary, prevSrc); 1219 } 1220 buffer.append((char)syllable); 1221 prevBoundary = src; 1222 continue; 1223 } 1224 // No matching context, or may need to decompose surrounding text first: 1225 // Fall through to the slow path. 1226 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 1227 // One or more combining marks that do not combine-back: 1228 // Check for canonical order, copy unchanged if ok and 1229 // if followed by a character with a boundary-before. 1230 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 1231 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 1232 // Fails FCD test, need to decompose and contiguously recompose. 1233 if (!doCompose) { 1234 return false; 1235 } 1236 } else { 1237 // If !onlyContiguous (not FCC), then we ignore the tccc of 1238 // the previous character which passed the quick check "yes && ccc==0" test. 1239 int n16; 1240 for (;;) { 1241 if (src == limit) { 1242 if (doCompose) { 1243 buffer.append(s, prevBoundary, limit); 1244 } 1245 return true; 1246 } 1247 int prevCC = cc; 1248 c = Character.codePointAt(s, src); 1249 n16 = normTrie.get(c); 1250 if (n16 >= MIN_YES_YES_WITH_CC) { 1251 cc = getCCFromNormalYesOrMaybe(n16); 1252 if (prevCC > cc) { 1253 if (!doCompose) { 1254 return false; 1255 } 1256 break; 1257 } 1258 } else { 1259 break; 1260 } 1261 src += Character.charCount(c); 1262 } 1263 // p is after the last in-order combining mark. 1264 // If there is a boundary here, then we continue with no change. 1265 if (norm16HasCompBoundaryBefore(n16)) { 1266 if (isCompYesAndZeroCC(n16)) { 1267 src += Character.charCount(c); 1268 } 1269 continue; 1270 } 1271 // Use the slow path. There is no boundary in [prevSrc, src[. 1272 } 1273 } 1274 1275 // Slow path: Find the nearest boundaries around the current character, 1276 // decompose and recompose. 1277 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 1278 c = Character.codePointBefore(s, prevSrc); 1279 norm16 = normTrie.get(c); 1280 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1281 prevSrc -= Character.charCount(c); 1282 } 1283 } 1284 if (doCompose && prevBoundary != prevSrc) { 1285 buffer.append(s, prevBoundary, prevSrc); 1286 } 1287 int recomposeStartIndex=buffer.length(); 1288 // We know there is not a boundary here. 1289 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 1290 buffer); 1291 // Decompose until the next boundary. 1292 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 1293 buffer); 1294 recompose(buffer, recomposeStartIndex, onlyContiguous); 1295 if(!doCompose) { 1296 if(!buffer.equals(s, prevSrc, src)) { 1297 return false; 1298 } 1299 buffer.remove(); 1300 } 1301 prevBoundary=src; 1302 } 1303 } 1304 1305 /** 1306 * Very similar to compose(): Make the same changes in both places if relevant. 1307 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 1308 * !doSpan: quickCheck 1309 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 1310 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 1311 * then the quick check result is "no" 1312 */ 1313 public int composeQuickCheck(CharSequence s, int src, int limit, 1314 boolean onlyContiguous, boolean doSpan) { 1315 int qcResult=0; 1316 int prevBoundary=src; 1317 int minNoMaybeCP=minCompNoMaybeCP; 1318 1319 for(;;) { 1320 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1321 // or with (compYes && ccc==0) properties. 1322 int prevSrc; 1323 int c = 0; 1324 int norm16 = 0; 1325 for (;;) { 1326 if(src==limit) { 1327 return (src<<1)|qcResult; // "yes" or "maybe" 1328 } 1329 if( (c=s.charAt(src))<minNoMaybeCP || 1330 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1331 ) { 1332 ++src; 1333 } else { 1334 prevSrc = src++; 1335 if(!UTF16.isSurrogate((char)c)) { 1336 break; 1337 } else { 1338 char c2; 1339 if(UTF16Plus.isSurrogateLead(c)) { 1340 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 1341 ++src; 1342 c=Character.toCodePoint((char)c, c2); 1343 } 1344 } else /* trail surrogate */ { 1345 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 1346 --prevSrc; 1347 c=Character.toCodePoint(c2, (char)c); 1348 } 1349 } 1350 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 1351 break; 1352 } 1353 } 1354 } 1355 } 1356 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1357 // The current character is either a "noNo" (has a mapping) 1358 // or a "maybeYes" (combines backward) 1359 // or a "yesYes" with ccc!=0. 1360 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1361 1362 int prevNorm16 = INERT; 1363 if (prevBoundary != prevSrc) { 1364 prevBoundary = prevSrc; 1365 if (!norm16HasCompBoundaryBefore(norm16)) { 1366 c = Character.codePointBefore(s, prevSrc); 1367 int n16 = getNorm16(c); 1368 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1369 prevBoundary -= Character.charCount(c); 1370 prevNorm16 = n16; 1371 } 1372 } 1373 } 1374 1375 if(isMaybeOrNonZeroCC(norm16)) { 1376 int cc=getCCFromYesOrMaybe(norm16); 1377 if (onlyContiguous /* FCC */ && cc != 0 && 1378 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1379 // The [prevBoundary..prevSrc[ character 1380 // passed the quick check "yes && ccc==0" test 1381 // but is out of canonical order with the current combining mark. 1382 } else { 1383 // If !onlyContiguous (not FCC), then we ignore the tccc of 1384 // the previous character which passed the quick check "yes && ccc==0" test. 1385 for (;;) { 1386 if (norm16 < MIN_YES_YES_WITH_CC) { 1387 if (!doSpan) { 1388 qcResult = 1; 1389 } else { 1390 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1391 } 1392 } 1393 if (src == limit) { 1394 return (src<<1) | qcResult; // "yes" or "maybe" 1395 } 1396 int prevCC = cc; 1397 c = Character.codePointAt(s, src); 1398 norm16 = getNorm16(c); 1399 if (isMaybeOrNonZeroCC(norm16)) { 1400 cc = getCCFromYesOrMaybe(norm16); 1401 if (!(prevCC <= cc || cc == 0)) { 1402 break; 1403 } 1404 } else { 1405 break; 1406 } 1407 src += Character.charCount(c); 1408 } 1409 // src is after the last in-order combining mark. 1410 if (isCompYesAndZeroCC(norm16)) { 1411 prevBoundary = src; 1412 src += Character.charCount(c); 1413 continue; 1414 } 1415 } 1416 } 1417 return prevBoundary<<1; // "no" 1418 } 1419 } 1420 public void composeAndAppend(CharSequence s, 1421 boolean doCompose, 1422 boolean onlyContiguous, 1423 ReorderingBuffer buffer) { 1424 int src=0, limit=s.length(); 1425 if(!buffer.isEmpty()) { 1426 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1427 if(0!=firstStarterInSrc) { 1428 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1429 buffer.length(), onlyContiguous); 1430 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1431 firstStarterInSrc+16); 1432 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1433 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1434 middle.append(s, 0, firstStarterInSrc); 1435 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1436 src=firstStarterInSrc; 1437 } 1438 } 1439 if(doCompose) { 1440 compose(s, src, limit, onlyContiguous, true, buffer); 1441 } else { 1442 buffer.append(s, src, limit); 1443 } 1444 } 1445 // Dual functionality: 1446 // buffer!=NULL: normalize 1447 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1448 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1449 // Note: In this function we use buffer->appendZeroCC() because we track 1450 // the lead and trail combining classes here, rather than leaving it to 1451 // the ReorderingBuffer. 1452 // The exception is the call to decomposeShort() which uses the buffer 1453 // in the normal way. 1454 1455 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1456 // Similar to the prevBoundary in the compose() implementation. 1457 int prevBoundary=src; 1458 int prevSrc; 1459 int c=0; 1460 int prevFCD16=0; 1461 int fcd16=0; 1462 1463 for(;;) { 1464 // count code units with lccc==0 1465 for(prevSrc=src; src!=limit;) { 1466 if((c=s.charAt(src))<minLcccCP) { 1467 prevFCD16=~c; 1468 ++src; 1469 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1470 prevFCD16=0; 1471 ++src; 1472 } else { 1473 if(UTF16.isSurrogate((char)c)) { 1474 char c2; 1475 if(UTF16Plus.isSurrogateLead(c)) { 1476 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1477 c=Character.toCodePoint((char)c, c2); 1478 } 1479 } else /* trail surrogate */ { 1480 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1481 --src; 1482 c=Character.toCodePoint(c2, (char)c); 1483 } 1484 } 1485 } 1486 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1487 prevFCD16=fcd16; 1488 src+=Character.charCount(c); 1489 } else { 1490 break; 1491 } 1492 } 1493 } 1494 // copy these code units all at once 1495 if(src!=prevSrc) { 1496 if(src==limit) { 1497 if(buffer!=null) { 1498 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1499 } 1500 break; 1501 } 1502 prevBoundary=src; 1503 // We know that the previous character's lccc==0. 1504 if(prevFCD16<0) { 1505 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1506 int prev=~prevFCD16; 1507 if(prev<minDecompNoCP) { 1508 prevFCD16=0; 1509 } else { 1510 prevFCD16=getFCD16FromNormData(prev); 1511 if(prevFCD16>1) { 1512 --prevBoundary; 1513 } 1514 } 1515 } else { 1516 int p=src-1; 1517 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1518 Character.isHighSurrogate(s.charAt(p-1)) 1519 ) { 1520 --p; 1521 // Need to fetch the previous character's FCD value because 1522 // prevFCD16 was just for the trail surrogate code point. 1523 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1524 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1525 } 1526 if(prevFCD16>1) { 1527 prevBoundary=p; 1528 } 1529 } 1530 if(buffer!=null) { 1531 // The last lccc==0 character is excluded from the 1532 // flush-and-append call in case it needs to be modified. 1533 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1534 buffer.append(s, prevBoundary, src); 1535 } 1536 // The start of the current character (c). 1537 prevSrc=src; 1538 } else if(src==limit) { 1539 break; 1540 } 1541 1542 src+=Character.charCount(c); 1543 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1544 // Check for proper order, and decompose locally if necessary. 1545 if((prevFCD16&0xff)<=(fcd16>>8)) { 1546 // proper order: prev tccc <= current lccc 1547 if((fcd16&0xff)<=1) { 1548 prevBoundary=src; 1549 } 1550 if(buffer!=null) { 1551 buffer.appendZeroCC(c); 1552 } 1553 prevFCD16=fcd16; 1554 continue; 1555 } else if(buffer==null) { 1556 return prevBoundary; // quick check "no" 1557 } else { 1558 /* 1559 * Back out the part of the source that we copied or appended 1560 * already but is now going to be decomposed. 1561 * prevSrc is set to after what was copied/appended. 1562 */ 1563 buffer.removeSuffix(prevSrc-prevBoundary); 1564 /* 1565 * Find the part of the source that needs to be decomposed, 1566 * up to the next safe boundary. 1567 */ 1568 src=findNextFCDBoundary(s, src, limit); 1569 /* 1570 * The source text does not fulfill the conditions for FCD. 1571 * Decompose and reorder a limited piece of the text. 1572 */ 1573 decomposeShort(s, prevBoundary, src, false, false, buffer); 1574 prevBoundary=src; 1575 prevFCD16=0; 1576 } 1577 } 1578 return src; 1579 } 1580 public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { 1581 int src=0, limit=s.length(); 1582 if(!buffer.isEmpty()) { 1583 int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); 1584 if(0!=firstBoundaryInSrc) { 1585 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), 1586 buffer.length()); 1587 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ 1588 firstBoundaryInSrc+16); 1589 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); 1590 buffer.removeSuffix(buffer.length()-lastBoundaryInDest); 1591 middle.append(s, 0, firstBoundaryInSrc); 1592 makeFCD(middle, 0, middle.length(), buffer); 1593 src=firstBoundaryInSrc; 1594 } 1595 } 1596 if(doMakeFCD) { 1597 makeFCD(s, src, limit, buffer); 1598 } else { 1599 buffer.append(s, src, limit); 1600 } 1601 } 1602 1603 public boolean hasDecompBoundaryBefore(int c) { 1604 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1605 norm16HasDecompBoundaryBefore(getNorm16(c)); 1606 } 1607 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1608 if (norm16 < minNoNoCompNoMaybeCC) { 1609 return true; 1610 } 1611 if (norm16 >= limitNoNo) { 1612 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1613 } 1614 // c decomposes, get everything from the variable-length extra data 1615 int mapping=norm16>>OFFSET_SHIFT; 1616 int firstUnit=extraData.charAt(mapping); 1617 // true if leadCC==0 (hasFCDBoundaryBefore()) 1618 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1619 } 1620 public boolean hasDecompBoundaryAfter(int c) { 1621 if (c < minDecompNoCP) { 1622 return true; 1623 } 1624 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1625 return true; 1626 } 1627 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1628 } 1629 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1630 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1631 return true; 1632 } 1633 if (norm16 >= limitNoNo) { 1634 if (isMaybeOrNonZeroCC(norm16)) { 1635 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1636 } 1637 // Maps to an isCompYesAndZeroCC. 1638 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1639 } 1640 // c decomposes, get everything from the variable-length extra data 1641 int mapping=norm16>>OFFSET_SHIFT; 1642 int firstUnit=extraData.charAt(mapping); 1643 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1644 // fcd16<=1 || trailCC==0 1645 if(firstUnit>0x1ff) { 1646 return false; // trailCC>1 1647 } 1648 if(firstUnit<=0xff) { 1649 return true; // trailCC==0 1650 } 1651 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1652 // true if leadCC==0 (hasFCDBoundaryBefore()) 1653 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1654 } 1655 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1656 1657 public boolean hasCompBoundaryBefore(int c) { 1658 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1659 } 1660 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1661 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1662 } 1663 public boolean isCompInert(int c, boolean onlyContiguous) { 1664 int norm16=getNorm16(c); 1665 return isCompYesAndZeroCC(norm16) && 1666 (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1667 (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff); 1668 } 1669 1670 public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); } 1671 public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); } 1672 public boolean isFCDInert(int c) { return getFCD16(c)<=1; } 1673 1674 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1675 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1676 private static boolean isInert(int norm16) { return norm16==INERT; } 1677 private static boolean isJamoL(int norm16) { return norm16==JAMO_L; } 1678 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1679 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } 1680 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } 1681 private boolean isHangulLVT(int norm16) { 1682 return norm16==hangulLVT(); 1683 } 1684 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1685 // UBool isCompYes(uint16_t norm16) const { 1686 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1687 // } 1688 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1689 // return norm16<minNoNo || minMaybeYes<=norm16; 1690 // } 1691 // private boolean hasZeroCCFromDecompYes(int norm16) { 1692 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1693 // } 1694 private boolean isDecompYesAndZeroCC(int norm16) { 1695 return norm16<minYesNo || 1696 norm16==JAMO_VT || 1697 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1698 } 1699 /** 1700 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1701 * the MaybeYes which combine-forward and have ccc=0. 1702 * (Standard Unicode 10 normalization does not have such characters.) 1703 */ 1704 private boolean isMostDecompYesAndZeroCC(int norm16) { 1705 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1706 } 1707 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1708 1709 // For use with isCompYes(). 1710 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1711 // static uint8_t getCCFromYes(uint16_t norm16) { 1712 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 1713 // } 1714 private int getCCFromNoNo(int norm16) { 1715 int mapping=norm16>>OFFSET_SHIFT; 1716 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1717 return extraData.charAt(mapping-1)&0xff; 1718 } else { 1719 return 0; 1720 } 1721 } 1722 int getTrailCCFromCompYesAndZeroCC(int norm16) { 1723 if(norm16<=minYesNo) { 1724 return 0; // yesYes and Hangul LV have ccc=tccc=0 1725 } else { 1726 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 1727 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo 1728 } 1729 } 1730 1731 // Requires algorithmic-NoNo. 1732 private int mapAlgorithmic(int c, int norm16) { 1733 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 1734 } 1735 1736 // Requires minYesNo<norm16<limitNoNo. 1737 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1738 1739 /** 1740 * @return index into maybeYesCompositions, or -1 1741 */ 1742 private int getCompositionsListForDecompYes(int norm16) { 1743 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1744 return -1; 1745 } else { 1746 if((norm16-=minMaybeYes)<0) { 1747 // norm16<minMaybeYes: index into extraData which is a substring at 1748 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1749 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1750 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1751 } 1752 return norm16>>OFFSET_SHIFT; 1753 } 1754 } 1755 /** 1756 * @return index into maybeYesCompositions 1757 */ 1758 private int getCompositionsListForComposite(int norm16) { 1759 // A composite has both mapping & compositions list. 1760 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1761 int firstUnit=maybeYesCompositions.charAt(list); 1762 return list+ // mapping in maybeYesCompositions 1763 1+ // +1 to skip the first unit with the mapping length 1764 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1765 } 1766 private int getCompositionsListForMaybe(int norm16) { 1767 // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES 1768 return (norm16-minMaybeYes)>>OFFSET_SHIFT; 1769 } 1770 /** 1771 * @param c code point must have compositions 1772 * @return index into maybeYesCompositions 1773 */ 1774 private int getCompositionsList(int norm16) { 1775 return isDecompYes(norm16) ? 1776 getCompositionsListForDecompYes(norm16) : 1777 getCompositionsListForComposite(norm16); 1778 } 1779 1780 // Decompose a short piece of text which is likely to contain characters that 1781 // fail the quick check loop and/or where the quick check loop's overhead 1782 // is unlikely to be amortized. 1783 // Called by the compose() and makeFCD() implementations. 1784 // Public in Java for collation implementation code. 1785 private int decomposeShort( 1786 CharSequence s, int src, int limit, 1787 boolean stopAtCompBoundary, boolean onlyContiguous, 1788 ReorderingBuffer buffer) { 1789 while(src<limit) { 1790 int c=Character.codePointAt(s, src); 1791 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1792 return src; 1793 } 1794 int norm16 = getNorm16(c); 1795 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1796 return src; 1797 } 1798 src+=Character.charCount(c); 1799 decompose(c, norm16, buffer); 1800 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1801 return src; 1802 } 1803 } 1804 return src; 1805 } 1806 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1807 // get the decomposition and the lead and trail cc's 1808 if (norm16 >= limitNoNo) { 1809 if (isMaybeOrNonZeroCC(norm16)) { 1810 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1811 return; 1812 } 1813 // Maps to an isCompYesAndZeroCC. 1814 c=mapAlgorithmic(c, norm16); 1815 norm16=getNorm16(c); 1816 } 1817 if (norm16 < minYesNo) { 1818 // c does not decompose 1819 buffer.append(c, 0); 1820 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1821 // Hangul syllable: decompose algorithmically 1822 Hangul.decompose(c, buffer); 1823 } else { 1824 // c decomposes, get everything from the variable-length extra data 1825 int mapping=norm16>>OFFSET_SHIFT; 1826 int firstUnit=extraData.charAt(mapping); 1827 int length=firstUnit&MAPPING_LENGTH_MASK; 1828 int leadCC, trailCC; 1829 trailCC=firstUnit>>8; 1830 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1831 leadCC=extraData.charAt(mapping-1)>>8; 1832 } else { 1833 leadCC=0; 1834 } 1835 ++mapping; // skip over the firstUnit 1836 buffer.append(extraData, mapping, mapping+length, leadCC, trailCC); 1837 } 1838 } 1839 1840 /** 1841 * Finds the recomposition result for 1842 * a forward-combining "lead" character, 1843 * specified with a pointer to its compositions list, 1844 * and a backward-combining "trail" character. 1845 * 1846 * <p>If the lead and trail characters combine, then this function returns 1847 * the following "compositeAndFwd" value: 1848 * <pre> 1849 * Bits 21..1 composite character 1850 * Bit 0 set if the composite is a forward-combining starter 1851 * </pre> 1852 * otherwise it returns -1. 1853 * 1854 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1855 * encoded as either pairs or triples of 16-bit units. 1856 * The last entry has the high bit of its first unit set. 1857 * 1858 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1859 * A linear search is used. 1860 * 1861 * <p>See normalizer2impl.h for a more detailed description 1862 * of the compositions list format. 1863 */ 1864 private static int combine(String compositions, int list, int trail) { 1865 int key1, firstUnit; 1866 if(trail<COMP_1_TRAIL_LIMIT) { 1867 // trail character is 0..33FF 1868 // result entry may have 2 or 3 units 1869 key1=(trail<<1); 1870 while(key1>(firstUnit=compositions.charAt(list))) { 1871 list+=2+(firstUnit&COMP_1_TRIPLE); 1872 } 1873 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1874 if((firstUnit&COMP_1_TRIPLE)!=0) { 1875 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1876 } else { 1877 return compositions.charAt(list+1); 1878 } 1879 } 1880 } else { 1881 // trail character is 3400..10FFFF 1882 // result entry has 3 units 1883 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1884 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1885 int secondUnit; 1886 for(;;) { 1887 if(key1>(firstUnit=compositions.charAt(list))) { 1888 list+=2+(firstUnit&COMP_1_TRIPLE); 1889 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1890 if(key2>(secondUnit=compositions.charAt(list+1))) { 1891 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1892 break; 1893 } else { 1894 list+=3; 1895 } 1896 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1897 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1898 } else { 1899 break; 1900 } 1901 } else { 1902 break; 1903 } 1904 } 1905 } 1906 return -1; 1907 } 1908 /** 1909 * @param list some character's compositions list 1910 * @param set recursively receives the composites from these compositions 1911 */ 1912 private void addComposites(int list, UnicodeSet set) { 1913 int firstUnit, compositeAndFwd; 1914 do { 1915 firstUnit=maybeYesCompositions.charAt(list); 1916 if((firstUnit&COMP_1_TRIPLE)==0) { 1917 compositeAndFwd=maybeYesCompositions.charAt(list+1); 1918 list+=2; 1919 } else { 1920 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| 1921 maybeYesCompositions.charAt(list+2); 1922 list+=3; 1923 } 1924 int composite=compositeAndFwd>>1; 1925 if((compositeAndFwd&1)!=0) { 1926 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 1927 } 1928 set.add(composite); 1929 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 1930 } 1931 /* 1932 * Recomposes the buffer text starting at recomposeStartIndex 1933 * (which is in NFD - decomposed and canonically ordered), 1934 * and truncates the buffer contents. 1935 * 1936 * Note that recomposition never lengthens the text: 1937 * Any character consists of either one or two code units; 1938 * a composition may contain at most one more code unit than the original starter, 1939 * while the combining mark that is removed has at least one code unit. 1940 */ 1941 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1942 boolean onlyContiguous) { 1943 StringBuilder sb=buffer.getStringBuilder(); 1944 int p=recomposeStartIndex; 1945 if(p==sb.length()) { 1946 return; 1947 } 1948 1949 int starter, pRemove; 1950 int compositionsList; 1951 int c, compositeAndFwd; 1952 int norm16; 1953 int cc, prevCC; 1954 boolean starterIsSupplementary; 1955 1956 // Some of the following variables are not used until we have a forward-combining starter 1957 // and are only initialized now to avoid compiler warnings. 1958 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1959 starter=-1; 1960 starterIsSupplementary=false; 1961 prevCC=0; 1962 1963 for(;;) { 1964 c=sb.codePointAt(p); 1965 p+=Character.charCount(c); 1966 norm16=getNorm16(c); 1967 cc=getCCFromYesOrMaybe(norm16); 1968 if( // this character combines backward and 1969 isMaybe(norm16) && 1970 // we have seen a starter that combines forward and 1971 compositionsList>=0 && 1972 // the backward-combining character is not blocked 1973 (prevCC<cc || prevCC==0) 1974 ) { 1975 if(isJamoVT(norm16)) { 1976 // c is a Jamo V/T, see if we can compose it with the previous character. 1977 if(c<Hangul.JAMO_T_BASE) { 1978 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1979 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1980 if(prev<Hangul.JAMO_L_COUNT) { 1981 pRemove=p-1; 1982 char syllable=(char) 1983 (Hangul.HANGUL_BASE+ 1984 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1985 Hangul.JAMO_T_COUNT); 1986 char t; 1987 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1988 ++p; 1989 syllable+=t; // The next character was a Jamo T. 1990 } 1991 sb.setCharAt(starter, syllable); 1992 // remove the Jamo V/T 1993 sb.delete(pRemove, p); 1994 p=pRemove; 1995 } 1996 } 1997 /* 1998 * No "else" for Jamo T: 1999 * Since the input is in NFD, there are no Hangul LV syllables that 2000 * a Jamo T could combine with. 2001 * All Jamo Ts are combined above when handling Jamo Vs. 2002 */ 2003 if(p==sb.length()) { 2004 break; 2005 } 2006 compositionsList=-1; 2007 continue; 2008 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 2009 // The starter and the combining mark (c) do combine. 2010 int composite=compositeAndFwd>>1; 2011 2012 // Remove the combining mark. 2013 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 2014 sb.delete(pRemove, p); 2015 p=pRemove; 2016 // Replace the starter with the composite. 2017 if(starterIsSupplementary) { 2018 if(composite>0xffff) { 2019 // both are supplementary 2020 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 2021 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 2022 } else { 2023 sb.setCharAt(starter, (char)c); 2024 sb.deleteCharAt(starter+1); 2025 // The composite is shorter than the starter, 2026 // move the intermediate characters forward one. 2027 starterIsSupplementary=false; 2028 --p; 2029 } 2030 } else if(composite>0xffff) { 2031 // The composite is longer than the starter, 2032 // move the intermediate characters back one. 2033 starterIsSupplementary=true; 2034 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 2035 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 2036 ++p; 2037 } else { 2038 // both are on the BMP 2039 sb.setCharAt(starter, (char)composite); 2040 } 2041 2042 // Keep prevCC because we removed the combining mark. 2043 2044 if(p==sb.length()) { 2045 break; 2046 } 2047 // Is the composite a starter that combines forward? 2048 if((compositeAndFwd&1)!=0) { 2049 compositionsList= 2050 getCompositionsListForComposite(getNorm16(composite)); 2051 } else { 2052 compositionsList=-1; 2053 } 2054 2055 // We combined; continue with looking for compositions. 2056 continue; 2057 } 2058 } 2059 2060 // no combination this time 2061 prevCC=cc; 2062 if(p==sb.length()) { 2063 break; 2064 } 2065 2066 // If c did not combine, then check if it is a starter. 2067 if(cc==0) { 2068 // Found a new starter. 2069 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 2070 // It may combine with something, prepare for it. 2071 if(c<=0xffff) { 2072 starterIsSupplementary=false; 2073 starter=p-1; 2074 } else { 2075 starterIsSupplementary=true; 2076 starter=p-2; 2077 } 2078 } 2079 } else if(onlyContiguous) { 2080 // FCC: no discontiguous compositions; any intervening character blocks. 2081 compositionsList=-1; 2082 } 2083 } 2084 buffer.flush(); 2085 } 2086 2087 public int composePair(int a, int b) { 2088 int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 2089 int list; 2090 if(isInert(norm16)) { 2091 return -1; 2092 } else if(norm16<minYesNoMappingsOnly) { 2093 // a combines forward. 2094 if(isJamoL(norm16)) { 2095 b-=Hangul.JAMO_V_BASE; 2096 if(0<=b && b<Hangul.JAMO_V_COUNT) { 2097 return 2098 (Hangul.HANGUL_BASE+ 2099 ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 2100 Hangul.JAMO_T_COUNT); 2101 } else { 2102 return -1; 2103 } 2104 } else if(isHangulLV(norm16)) { 2105 b-=Hangul.JAMO_T_BASE; 2106 if(0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 2107 return a+b; 2108 } else { 2109 return -1; 2110 } 2111 } else { 2112 // 'a' has a compositions list in extraData 2113 list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 2114 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 2115 list+= // mapping pointer 2116 1+ // +1 to skip the first unit with the mapping length 2117 (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 2118 } 2119 } 2120 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 2121 return -1; 2122 } else { 2123 list=getCompositionsListForMaybe(norm16); // offset into maybeYesCompositions 2124 } 2125 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 2126 return -1; 2127 } 2128 return combine(maybeYesCompositions, list, b)>>1; 2129 } 2130 2131 /** 2132 * Does c have a composition boundary before it? 2133 * True if its decomposition begins with a character that has 2134 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 2135 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 2136 * (isCompYesAndZeroCC()) so we need not decompose. 2137 */ 2138 private boolean hasCompBoundaryBefore(int c, int norm16) { 2139 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 2140 } 2141 private boolean norm16HasCompBoundaryBefore(int norm16) { 2142 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 2143 } 2144 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 2145 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 2146 } 2147 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 2148 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 2149 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 2150 } 2151 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 2152 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 2153 } 2154 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ 2155 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 2156 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 2157 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 2158 } 2159 2160 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 2161 while(p>0) { 2162 int c=Character.codePointBefore(s, p); 2163 int norm16 = getNorm16(c); 2164 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2165 break; 2166 } 2167 p-=Character.charCount(c); 2168 if(hasCompBoundaryBefore(c, norm16)) { 2169 break; 2170 } 2171 } 2172 return p; 2173 } 2174 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 2175 while(p<limit) { 2176 int c=Character.codePointAt(s, p); 2177 int norm16=normTrie.get(c); 2178 if(hasCompBoundaryBefore(c, norm16)) { 2179 break; 2180 } 2181 p+=Character.charCount(c); 2182 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2183 break; 2184 } 2185 } 2186 return p; 2187 } 2188 2189 private int findPreviousFCDBoundary(CharSequence s, int p) { 2190 while(p>0) { 2191 int c=Character.codePointBefore(s, p); 2192 int norm16; 2193 if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) { 2194 break; 2195 } 2196 p-=Character.charCount(c); 2197 if (norm16HasDecompBoundaryBefore(norm16)) { 2198 break; 2199 } 2200 } 2201 return p; 2202 } 2203 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 2204 while(p<limit) { 2205 int c=Character.codePointAt(s, p); 2206 int norm16; 2207 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 2208 break; 2209 } 2210 p+=Character.charCount(c); 2211 if (norm16HasDecompBoundaryAfter(norm16)) { 2212 break; 2213 } 2214 } 2215 return p; 2216 } 2217 2218 private int getPreviousTrailCC(CharSequence s, int start, int p) { 2219 if (start == p) { 2220 return 0; 2221 } 2222 return getFCD16(Character.codePointBefore(s, p)); 2223 } 2224 2225 private void addToStartSet(Trie2Writable newData, int origin, int decompLead) { 2226 int canonValue=newData.get(decompLead); 2227 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 2228 // origin is the first character whose decomposition starts with 2229 // the character for which we are setting the value. 2230 newData.set(decompLead, canonValue|origin); 2231 } else { 2232 // origin is not the first character, or it is U+0000. 2233 UnicodeSet set; 2234 if((canonValue&CANON_HAS_SET)==0) { 2235 int firstOrigin=canonValue&CANON_VALUE_MASK; 2236 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size(); 2237 newData.set(decompLead, canonValue); 2238 canonStartSets.add(set=new UnicodeSet()); 2239 if(firstOrigin!=0) { 2240 set.add(firstOrigin); 2241 } 2242 } else { 2243 set=canonStartSets.get(canonValue&CANON_VALUE_MASK); 2244 } 2245 set.add(origin); 2246 } 2247 } 2248 2249 @SuppressWarnings("unused") 2250 private VersionInfo dataVersion; 2251 2252 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 2253 private int minDecompNoCP; 2254 private int minCompNoMaybeCP; 2255 private int minLcccCP; 2256 2257 // Norm16 value thresholds for quick check combinations and types of extra data. 2258 private int minYesNo; 2259 private int minYesNoMappingsOnly; 2260 private int minNoNo; 2261 private int minNoNoCompBoundaryBefore; 2262 private int minNoNoCompNoMaybeCC; 2263 private int minNoNoEmpty; 2264 private int limitNoNo; 2265 private int centerNoNoDelta; 2266 private int minMaybeYes; 2267 2268 private Trie2_16 normTrie; 2269 private String maybeYesCompositions; 2270 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2271 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2272 2273 private Trie2_32 canonIterData; 2274 private ArrayList<UnicodeSet> canonStartSets; 2275 2276 // bits in canonIterData 2277 private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; 2278 private static final int CANON_HAS_COMPOSITIONS = 0x40000000; 2279 private static final int CANON_HAS_SET = 0x200000; 2280 private static final int CANON_VALUE_MASK = 0x1fffff; 2281} 2282