// Normalizer2Impl.java revision f86f25d102340da66b9c7cb6b2d5ecdc0de43ecf
/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2009-2015, International Business Machines
 * Corporation and others. All Rights Reserved.
 *******************************************************************************
 */

package android.icu.impl;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;

import android.icu.text.UTF16;
import android.icu.text.UnicodeSet;
import android.icu.util.ICUUncheckedIOException;
import android.icu.util.VersionInfo;

/**
 * Low-level Unicode normalization implementation.
 * Loads binary "Nrm2" normalization data and provides per-code-point
 * norm16 and FCD lookups used by the higher-level normalization functions.
 * @hide Only a subset of ICU is exposed in Android
 */
public final class Normalizer2Impl {
    /**
     * Constants and algorithmic (de)composition helpers for precomposed Hangul
     * syllables and conjoining Jamo, following the arithmetic Hangul syllable
     * decomposition visible in decompose()/getRawDecomposition() below.
     */
    public static final class Hangul {
        /* Korean Hangul and Jamo constants */
        public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
        public static final int JAMO_L_END=0x1112;
        public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
        public static final int JAMO_V_END=0x1175;
        public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
        public static final int JAMO_T_END=0x11c2;

        public static final int HANGUL_BASE=0xac00;
        public static final int HANGUL_END=0xd7a3;

        public static final int JAMO_L_COUNT=19;
        public static final int JAMO_V_COUNT=21;
        public static final int JAMO_T_COUNT=28;

        public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
        public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;

        // Number of syllables per leading consonant: 21 vowels * 28 trails.
        public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;

        public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
        public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;

        /** Returns true if c is a precomposed Hangul syllable (HANGUL_BASE..HANGUL_LIMIT-1). */
        public static boolean isHangul(int c) {
            return HANGUL_BASE<=c && c<HANGUL_LIMIT;
        }
        /**
         * Returns true if c is an LV syllable (one without a trailing consonant):
         * its offset from HANGUL_BASE is an exact multiple of JAMO_T_COUNT.
         * Note: the subtraction relies on char arithmetic wrapping for c<HANGUL_BASE,
         * which then fails the c<HANGUL_COUNT range check.
         */
        public static boolean isHangulWithoutJamoT(char c) {
            c-=HANGUL_BASE;
            return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
        }
        /** Returns true if c is a leading-consonant Jamo (JAMO_L_BASE..JAMO_L_LIMIT-1). */
        public static boolean isJamoL(int c) {
            return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
        }
        /** Returns true if c is a vowel Jamo (JAMO_V_BASE..JAMO_V_LIMIT-1). */
        public static boolean isJamoV(int c) {
            return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
        }

        /**
         * Decomposes c, which must be a Hangul syllable, into buffer
         * and returns the length of the decomposition (2 or 3).
         */
        public static int decompose(int c, Appendable buffer) {
            try {
                c-=HANGUL_BASE;
                int c2=c%JAMO_T_COUNT;  // trailing-consonant index; 0 means LV-only syllable
                c/=JAMO_T_COUNT;
                buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
                buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
                if(c2==0) {
                    return 2;
                } else {
                    buffer.append((char)(JAMO_T_BASE+c2));
                    return 3;
                }
            } catch(IOException e) {
                // Will not occur because we do not write to I/O.
                throw new ICUUncheckedIOException(e);
            }
        }

        /**
         * Decomposes c, which must be a Hangul syllable, into buffer.
         * This is the raw, not recursive, decomposition. Its length is always 2.
         */
        public static void getRawDecomposition(int c, Appendable buffer) {
            try {
                int orig=c;
                c-=HANGUL_BASE;
                int c2=c%JAMO_T_COUNT;
                if(c2==0) {
                    // LV syllable: raw decomposition is L jamo + V jamo.
                    c/=JAMO_T_COUNT;
                    buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
                    buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
                } else {
                    // LVT syllable: raw decomposition is the LV syllable + T jamo.
                    buffer.append((char)(orig-c2));  // LV syllable
                    buffer.append((char)(JAMO_T_BASE+c2));
                }
            } catch(IOException e) {
                // Will not occur because we do not write to I/O.
                throw new ICUUncheckedIOException(e);
            }
        }
    }

    /**
     * Writable buffer that takes care of canonical ordering.
     * Its Appendable methods behave like the C++ implementation's
     * appendZeroCC() methods.
     * <p>
     * If dest is a StringBuilder, then the buffer writes directly to it.
     * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
     * until no further changes are necessary and whole segments are appended.
     * append() methods that take combining-class values always write to the StringBuilder.
     * Other append() methods flush and append to the Appendable.
     */
    public static final class ReorderingBuffer implements Appendable {
        public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
            impl=ni;
            app=dest;
            if(app instanceof StringBuilder) {
                appIsStringBuilder=true;
                str=(StringBuilder)dest;
                // In Java, the constructor subsumes public void init(int destCapacity) {
                str.ensureCapacity(destCapacity);
                reorderStart=0;
                if(str.length()==0) {
                    lastCC=0;
                } else {
                    // dest already has contents: recover lastCC and reorderStart
                    // by iterating backward over the existing text.
                    setIterator();
                    lastCC=previousCC();
                    // Set reorderStart after the last code point with cc<=1 if there is one.
                    if(lastCC>1) {
                        while(previousCC()>1) {}
                    }
                    reorderStart=codePointLimit;
                }
            } else {
                appIsStringBuilder=false;
                str=new StringBuilder();
                reorderStart=0;
                lastCC=0;
            }
        }

        public boolean isEmpty() { return str.length()==0; }
        public int length() { return str.length(); }
        public int getLastCC() { return lastCC; }

        public StringBuilder getStringBuilder() { return str; }

        /** Compares the buffer contents with s[start, limit[ for binary equality. */
        public boolean equals(CharSequence s, int start, int limit) {
            return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
        }

        // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
        public void setLastChar(char c) {
            str.setCharAt(str.length()-1, c);
        }

        /**
         * Appends code point c which has combining class cc,
         * inserting it before earlier characters with higher cc
         * when necessary (canonical ordering).
         */
        public void append(int c, int cc) {
            if(lastCC<=cc || cc==0) {
                // In canonical order already: plain append.
                str.appendCodePoint(c);
                lastCC=cc;
                if(cc<=1) {
                    reorderStart=str.length();
                }
            } else {
                // Out of order: find the insertion point by cc.
                insert(c, cc);
            }
        }
        // s must be in NFD, otherwise change the implementation.
        /**
         * Appends s[start, limit[, which must be in NFD, given the
         * combining classes of its first (leadCC) and last (trailCC) code points.
         * Falls back to per-code-point canonical-order insertion when the
         * segment does not follow the current lastCC.
         */
        public void append(CharSequence s, int start, int limit,
                           int leadCC, int trailCC) {
            if(start==limit) {
                return;
            }
            if(lastCC<=leadCC || leadCC==0) {
                // Whole segment is in order relative to the buffer: bulk append.
                if(trailCC<=1) {
                    reorderStart=str.length()+(limit-start);
                } else if(leadCC<=1) {
                    reorderStart=str.length()+1;  // Ok if not a code point boundary.
                }
                str.append(s, start, limit);
                lastCC=trailCC;
            } else {
                int c=Character.codePointAt(s, start);
                start+=Character.charCount(c);
                insert(c, leadCC);  // insert first code point
                while(start<limit) {
                    c=Character.codePointAt(s, start);
                    start+=Character.charCount(c);
                    if(start<limit) {
                        // s must be in NFD, otherwise we need to use getCC().
                        leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
                    } else {
                        leadCC=trailCC;
                    }
                    append(c, leadCC);
                }
            }
        }
        // The following append() methods work like C++ appendZeroCC().
        // They assume that the cc or trailCC of their input is 0.
        // Most of them implement Appendable interface methods.
        // @Override when we switch to Java 6
        @Override
        public ReorderingBuffer append(char c) {
            str.append(c);
            lastCC=0;
            reorderStart=str.length();
            return this;
        }
        /** Appends code point c, assuming cc(c)==0. */
        public void appendZeroCC(int c) {
            str.appendCodePoint(c);
            lastCC=0;
            reorderStart=str.length();
        }
        // @Override when we switch to Java 6
        @Override
        public ReorderingBuffer append(CharSequence s) {
            if(s.length()!=0) {
                str.append(s);
                lastCC=0;
                reorderStart=str.length();
            }
            return this;
        }
        // @Override when we switch to Java 6
        @Override
        public ReorderingBuffer append(CharSequence s, int start, int limit) {
            if(start!=limit) {
                str.append(s, start, limit);
                lastCC=0;
                reorderStart=str.length();
            }
            return this;
        }
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Used after recomposition.
         * Must be called at the end when writing to a non-StringBuilder Appendable.
         */
        public void flush() {
            if(appIsStringBuilder) {
                reorderStart=str.length();
            } else {
                try {
                    app.append(str);
                    str.setLength(0);
                    reorderStart=0;
                } catch(IOException e) {
                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
                }
            }
            lastCC=0;
        }
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Then appends the new text to the Appendable or StringBuilder.
         * Normally used after quick check loops find a non-empty sequence.
         */
        public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
            if(appIsStringBuilder) {
                str.append(s, start, limit);
                reorderStart=str.length();
            } else {
                try {
                    app.append(str).append(s, start, limit);
                    str.setLength(0);
                    reorderStart=0;
                } catch(IOException e) {
                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
                }
            }
            lastCC=0;
            return this;
        }
        /** Clears the intermediate StringBuilder and resets the reordering state. */
        public void remove() {
            str.setLength(0);
            lastCC=0;
            reorderStart=0;
        }
        /** Removes the last suffixLength chars; resets lastCC/reorderStart conservatively. */
        public void removeSuffix(int suffixLength) {
            int oldLength=str.length();
            str.delete(oldLength-suffixLength, oldLength);
            lastCC=0;
            reorderStart=str.length();
        }

        /*
         * TODO: Revisit whether it makes sense to track reorderStart.
         * It is set to after the last known character with cc<=1,
         * which stops previousCC() before it reads that character and looks up its cc.
         * previousCC() is normally only called from insert().
         * In other words, reorderStart speeds up the insertion of a combining mark
         * into a multi-combining mark sequence where it does not belong at the end.
         * This might not be worth the trouble.
         * On the other hand, it's not a huge amount of trouble.
         *
         * We probably need it for UNORM_SIMPLE_APPEND.
         */

        // Inserts c somewhere before the last character.
        // Requires 0<cc<lastCC which implies reorderStart<limit.
        private void insert(int c, int cc) {
            // Walk backward past characters with cc greater than the new cc.
            for(setIterator(), skipPrevious(); previousCC()>cc;) {}
            // insert c at codePointLimit, after the character with prevCC<=cc
            if(c<=0xffff) {
                str.insert(codePointLimit, (char)c);
                if(cc<=1) {
                    reorderStart=codePointLimit+1;
                }
            } else {
                str.insert(codePointLimit, Character.toChars(c));
                if(cc<=1) {
                    reorderStart=codePointLimit+2;
                }
            }
        }

        private final Normalizer2Impl impl;   // for norm16/cc lookups
        private final Appendable app;         // final output target
        private final StringBuilder str;      // working buffer (== app when app is a StringBuilder)
        private final boolean appIsStringBuilder;
        private int reorderStart;             // offset after the last known character with cc<=1
        private int lastCC;                   // cc of the last code point in str

        // private backward iterator
        private void setIterator() { codePointStart=str.length(); }
        private void skipPrevious() {  // Requires 0<codePointStart.
            codePointLimit=codePointStart;
            codePointStart=str.offsetByCodePoints(codePointStart, -1);
        }
        private int previousCC() {  // Returns 0 if there is no previous character.
            codePointLimit=codePointStart;
            if(reorderStart>=codePointStart) {
                return 0;
            }
            int c=str.codePointBefore(codePointStart);
            codePointStart-=Character.charCount(c);
            if(c<MIN_CCC_LCCC_CP) {
                // Below U+0300 there are no characters with lccc/ccc data (see MIN_CCC_LCCC_CP).
                return 0;
            }
            return getCCFromYesOrMaybe(impl.getNorm16(c));
        }

        private int codePointStart, codePointLimit;
    }

    // TODO: Propose as public API on the UTF16 class.
    // TODO: Propose widening UTF16 methods that take char to take int.
    // TODO: Propose widening UTF16 methods that take String to take CharSequence.
    public static final class UTF16Plus {
        /**
         * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
         * is it a lead surrogate?
         * @param c code unit or code point
         * @return true or false
         */
        public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
        /**
         * Compares two CharSequence objects for binary equality.
         * @param s1 first sequence
         * @param s2 second sequence
         * @return true if s1 contains the same text as s2
         */
        public static boolean equal(CharSequence s1, CharSequence s2) {
            if(s1==s2) {
                return true;
            }
            int length=s1.length();
            if(length!=s2.length()) {
                return false;
            }
            for(int i=0; i<length; ++i) {
                if(s1.charAt(i)!=s2.charAt(i)) {
                    return false;
                }
            }
            return true;
        }
        /**
         * Compares two CharSequence subsequences for binary equality.
         * @param s1 first sequence
         * @param start1 start offset in first sequence
         * @param limit1 limit offset in first sequence
         * @param s2 second sequence
         * @param start2 start offset in second sequence
         * @param limit2 limit offset in second sequence
         * @return true if s1.subSequence(start1, limit1) contains the same text
         *         as s2.subSequence(start2, limit2)
         */
        public static boolean equal(CharSequence s1, int start1, int limit1,
                                    CharSequence s2, int start2, int limit2) {
            if((limit1-start1)!=(limit2-start2)) {
                return false;
            }
            if(s1==s2 && start1==start2) {
                return true;
            }
            while(start1<limit1) {
                if(s1.charAt(start1++)!=s2.charAt(start2++)) {
                    return false;
                }
            }
            return true;
        }
    }

    public Normalizer2Impl() {}

    private static final class IsAcceptable implements ICUBinary.Authenticate {
        // @Override when we switch to Java 6
        @Override
        public boolean isDataVersionAcceptable(byte version[]) {
            // Only data format version 2 is supported.
            return version[0]==2;
        }
    }
    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"

    /**
     * Loads binary "Nrm2" normalization data from bytes:
     * the indexes, the normTrie, the composition/mapping extra data,
     * the smallFCD bitmap, and the derived tccc180[] fast-path table.
     * @param bytes serialized normalization data, positioned at the generic header
     * @return this
     * @throws ICUUncheckedIOException if the data is malformed or truncated
     */
    public Normalizer2Impl load(ByteBuffer bytes) {
        try {
            dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
            int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
            if(indexesLength<=IX_MIN_MAYBE_YES) {
                throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
            }
            int[] inIndexes=new int[indexesLength];
            inIndexes[0]=indexesLength*4;
            for(int i=1; i<indexesLength; ++i) {
                inIndexes[i]=bytes.getInt();
            }

            minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
            minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];

            minYesNo=inIndexes[IX_MIN_YES_NO];
            minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
            minNoNo=inIndexes[IX_MIN_NO_NO];
            limitNoNo=inIndexes[IX_LIMIT_NO_NO];
            minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];

            // Read the normTrie.
            int offset=inIndexes[IX_NORM_TRIE_OFFSET];
            int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
            normTrie=Trie2_16.createFromSerialized(bytes);
            int trieLength=normTrie.getSerializedLength();
            if(trieLength>(nextOffset-offset)) {
                throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
            }
            ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes

            // Read the composition and mapping data.
            offset=nextOffset;
            nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
            int numChars=(nextOffset-offset)/2;
            if(numChars!=0) {
                maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
                // extraData is the tail of maybeYesCompositions, offset so that
                // norm16 values index it directly.
                extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
            }

            // smallFCD: new in formatVersion 2
            offset=nextOffset;
            smallFCD=new byte[0x100];
            bytes.get(smallFCD);

            // Build tccc180[].
            // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
            tccc180=new int[0x180];
            int bits=0;
            for(int c=0; c<0x180; bits>>=1) {
                if((c&0xff)==0) {
                    bits=smallFCD[c>>8];  // one byte per 0x100 code points
                }
                if((bits&1)!=0) {
                    // This 0x20-code-point chunk may have non-zero FCD data: look it up.
                    for(int i=0; i<0x20; ++i, ++c) {
                        tccc180[c]=getFCD16FromNormData(c)&0xff;
                    }
                } else {
                    // Whole chunk has zero FCD data; tccc180[] entries stay 0.
                    c+=0x20;
                }
            }

            return this;
        } catch(IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }
    /** Loads normalization data from the named ICU data resource. */
    public Normalizer2Impl load(String name) {
        return load(ICUBinary.getRequiredData(name));
    }

    // Adds to set the code points in [start, end] whose FCD16 value has a
    // non-zero lccc (high byte), i.e. fcd16>0xff.
    private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
        if(isAlgorithmicNoNo(norm16)) {
            // Range of code points with same-norm16-value algorithmic decompositions.
            // They might have different non-zero FCD16 values.
            do {
                int fcd16=getFCD16(start);
                if(fcd16>0xff) { set.add(start); }
            } while(++start<=end);
        } else {
            int fcd16=getFCD16(start);
            if(fcd16>0xff) { set.add(start, end); }
        }
    }

    // Adds the property-start code points of a same-norm16-value trie range.
    private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
        /* add the start code point to the USet */
        set.add(start);
        if(start!=end && isAlgorithmicNoNo(value)) {
            // Range of code points with same-norm16-value algorithmic decompositions.
            // They might have different non-zero FCD16 values.
            int prevFCD16=getFCD16(start);
            while(++start<=end) {
                int fcd16=getFCD16(start);
                if(fcd16!=prevFCD16) {
                    set.add(start);
                    prevFCD16=fcd16;
                }
            }
        }
    }

    /** Adds to set the characters with lccc!=0 (FCD16 value above 0xff). */
    public void addLcccChars(UnicodeSet set) {
        /* add the start code point of each same-value range of each trie */
        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
        }
    }

    /** Adds to set the start code point of each same-value range of the normTrie. */
    public void addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of each trie */
        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
        }

        /* add Hangul LV syllables and LV+1 because of skippables */
        for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
            set.add(c);
            set.add(c+1);
        }
        set.add(Hangul.HANGUL_LIMIT);  /* add Hangul+1 to continue with other properties */
    }

    /** Adds to set the range starts of the canonical-iterator data trie. */
    public void addCanonIterPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the canonical iterator data trie */
        ensureCanonIterData();
        // currently only used for the SEGMENT_STARTER property
        Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            /* add the start code point to the USet */
            set.add(range.startCodePoint);
        }
    }
    // Maps canonIterData values to just the CANON_NOT_SEGMENT_STARTER bit,
    // collapsing ranges that agree on that bit.
    private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
        @Override
        public int map(int in) {
            return in&CANON_NOT_SEGMENT_STARTER;
        }
    };

    // low-level properties ------------------------------------------------ ***

    public Trie2_16 getNormTrie() { return normTrie; }

    // Note: Normalizer2Impl.java r30983 (2011-nov-27)
    // still had getFCDTrie() which built and cached an FCD trie.
    // That provided faster access to FCD data than getFCD16FromNormData()
    // but required synchronization and consumed some 10kB of heap memory
    // in any process that uses FCD (e.g., via collation).
    // tccc180[] and smallFCD[] are intended to help with any loss of performance,
    // at least for Latin & CJK.

    /**
     * Builds the canonical-iterator data for this instance.
     * This is required before any of {@link #isCanonSegmentStarter(int)} or
     * {@link #getCanonStartSet(int, UnicodeSet)} are called,
     * or else they crash.
     * @return this
     */
    public synchronized Normalizer2Impl ensureCanonIterData() {
        if(canonIterData==null) {
            Trie2Writable newData=new Trie2Writable(0, 0);
            canonStartSets=new ArrayList<UnicodeSet>();
            Iterator<Trie2.Range> trieIterator=normTrie.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                final int norm16=range.value;
                if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
                    // Inert, or 2-way mapping (including Hangul syllable).
                    // We do not write a canonStartSet for any yesNo character.
                    // Composites from 2-way mappings are added at runtime from the
                    // starter's compositions list, and the other characters in
                    // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
                    // "maybe" characters.
                    continue;
                }
                for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
                    final int oldValue=newData.get(c);
                    int newValue=oldValue;
                    if(norm16>=minMaybeYes) {
                        // not a segment starter if it occurs in a decomposition or has cc!=0
                        newValue|=CANON_NOT_SEGMENT_STARTER;
                        if(norm16<MIN_NORMAL_MAYBE_YES) {
                            newValue|=CANON_HAS_COMPOSITIONS;
                        }
                    } else if(norm16<minYesNo) {
                        newValue|=CANON_HAS_COMPOSITIONS;
                    } else {
                        // c has a one-way decomposition
                        // Follow algorithmic mappings until a stable norm16 is reached.
                        int c2=c;
                        int norm16_2=norm16;
                        while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
                            c2=this.mapAlgorithmic(c2, norm16_2);
                            norm16_2=getNorm16(c2);
                        }
                        if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
                            // c decomposes, get everything from the variable-length extra data
                            int firstUnit=extraData.charAt(norm16_2);
                            int length=firstUnit&MAPPING_LENGTH_MASK;
                            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                                if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) {
                                    newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                                }
                            }
                            // Skip empty mappings (no characters in the decomposition).
                            if(length!=0) {
                                ++norm16_2;  // skip over the firstUnit
                                // add c to first code point's start set
                                int limit=norm16_2+length;
                                c2=extraData.codePointAt(norm16_2);
                                addToStartSet(newData, c, c2);
                                // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                                // one-way mapping. A 2-way mapping is possible here after
                                // intermediate algorithmic mapping.
                                if(norm16_2>=minNoNo) {
                                    while((norm16_2+=Character.charCount(c2))<limit) {
                                        c2=extraData.codePointAt(norm16_2);
                                        int c2Value=newData.get(c2);
                                        if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                            newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
                                        }
                                    }
                                }
                            }
                        } else {
                            // c decomposed to c2 algorithmically; c has cc==0
                            addToStartSet(newData, c, c2);
                        }
                    }
                    if(newValue!=oldValue) {
                        newData.set(c, newValue);
                    }
                }
            }
            canonIterData=newData.toTrie2_32();
        }
        return this;
    }

    public int getNorm16(int c) { return normTrie.get(c); }

    /** Returns the composition quick check value: 1=yes, 2=maybe, 0=no. */
    public int getCompQuickCheck(int norm16) {
        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
            return 1;  // yes
        } else if(minMaybeYes<=norm16) {
            return 2;  // maybe
        } else {
            return 0;  // no
        }
    }
    public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
    public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
    public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }

    /** Returns the canonical combining class encoded in norm16. */
    public int getCC(int norm16) {
        if(norm16>=MIN_NORMAL_MAYBE_YES) {
            return norm16&0xff;
        }
        if(norm16<minNoNo || limitNoNo<=norm16) {
            return 0;
        }
        return getCCFromNoNo(norm16);
    }
    /** Fast cc lookup when norm16 is known to be a "yes" or "maybe" value. */
    public static int getCCFromYesOrMaybe(int norm16) {
        return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
    }

    /**
     * Returns the FCD data for code point c.
     * @param c A Unicode code point.
     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
     */
    public int getFCD16(int c) {
        if(c<0) {
            return 0;
        } else if(c<0x180) {
            return tccc180[c];  // precomputed fast path for Latin
        } else if(c<=0xffff) {
            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
        }
        return getFCD16FromNormData(c);
    }
    /** Returns the FCD data for U+0000<=c<U+0180. */
    public int getFCD16FromBelow180(int c) { return tccc180[c]; }
    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
    public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
        // 0<=lead<=0xffff
        byte bits=smallFCD[lead>>8];
        if(bits==0) { return false; }
        // One bit per 0x20 code points within the 0x100-code-point chunk.
        return ((bits>>((lead>>5)&7))&1)!=0;
    }

    /** Gets the FCD value from the regular normalization data. */
    public int getFCD16FromNormData(int c) {
        // Only loops for 1:1 algorithmic mappings.
        for(;;) {
            int norm16=getNorm16(c);
            if(norm16<=minYesNo) {
                // no decomposition or Hangul syllable, all zeros
                return 0;
            } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
                // combining mark
                norm16&=0xff;
                return norm16|(norm16<<8);
            } else if(norm16>=minMaybeYes) {
                return 0;
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                    // A character that is deleted (maps to an empty string) must
                    // get the worst-case lccc and tccc values because arbitrary
                    // characters on both sides will become adjacent.
                    return 0x1ff;
                } else {
                    int fcd16=firstUnit>>8;  // tccc
                    if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                        fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
                    }
                    return fcd16;
                }
            }
        }
    }

    /**
     * Gets the decomposition for one code point.
     * @param c code point
     * @return c's decomposition, if it has one; returns null if it does not have a decomposition
     */
    public String getDecomposition(int c) {
        int decomp=-1;
        int norm16;
        for(;;) {
            if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
                // c does not decompose
            } else if(isHangul(norm16)) {
                // Hangul syllable: decompose algorithmically
                StringBuilder buffer=new StringBuilder();
                Hangul.decompose(c, buffer);
                return buffer.toString();
            } else if(isDecompNoAlgorithmic(norm16)) {
                decomp=c=mapAlgorithmic(c, norm16);
                continue;  // the mapped code point may decompose further
            } else {
                // c decomposes, get everything from the variable-length extra data
                int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
                return extraData.substring(norm16, norm16+length);
            }
            if(decomp<0) {
                return null;
            } else {
                return UTF16.valueOf(decomp);
            }
        }
    }

    /**
     * Gets the raw decomposition for one code point.
     * @param c code point
     * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
     */
    public String getRawDecomposition(int c) {
        // We do not loop in this method because an algorithmic mapping itself
        // becomes a final result rather than having to be decomposed recursively.
        int norm16;
        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
            // c does not decompose
            return null;
        } else if(isHangul(norm16)) {
            // Hangul syllable: decompose algorithmically
            StringBuilder buffer=new StringBuilder();
            Hangul.getRawDecomposition(c, buffer);
            return buffer.toString();
        } else if(isDecompNoAlgorithmic(norm16)) {
            return UTF16.valueOf(mapAlgorithmic(c, norm16));
        } else {
            // c decomposes, get everything from the variable-length extra data
            int firstUnit=extraData.charAt(norm16);
            int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
            if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
                // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
                // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
                int rawMapping=norm16-((firstUnit>>7)&1)-1;
                char rm0=extraData.charAt(rawMapping);
                if(rm0<=MAPPING_LENGTH_MASK) {
                    // rm0 is a length: the raw mapping is stored explicitly before it.
                    return extraData.substring(rawMapping-rm0, rawMapping);
                } else {
                    // Copy the normal mapping and replace its first two code units with rm0.
                    StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
                    norm16+=1+2;  // skip over the firstUnit and the first two mapping code units
                    return buffer.append(extraData, norm16, norm16+mLength-2).toString();
                }
            } else {
                norm16+=1;  // skip over the firstUnit
                return extraData.substring(norm16, norm16+mLength);
            }
        }
    }

    /**
     * Returns true if code point c starts a canonical-iterator string segment.
     * <b>{@link #ensureCanonIterData()} must have been called before this method,
     * or else this method will crash.</b>
     * @param c A Unicode code point.
     * @return true if c starts a canonical-iterator string segment.
     */
    public boolean isCanonSegmentStarter(int c) {
        return canonIterData.get(c)>=0;
    }
    /**
     * Returns true if there are characters whose decomposition starts with c.
     * If so, then the set is cleared and then filled with those characters.
     * <b>{@link #ensureCanonIterData()} must have been called before this method,
     * or else this method will crash.</b>
     * @param c A Unicode code point.
     * @param set A UnicodeSet to receive the characters whose decompositions
     *        start with c, if there are any.
     * @return true if there are characters whose decomposition starts with c.
     */
    public boolean getCanonStartSet(int c, UnicodeSet set) {
        int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
        if(canonValue==0) {
            return false;
        }
        set.clear();
        int value=canonValue&CANON_VALUE_MASK;
        if((canonValue&CANON_HAS_SET)!=0) {
            // value is an index into canonStartSets
            set.addAll(canonStartSets.get(value));
        } else if(value!=0) {
            // value is a single code point
            set.add(value);
        }
        if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
            int norm16=getNorm16(c);
            if(norm16==JAMO_L) {
                // A leading-consonant Jamo composes with all syllables of its L row.
                int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
                set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
            } else {
                addComposites(getCompositionsList(norm16), set);
            }
        }
        return true;
    }

    // Code points below this have neither lccc nor ccc data in the normalization data.
    public static final int MIN_CCC_LCCC_CP=0x300;

    public static final int MIN_YES_YES_WITH_CC=0xff01;
    public static final int JAMO_VT=0xff00;
    public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
    public static final int JAMO_L=1;
    public static final int MAX_DELTA=0x40;

    // Byte offsets from the start of the data, after the generic header.
    public static final int IX_NORM_TRIE_OFFSET=0;
    public static final int IX_EXTRA_DATA_OFFSET=1;
    public static final int IX_SMALL_FCD_OFFSET=2;
    public static final int IX_RESERVED3_OFFSET=3;
    public static final int IX_TOTAL_SIZE=7;

    // Code point thresholds for quick check codes.
899 public static final int IX_MIN_DECOMP_NO_CP=8; 900 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 901 902 // Norm16 value thresholds for quick check combinations and types of extra data. 903 // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 904 public static final int IX_MIN_YES_NO=10; 905 public static final int IX_MIN_NO_NO=11; 906 public static final int IX_LIMIT_NO_NO=12; 907 public static final int IX_MIN_MAYBE_YES=13; 908 909 // Mappings only in [minYesNoMappingsOnly..minNoNo[. 910 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 911 912 public static final int IX_COUNT=16; 913 914 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 915 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 916 public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20; 917 public static final int MAPPING_LENGTH_MASK=0x1f; 918 919 public static final int COMP_1_LAST_TUPLE=0x8000; 920 public static final int COMP_1_TRIPLE=1; 921 public static final int COMP_1_TRAIL_LIMIT=0x3400; 922 public static final int COMP_1_TRAIL_MASK=0x7ffe; 923 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 924 public static final int COMP_2_TRAIL_SHIFT=6; 925 public static final int COMP_2_TRAIL_MASK=0xffc0; 926 927 // higher-level functionality ------------------------------------------ *** 928 929 // NFD without an NFD Normalizer2 instance. 930 public Appendable decompose(CharSequence s, StringBuilder dest) { 931 decompose(s, 0, s.length(), dest, s.length()); 932 return dest; 933 } 934 /** 935 * Decomposes s[src, limit[ and writes the result to dest. 936 * limit can be NULL if src is NUL-terminated. 937 * destLengthEstimate is the initial dest buffer capacity and can be -1. 
     */
    public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
                          int destLengthEstimate) {
        if(destLengthEstimate<0) {
            destLengthEstimate=limit-src;
        }
        dest.setLength(0);
        ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
        decompose(s, src, limit, buffer);
    }

    // Dual functionality:
    // buffer!=NULL: normalize
    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
    // Returns the end of the processed text (normalize) or the quick-check span end.
    public int decompose(CharSequence s, int src, int limit,
                         ReorderingBuffer buffer) {
        int minNoCP=minDecompNoCP;

        int prevSrc;
        int c=0;
        int norm16=0;

        // only for quick check
        int prevBoundary=src;
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src; src!=limit;) {
                if( (c=s.charAt(src))<minNoCP ||
                    isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                ) {
                    ++src;
                } else if(!UTF16.isSurrogate((char)c)) {
                    break;
                } else {
                    char c2;
                    if(UTF16Plus.isSurrogateLead(c)) {
                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                            c=Character.toCodePoint((char)c, c2);
                        }
                    } else /* trail surrogate */ {
                        // Back up to pair with the preceding lead surrogate, if any.
                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                            --src;
                            c=Character.toCodePoint(c2, (char)c);
                        }
                    }
                    if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(buffer!=null) {
                    buffer.flushAndAppendZeroCC(s, prevSrc, src);
                } else {
                    prevCC=0;
                    prevBoundary=src;
                }
            }
            if(src==limit) {
                break;
            }

            // Check one above-minimum, relevant code point.
            src+=Character.charCount(c);
            if(buffer!=null) {
                decompose(c, norm16, buffer);
            } else {
                if(isDecompYes(norm16)) {
                    int cc=getCCFromYesOrMaybe(norm16);
                    if(prevCC<=cc || cc==0) {
                        prevCC=cc;
                        if(cc<=1) {
                            prevBoundary=src;
                        }
                        continue;
                    }
                }
                return prevBoundary;  // "no" or cc out of order
            }
        }
        return src;
    }

    /**
     * Appends s to the buffer, decomposing it if doDecompose,
     * otherwise merging it with the buffer contents at the boundary
     * (canonical reordering across the seam).
     */
    public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
        int limit=s.length();
        if(limit==0) {
            return;
        }
        if(doDecompose) {
            decompose(s, 0, limit, buffer);
            return;
        }
        // Just merge the strings at the boundary.
        // Skip over the longest prefix of characters with cc!=0, remembering
        // the first and last cc so that append() can reorder across the seam.
        int c=Character.codePointAt(s, 0);
        int src=0;
        int firstCC, prevCC, cc;
        firstCC=prevCC=cc=getCC(getNorm16(c));
        while(cc!=0) {
            prevCC=cc;
            src+=Character.charCount(c);
            if(src>=limit) {
                break;
            }
            c=Character.codePointAt(s, src);
            cc=getCC(getNorm16(c));
        };
        buffer.append(s, 0, src, firstCC, prevCC);
        buffer.append(s, src, limit);
    }

    // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
    // doCompose: normalize
    // !doCompose: isNormalized (buffer must be empty and initialized)
    public boolean compose(CharSequence s, int src, int limit,
                           boolean onlyContiguous,
                           boolean doCompose,
                           ReorderingBuffer buffer) {
        int minNoMaybeCP=minCompNoMaybeCP;

        /*
         * prevBoundary points to the last character before the current one
         * that has a composition boundary before it with ccc==0 and quick check "yes".
         * Keeping track of prevBoundary saves us looking for a composition boundary
         * when we find a "no" or "maybe".
         *
         * When we back out from prevSrc back to prevBoundary,
         * then we also remove those same characters (which had been simply copied
         * or canonically-order-inserted) from the ReorderingBuffer.
         * Therefore, at all times, the [prevBoundary..prevSrc[ source units
         * must correspond 1:1 to destination units at the end of the destination buffer.
         */
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int norm16=0;

        // only for isNormalized
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src; src!=limit;) {
                if( (c=s.charAt(src))<minNoMaybeCP ||
                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                ) {
                    ++src;
                } else if(!UTF16.isSurrogate((char)c)) {
                    break;
                } else {
                    char c2;
                    if(UTF16Plus.isSurrogateLead(c)) {
                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                            c=Character.toCodePoint((char)c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                            --src;
                            c=Character.toCodePoint(c2, (char)c);
                        }
                    }
                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(src==limit) {
                    if(doCompose) {
                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
                    }
                    break;
                }
                // Set prevBoundary to the last character in the quick check loop.
                prevBoundary=src-1;
                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
                    Character.isHighSurrogate(s.charAt(prevBoundary-1))
                ) {
                    --prevBoundary;
                }
                if(doCompose) {
                    // The last "quick check yes" character is excluded from the
                    // flush-and-append call in case it needs to be modified.
                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
                    buffer.append(s, prevBoundary, src);
                } else {
                    prevCC=0;
                }
                // The start of the current character (c).
                prevSrc=src;
            } else if(src==limit) {
                break;
            }

            src+=Character.charCount(c);
            /*
             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
             * or has ccc!=0.
             * Check for Jamo V/T, then for regular characters.
             * c is not a Hangul syllable or Jamo L because those have "yes" properties.
             */
            if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
                char prev=s.charAt(prevSrc-1);
                boolean needToDecompose=false;
                if(c<Hangul.JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    prev-=Hangul.JAMO_L_BASE;
                    if(prev<Hangul.JAMO_L_COUNT) {
                        if(!doCompose) {
                            return false;
                        }
                        char syllable=(char)
                            (Hangul.HANGUL_BASE+
                             (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
                             Hangul.JAMO_T_COUNT);
                        char t;
                        if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
                            ++src;
                            syllable+=t;  // The next character was a Jamo T.
                            prevBoundary=src;
                            buffer.setLastChar(syllable);
                            continue;
                        }
                        // If we see L+V+x where x!=T then we drop to the slow path,
                        // decompose and recompose.
                        // This is to deal with NFKC finding normal L and V but a
                        // compatibility variant of a T. We need to either fully compose that
                        // combination here (which would complicate the code and may not work
                        // with strange custom data) or use the slow path -- or else our replacing
                        // two input characters (L+V) with one output character (LV syllable)
                        // would violate the invariant that [prevBoundary..prevSrc[ has the same
                        // length as what we appended to the buffer since prevBoundary.
                        needToDecompose=true;
                    }
                } else if(Hangul.isHangulWithoutJamoT(prev)) {
                    // c is a Jamo Trailing consonant,
                    // compose with previous Hangul LV that does not contain a Jamo T.
                    if(!doCompose) {
                        return false;
                    }
                    buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
                    prevBoundary=src;
                    continue;
                }
                if(!needToDecompose) {
                    // The Jamo V/T did not compose into a Hangul syllable.
                    if(doCompose) {
                        buffer.append((char)c);
                    } else {
                        prevCC=0;
                    }
                    continue;
                }
            }
            /*
             * Source buffer pointers:
             *
             *  all done      quick check   current char  not yet
             *                "yes" but     (c)           processed
             *                may combine
             *                forward
             * [-------------[-------------[-------------[-------------[
             * |             |             |             |             |
             * orig. src     prevBoundary  prevSrc       src           limit
             *
             *
             * Destination buffer pointers inside the ReorderingBuffer:
             *
             *  all done      might take    not filled yet
             *                characters for
             *                reordering
             * [-------------[-------------[-------------[
             * |             |             |             |
             * start         reorderStart  limit         |
             *                             +remainingCap.+
             */
            if(norm16>=MIN_YES_YES_WITH_CC) {
                int cc=norm16&0xff;  // cc!=0
                if( onlyContiguous &&  // FCC
                    (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                    prevBoundary<prevSrc &&
                    // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                    // passed the quick check "yes && ccc==0" test.
                    // Check whether the last character was a "yesYes" or a "yesNo".
                    // If a "yesNo", then we get its trailing ccc from its
                    // mapping and check for canonical order.
                    // All other cases are ok.
                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
                ) {
                    // Fails FCD test, need to decompose and contiguously recompose.
                    if(!doCompose) {
                        return false;
                    }
                } else if(doCompose) {
                    buffer.append(c, cc);
                    continue;
                } else if(prevCC<=cc) {
                    prevCC=cc;
                    continue;
                } else {
                    return false;
                }
            } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
                return false;
            }

            /*
             * Find appropriate boundaries around this character,
             * decompose the source text from between the boundaries,
             * and recompose it.
             *
             * We may need to remove the last few characters from the ReorderingBuffer
             * to account for source text that was copied or appended
             * but needs to take part in the recomposition.
             */

            /*
             * Find the last composition boundary in [prevBoundary..src[.
             * It is either the decomposition of the current character (at prevSrc),
             * or prevBoundary.
             */
            if(hasCompBoundaryBefore(c, norm16)) {
                prevBoundary=prevSrc;
            } else if(doCompose) {
                buffer.removeSuffix(prevSrc-prevBoundary);
            }

            // Find the next composition boundary in [src..limit[ -
            // modifies src to point to the next starter.
            src=findNextCompBoundary(s, src, limit);

            // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
            int recomposeStartIndex=buffer.length();
            decomposeShort(s, prevBoundary, src, buffer);
            recompose(buffer, recomposeStartIndex, onlyContiguous);
            if(!doCompose) {
                if(!buffer.equals(s, prevBoundary, src)) {
                    return false;
                }
                buffer.remove();
                prevCC=0;
            }

            // Move to the next starter. We never need to look back before this point again.
            prevBoundary=src;
        }
        return true;
    }
    /**
     * Very similar to compose(): Make the same changes in both places if relevant.
     * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
     * !doSpan: quickCheck
     * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
     *         bit 0: set if "maybe"; otherwise, if the span length<s.length()
     *         then the quick check result is "no"
     */
    public int composeQuickCheck(CharSequence s, int src, int limit,
                                 boolean onlyContiguous, boolean doSpan) {
        int qcResult=0;
        int minNoMaybeCP=minCompNoMaybeCP;

        /*
         * prevBoundary points to the last character before the current one
         * that has a composition boundary before it with ccc==0 and quick check "yes".
         */
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int norm16=0;
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src;;) {
                if(src==limit) {
                    return (src<<1)|qcResult;  // "yes" or "maybe"
                }
                if( (c=s.charAt(src))<minNoMaybeCP ||
                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                ) {
                    ++src;
                } else if(!UTF16.isSurrogate((char)c)) {
                    break;
                } else {
                    char c2;
                    if(UTF16Plus.isSurrogateLead(c)) {
                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                            c=Character.toCodePoint((char)c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                            --src;
                            c=Character.toCodePoint(c2, (char)c);
                        }
                    }
                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            if(src!=prevSrc) {
                // Set prevBoundary to the last character in the quick check loop.
                prevBoundary=src-1;
                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
                    Character.isHighSurrogate(s.charAt(prevBoundary-1))
                ) {
                    --prevBoundary;
                }
                prevCC=0;
                // The start of the current character (c).
                prevSrc=src;
            }

            src+=Character.charCount(c);
            /*
             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
             * or has ccc!=0.
             */
            if(isMaybeOrNonZeroCC(norm16)) {
                int cc=getCCFromYesOrMaybe(norm16);
                if( onlyContiguous &&  // FCC
                    cc!=0 &&
                    prevCC==0 &&
                    prevBoundary<prevSrc &&
                    // prevCC==0 && prevBoundary<prevSrc tell us that
                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                    // passed the quick check "yes && ccc==0" test.
                    // Check whether the last character was a "yesYes" or a "yesNo".
                    // If a "yesNo", then we get its trailing ccc from its
                    // mapping and check for canonical order.
                    // All other cases are ok.
                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
                ) {
                    // Fails FCD test.
                } else if(prevCC<=cc || cc==0) {
                    prevCC=cc;
                    if(norm16<MIN_YES_YES_WITH_CC) {
                        if(!doSpan) {
                            qcResult=1;
                        } else {
                            return prevBoundary<<1;  // spanYes does not care to know it's "maybe"
                        }
                    }
                    continue;
                }
            }
            return prevBoundary<<1;  // "no"
        }
    }

    /**
     * Appends s to the buffer, composing it if doCompose.
     * If the buffer is not empty, then the tail of the buffer (back to its last
     * composition boundary) and the head of s (up to its first composition boundary)
     * are re-composed together so that the seam is normalized.
     */
    public void composeAndAppend(CharSequence s,
                                 boolean doCompose,
                                 boolean onlyContiguous,
                                 ReorderingBuffer buffer) {
        int src=0, limit=s.length();
        if(!buffer.isEmpty()) {
            int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
            if(0!=firstStarterInSrc) {
                int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
                                                               buffer.length());
                StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
                                                       firstStarterInSrc+16);
                middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
                buffer.removeSuffix(buffer.length()-lastStarterInDest);
                middle.append(s, 0, firstStarterInSrc);
                compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
                src=firstStarterInSrc;
            }
        }
        if(doCompose) {
            compose(s, src, limit, onlyContiguous, true, buffer);
        } else {
            buffer.append(s, src, limit);
        }
    }

    // Dual functionality:
    // buffer!=NULL: normalize
    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
    public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
        // Note: In this function we use buffer->appendZeroCC() because we track
        // the lead and trail combining classes here, rather than leaving it to
        // the ReorderingBuffer.
        // The exception is the call to decomposeShort() which uses the buffer
        // in the normal way.

        // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
        // Similar to the prevBoundary in the compose() implementation.
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int prevFCD16=0;
        int fcd16=0;

        for(;;) {
            // count code units with lccc==0
            for(prevSrc=src; src!=limit;) {
                if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
                    // Defer the FCD lookup: store ~c so it can be recovered below
                    // only if it turns out to be needed (prevFCD16<0 marks "deferred").
                    prevFCD16=~c;
                    ++src;
                } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                    prevFCD16=0;
                    ++src;
                } else {
                    if(UTF16.isSurrogate((char)c)) {
                        char c2;
                        if(UTF16Plus.isSurrogateLead(c)) {
                            if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                                c=Character.toCodePoint((char)c, c2);
                            }
                        } else /* trail surrogate */ {
                            if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                                --src;
                                c=Character.toCodePoint(c2, (char)c);
                            }
                        }
                    }
                    if((fcd16=getFCD16FromNormData(c))<=0xff) {
                        // lccc==0 (high byte of fcd16 is the lead combining class)
                        prevFCD16=fcd16;
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(src==limit) {
                    if(buffer!=null) {
                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
                    }
                    break;
                }
                prevBoundary=src;
                // We know that the previous character's lccc==0.
                if(prevFCD16<0) {
                    // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                    int prev=~prevFCD16;
                    prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                    if(prevFCD16>1) {
                        --prevBoundary;
                    }
                } else {
                    int p=src-1;
                    if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
                        Character.isHighSurrogate(s.charAt(p-1))
                    ) {
                        --p;
                        // Need to fetch the previous character's FCD value because
                        // prevFCD16 was just for the trail surrogate code point.
                        prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
                        // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                    }
                    if(prevFCD16>1) {
                        prevBoundary=p;
                    }
                }
                if(buffer!=null) {
                    // The last lccc==0 character is excluded from the
                    // flush-and-append call in case it needs to be modified.
                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
                    buffer.append(s, prevBoundary, src);
                }
                // The start of the current character (c).
                prevSrc=src;
            } else if(src==limit) {
                break;
            }

            src+=Character.charCount(c);
            // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
            // Check for proper order, and decompose locally if necessary.
            if((prevFCD16&0xff)<=(fcd16>>8)) {
                // proper order: prev tccc <= current lccc
                if((fcd16&0xff)<=1) {
                    prevBoundary=src;
                }
                if(buffer!=null) {
                    buffer.appendZeroCC(c);
                }
                prevFCD16=fcd16;
                continue;
            } else if(buffer==null) {
                return prevBoundary;  // quick check "no"
            } else {
                /*
                 * Back out the part of the source that we copied or appended
                 * already but is now going to be decomposed.
                 * prevSrc is set to after what was copied/appended.
                 */
                buffer.removeSuffix(prevSrc-prevBoundary);
                /*
                 * Find the part of the source that needs to be decomposed,
                 * up to the next safe boundary.
                 */
                src=findNextFCDBoundary(s, src, limit);
                /*
                 * The source text does not fulfill the conditions for FCD.
                 * Decompose and reorder a limited piece of the text.
                 */
                decomposeShort(s, prevBoundary, src, buffer);
                prevBoundary=src;
                prevFCD16=0;
            }
        }
        return src;
    }

    /**
     * Appends s to the buffer, making it FCD if doMakeFCD.
     * If the buffer is not empty, then the seam between the buffer tail and the
     * head of s is re-processed so that the concatenation is FCD.
     */
    public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
        int src=0, limit=s.length();
        if(!buffer.isEmpty()) {
            int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
            if(0!=firstBoundaryInSrc) {
                int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
                                                               buffer.length());
                StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
                                                       firstBoundaryInSrc+16);
                middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
                buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
                middle.append(s, 0, firstBoundaryInSrc);
                makeFCD(middle, 0, middle.length(), buffer);
                src=firstBoundaryInSrc;
            }
        }
        if(doMakeFCD) {
            makeFCD(s, src, limit, buffer);
        } else {
            buffer.append(s, src, limit);
        }
    }

    // Note: hasDecompBoundary() could be implemented as aliases to
    // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
    // at the cost of building the FCD trie for a decomposition normalizer.
    /**
     * Tests for a decomposition boundary before (before=true) or after (before=false) c.
     * Loops only to follow 1:1 algorithmic mappings.
     */
    public boolean hasDecompBoundary(int c, boolean before) {
        for(;;) {
            if(c<minDecompNoCP) {
                return true;
            }
            int norm16=getNorm16(c);
            if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
                return true;
            } else if(norm16>MIN_NORMAL_MAYBE_YES) {
                return false;  // ccc!=0
            } else if(isDecompNoAlgorithmic(norm16)) {
                // Follow the algorithmic mapping and re-test the target character.
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                    return false;
                }
                if(!before) {
                    // decomp after-boundary: same as hasFCDBoundaryAfter(),
                    // fcd16<=1 || trailCC==0
                    if(firstUnit>0x1ff) {
                        return false;  // trailCC>1
                    }
                    if(firstUnit<=0xff) {
                        return true;  // trailCC==0
                    }
                    // if(trailCC==1) test leadCC==0, same as checking for before-boundary
                }
                // true if leadCC==0 (hasFCDBoundaryBefore())
                return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
            }
        }
    }
    public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }

    public boolean hasCompBoundaryBefore(int c) {
        return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
    }
    /**
     * Tests for a composition boundary after c.
     * testInert=true additionally requires that c does not combine at all.
     */
    public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {
        for(;;) {
            int norm16=getNorm16(c);
            if(isInert(norm16)) {
                return true;
            } else if(norm16<=minYesNo) {
                // Hangul: norm16==minYesNo
                // Hangul LVT has a boundary after it.
                // Hangul LV and non-inert yesYes characters combine forward.
                return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
            } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
                return false;
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data.
                // If testInert, then c must be a yesNo character which has lccc=0,
                // otherwise it could be a noNo.
                int firstUnit=extraData.charAt(norm16);
                // true if
                //   not MAPPING_NO_COMP_BOUNDARY_AFTER
                //     (which is set if
                //       c is not deleted, and
                //       it and its decomposition do not combine forward, and it has a starter)
                //   and if FCC then trailCC<=1
                return
                    (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
                    (!onlyContiguous || firstUnit<=0x1ff);
            }
        }
    }

    public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
    public boolean hasFCDBoundaryAfter(int c) {
        int fcd16=getFCD16(c);
        return fcd16<=1 || (fcd16&0xff)==0;
    }
    public boolean isFCDInert(int c) { return getFCD16(c)<=1; }

    // norm16 range predicates; see the IX_MIN_* index constants for the thresholds.
    private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
    private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
    private static boolean isInert(int norm16) { return norm16==0; }
    private static boolean isJamoL(int norm16) { return norm16==1; }
    private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
    private boolean isHangul(int norm16) { return norm16==minYesNo; }
    private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
    // UBool isCompYes(uint16_t norm16) const {
    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
    // }
    // UBool isCompYesOrMaybe(uint16_t norm16) const {
    //     return norm16<minNoNo || minMaybeYes<=norm16;
    // }
    // private boolean hasZeroCCFromDecompYes(int norm16) {
    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    // }
    private boolean isDecompYesAndZeroCC(int norm16) {
        return norm16<minYesNo ||
               norm16==JAMO_VT ||
               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
    }
    /**
     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
     * the MaybeYes which combine-forward and have ccc=0.
     * (Standard Unicode 5.2 normalization does not have such characters.)
     */
    private boolean isMostDecompYesAndZeroCC(int norm16) {
        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    }
    private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }

    // For use with isCompYes().
    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
    // static uint8_t getCCFromYes(uint16_t norm16) {
    //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
    // }
    private int getCCFromNoNo(int norm16) {
        // The optional ccc/lccc word precedes the mapping; its low byte is the ccc.
        if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
            return extraData.charAt(norm16-1)&0xff;
        } else {
            return 0;
        }
    }
    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
    int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
        int c;
        if(cpStart==(cpLimit-1)) {
            c=s.charAt(cpStart);
        } else {
            c=Character.codePointAt(s, cpStart);
        }
        int prevNorm16=getNorm16(c);
        if(prevNorm16<=minYesNo) {
            return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
        } else {
            return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo
        }
    }

    // Requires algorithmic-NoNo.
    private int mapAlgorithmic(int c, int norm16) {
        return c+norm16-(minMaybeYes-MAX_DELTA-1);
    }

    // Requires minYesNo<norm16<limitNoNo.
1722 // private int getMapping(int norm16) { return /*extraData+*/norm16; } 1723 1724 /** 1725 * @return index into maybeYesCompositions, or -1 1726 */ 1727 private int getCompositionsListForDecompYes(int norm16) { 1728 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 1729 return -1; 1730 } else { 1731 if((norm16-=minMaybeYes)<0) { 1732 // norm16<minMaybeYes: index into extraData which is a substring at 1733 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1734 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1735 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1736 } 1737 return norm16; 1738 } 1739 } 1740 /** 1741 * @return index into maybeYesCompositions 1742 */ 1743 private int getCompositionsListForComposite(int norm16) { 1744 // composite has both mapping & compositions list 1745 int firstUnit=extraData.charAt(norm16); 1746 return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions 1747 1+ // +1 to skip the first unit with the mapping lenth 1748 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1749 } 1750 /** 1751 * @param c code point must have compositions 1752 * @return index into maybeYesCompositions 1753 */ 1754 private int getCompositionsList(int norm16) { 1755 return isDecompYes(norm16) ? 1756 getCompositionsListForDecompYes(norm16) : 1757 getCompositionsListForComposite(norm16); 1758 } 1759 1760 // Decompose a short piece of text which is likely to contain characters that 1761 // fail the quick check loop and/or where the quick check loop's overhead 1762 // is unlikely to be amortized. 1763 // Called by the compose() and makeFCD() implementations. 1764 // Public in Java for collation implementation code. 
    public void decomposeShort(CharSequence s, int src, int limit,
                               ReorderingBuffer buffer) {
        // Decompose every code point in s[src..limit[ into the buffer.
        while(src<limit) {
            int c=Character.codePointAt(s, src);
            src+=Character.charCount(c);
            decompose(c, getNorm16(c), buffer);
        }
    }
    /** Appends the full (canonically ordered) decomposition of c to the buffer. */
    private void decompose(int c, int norm16,
                           ReorderingBuffer buffer) {
        // Only loops for 1:1 algorithmic mappings.
        for(;;) {
            // get the decomposition and the lead and trail cc's
            if(isDecompYes(norm16)) {
                // c does not decompose
                buffer.append(c, getCCFromYesOrMaybe(norm16));
            } else if(isHangul(norm16)) {
                // Hangul syllable: decompose algorithmically
                Hangul.decompose(c, buffer);
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
                norm16=getNorm16(c);
                continue;
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                int length=firstUnit&MAPPING_LENGTH_MASK;
                int leadCC, trailCC;
                trailCC=firstUnit>>8;
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                    // The optional ccc/lccc word precedes the first unit; its high byte is the lccc.
                    leadCC=extraData.charAt(norm16-1)>>8;
                } else {
                    leadCC=0;
                }
                ++norm16;  // skip over the firstUnit
                buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
            }
            return;
        }
    }

    /**
     * Finds the recomposition result for
     * a forward-combining "lead" character,
     * specified with a pointer to its compositions list,
     * and a backward-combining "trail" character.
     *
     * <p>If the lead and trail characters combine, then this function returns
     * the following "compositeAndFwd" value:
     * <pre>
     * Bits 21..1  composite character
     * Bit      0  set if the composite is a forward-combining starter
     * </pre>
     * otherwise it returns -1.
     *
     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
     * encoded as either pairs or triples of 16-bit units.
     * The last entry has the high bit of its first unit set.
     *
     * <p>The list is sorted by ascending trail characters (there are no duplicates).
     * A linear search is used.
     *
     * <p>See normalizer2impl.h for a more detailed description
     * of the compositions list format.
     */
    private static int combine(String compositions, int list, int trail) {
        int key1, firstUnit;
        if(trail<COMP_1_TRAIL_LIMIT) {
            // trail character is 0..33FF
            // result entry may have 2 or 3 units
            key1=(trail<<1);
            // Linear search; entries are sorted by ascending trail character.
            while(key1>(firstUnit=compositions.charAt(list))) {
                list+=2+(firstUnit&COMP_1_TRIPLE);
            }
            if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                if((firstUnit&COMP_1_TRIPLE)!=0) {
                    return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
                } else {
                    return compositions.charAt(list+1);
                }
            }
        } else {
            // trail character is 3400..10FFFF
            // result entry has 3 units
            key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
            int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
            int secondUnit;
            for(;;) {
                if(key1>(firstUnit=compositions.charAt(list))) {
                    list+=2+(firstUnit&COMP_1_TRIPLE);
                } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                    if(key2>(secondUnit=compositions.charAt(list+1))) {
                        if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
                            break;
                        } else {
                            list+=3;
                        }
                    } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
                        // Found: low bits of the second unit plus the third unit form the value.
                        return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }
        }
        return -1;  // the lead and trail characters do not combine
    }
    /**
     * @param list some character's compositions list
     * @param set recursively receives the composites from these compositions
     */
    private void addComposites(int list, UnicodeSet set) {
        int firstUnit, compositeAndFwd;
        do {
            firstUnit=maybeYesCompositions.charAt(list);
            if((firstUnit&COMP_1_TRIPLE)==0) {
                // pair entry: (trail, compositeAndFwd)
                compositeAndFwd=maybeYesCompositions.charAt(list+1);
                list+=2;
            } else {
                // triple entry: the composite spans two units.
                compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
                                maybeYesCompositions.charAt(list+2);
                list+=3;
            }
            int composite=compositeAndFwd>>1;
            if((compositeAndFwd&1)!=0) {
                // The composite itself combines forward; recurse into its compositions.
                addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
            }
            set.add(composite);
        } while((firstUnit&COMP_1_LAST_TUPLE)==0);
    }
    /*
     * Recomposes the buffer text starting at recomposeStartIndex
     * (which is in NFD - decomposed and canonically ordered),
     * and truncates the buffer contents.
     *
     * Note that recomposition never lengthens the text:
     * Any character consists of either one or two code units;
     * a composition may contain at most one more code unit than the original starter,
     * while the combining mark that is removed has at least one code unit.
     */
    private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
                           boolean onlyContiguous) {
        StringBuilder sb=buffer.getStringBuilder();
        int p=recomposeStartIndex;
        if(p==sb.length()) {
            return;
        }

        int starter, pRemove;
        int compositionsList;
        int c, compositeAndFwd;
        int norm16;
        int cc, prevCC;
        boolean starterIsSupplementary;

        // Some of the following variables are not used until we have a forward-combining starter
        // and are only initialized now to avoid compiler warnings.
1924 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1925 starter=-1; 1926 starterIsSupplementary=false; 1927 prevCC=0; 1928 1929 for(;;) { 1930 c=sb.codePointAt(p); 1931 p+=Character.charCount(c); 1932 norm16=getNorm16(c); 1933 cc=getCCFromYesOrMaybe(norm16); 1934 if( // this character combines backward and 1935 isMaybe(norm16) && 1936 // we have seen a starter that combines forward and 1937 compositionsList>=0 && 1938 // the backward-combining character is not blocked 1939 (prevCC<cc || prevCC==0) 1940 ) { 1941 if(isJamoVT(norm16)) { 1942 // c is a Jamo V/T, see if we can compose it with the previous character. 1943 if(c<Hangul.JAMO_T_BASE) { 1944 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1945 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1946 if(prev<Hangul.JAMO_L_COUNT) { 1947 pRemove=p-1; 1948 char syllable=(char) 1949 (Hangul.HANGUL_BASE+ 1950 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1951 Hangul.JAMO_T_COUNT); 1952 char t; 1953 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1954 ++p; 1955 syllable+=t; // The next character was a Jamo T. 1956 } 1957 sb.setCharAt(starter, syllable); 1958 // remove the Jamo V/T 1959 sb.delete(pRemove, p); 1960 p=pRemove; 1961 } 1962 } 1963 /* 1964 * No "else" for Jamo T: 1965 * Since the input is in NFD, there are no Hangul LV syllables that 1966 * a Jamo T could combine with. 1967 * All Jamo Ts are combined above when handling Jamo Vs. 1968 */ 1969 if(p==sb.length()) { 1970 break; 1971 } 1972 compositionsList=-1; 1973 continue; 1974 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1975 // The starter and the combining mark (c) do combine. 1976 int composite=compositeAndFwd>>1; 1977 1978 // Remove the combining mark. 
1979 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1980 sb.delete(pRemove, p); 1981 p=pRemove; 1982 // Replace the starter with the composite. 1983 if(starterIsSupplementary) { 1984 if(composite>0xffff) { 1985 // both are supplementary 1986 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1987 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1988 } else { 1989 sb.setCharAt(starter, (char)c); 1990 sb.deleteCharAt(starter+1); 1991 // The composite is shorter than the starter, 1992 // move the intermediate characters forward one. 1993 starterIsSupplementary=false; 1994 --p; 1995 } 1996 } else if(composite>0xffff) { 1997 // The composite is longer than the starter, 1998 // move the intermediate characters back one. 1999 starterIsSupplementary=true; 2000 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 2001 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 2002 ++p; 2003 } else { 2004 // both are on the BMP 2005 sb.setCharAt(starter, (char)composite); 2006 } 2007 2008 // Keep prevCC because we removed the combining mark. 2009 2010 if(p==sb.length()) { 2011 break; 2012 } 2013 // Is the composite a starter that combines forward? 2014 if((compositeAndFwd&1)!=0) { 2015 compositionsList= 2016 getCompositionsListForComposite(getNorm16(composite)); 2017 } else { 2018 compositionsList=-1; 2019 } 2020 2021 // We combined; continue with looking for compositions. 2022 continue; 2023 } 2024 } 2025 2026 // no combination this time 2027 prevCC=cc; 2028 if(p==sb.length()) { 2029 break; 2030 } 2031 2032 // If c did not combine, then check if it is a starter. 2033 if(cc==0) { 2034 // Found a new starter. 2035 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 2036 // It may combine with something, prepare for it. 
2037 if(c<=0xffff) { 2038 starterIsSupplementary=false; 2039 starter=p-1; 2040 } else { 2041 starterIsSupplementary=true; 2042 starter=p-2; 2043 } 2044 } 2045 } else if(onlyContiguous) { 2046 // FCC: no discontiguous compositions; any intervening character blocks. 2047 compositionsList=-1; 2048 } 2049 } 2050 buffer.flush(); 2051 } 2052 2053 public int composePair(int a, int b) { 2054 int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 2055 int list; 2056 if(isInert(norm16)) { 2057 return -1; 2058 } else if(norm16<minYesNoMappingsOnly) { 2059 if(isJamoL(norm16)) { 2060 b-=Hangul.JAMO_V_BASE; 2061 if(0<=b && b<Hangul.JAMO_V_COUNT) { 2062 return 2063 (Hangul.HANGUL_BASE+ 2064 ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 2065 Hangul.JAMO_T_COUNT); 2066 } else { 2067 return -1; 2068 } 2069 } else if(isHangul(norm16)) { 2070 b-=Hangul.JAMO_T_BASE; 2071 if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 2072 return a+b; 2073 } else { 2074 return -1; 2075 } 2076 } else { 2077 // 'a' has a compositions list in extraData 2078 list=norm16; 2079 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 2080 list+= // mapping pointer 2081 1+ // +1 to skip the first unit with the mapping lenth 2082 (extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 2083 } 2084 // Turn the offset-into-extraData into an offset-into-maybeYesCompositions. 2085 list+=MIN_NORMAL_MAYBE_YES-minMaybeYes; 2086 } 2087 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 2088 return -1; 2089 } else { 2090 list=norm16-minMaybeYes; // offset into maybeYesCompositions 2091 } 2092 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 2093 return -1; 2094 } 2095 return combine(maybeYesCompositions, list, b)>>1; 2096 } 2097 2098 /** 2099 * Does c have a composition boundary before it? 
     * True if its decomposition begins with a character that has
     * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
     * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
     * (isCompYesAndZeroCC()) so we need not decompose.
     */
    private boolean hasCompBoundaryBefore(int c, int norm16) {
        for(;;) {
            if(isCompYesAndZeroCC(norm16)) {
                return true;
            } else if(isMaybeOrNonZeroCC(norm16)) {
                return false;
            } else if(isDecompNoAlgorithmic(norm16)) {
                // Follow the algorithmic one-way mapping and re-test the target.
                c=mapAlgorithmic(c, norm16);
                norm16=getNorm16(c);
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                    // Empty mapping (c maps to nothing): no boundary.
                    return false;
                }
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
                    return false;  // non-zero leadCC
                }
                // Test the first code point of the decomposition mapping.
                return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
            }
        }
    }
    // Returns the index in s of the closest composition boundary at or before p.
    private int findPreviousCompBoundary(CharSequence s, int p) {
        while(p>0) {
            int c=Character.codePointBefore(s, p);
            p-=Character.charCount(c);
            if(hasCompBoundaryBefore(c)) {
                break;
            }
            // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
            // but that's probably not worth the extra cost.
        }
        return p;
    }
    // Returns the index in s of the closest composition boundary at or after p (up to limit).
    private int findNextCompBoundary(CharSequence s, int p, int limit) {
        while(p<limit) {
            int c=Character.codePointAt(s, p);
            int norm16=normTrie.get(c);
            if(hasCompBoundaryBefore(c, norm16)) {
                break;
            }
            p+=Character.charCount(c);
        }
        return p;
    }

    // Returns the index in s of the closest FCD boundary at or before p.
    // A boundary is before any code point with lead ccc==0 (FCD16<=0xff).
    private int findPreviousFCDBoundary(CharSequence s, int p) {
        while(p>0) {
            int c=Character.codePointBefore(s, p);
            p-=Character.charCount(c);
            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
                break;
            }
        }
        return p;
    }
    // Returns the index in s of the closest FCD boundary at or after p (up to limit).
    private int findNextFCDBoundary(CharSequence s, int p, int limit) {
        while(p<limit) {
            int c=Character.codePointAt(s, p);
            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
                break;
            }
            p+=Character.charCount(c);
        }
        return p;
    }

    /**
     * Records that the decomposition of {@code origin} starts with {@code decompLead},
     * for canonical-iterator data. A single origin is stored inline in the trie value;
     * a second origin promotes the value to an index into canonStartSets.
     */
    private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
        int canonValue=newData.get(decompLead);
        if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
            // origin is the first character whose decomposition starts with
            // the character for which we are setting the value.
            newData.set(decompLead, canonValue|origin);
        } else {
            // origin is not the first character, or it is U+0000.
            UnicodeSet set;
            if((canonValue&CANON_HAS_SET)==0) {
                // Promote the inline value to a set, moving the first origin into it.
                int firstOrigin=canonValue&CANON_VALUE_MASK;
                canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
                newData.set(decompLead, canonValue);
                canonStartSets.add(set=new UnicodeSet());
                if(firstOrigin!=0) {
                    set.add(firstOrigin);
                }
            } else {
                set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
            }
            set.add(origin);
        }
    }

    // Kept for data-file round-tripping; not otherwise read.
    @SuppressWarnings("unused")
    private VersionInfo dataVersion;

    // Code point thresholds for quick check codes.
2200 private int minDecompNoCP; 2201 private int minCompNoMaybeCP; 2202 2203 // Norm16 value thresholds for quick check combinations and types of extra data. 2204 private int minYesNo; 2205 private int minYesNoMappingsOnly; 2206 private int minNoNo; 2207 private int limitNoNo; 2208 private int minMaybeYes; 2209 2210 private Trie2_16 normTrie; 2211 private String maybeYesCompositions; 2212 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2213 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2214 private int[] tccc180; // [0x180] tccc values for U+0000..U+017F 2215 2216 private Trie2_32 canonIterData; 2217 private ArrayList<UnicodeSet> canonStartSets; 2218 2219 // bits in canonIterData 2220 private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; 2221 private static final int CANON_HAS_COMPOSITIONS = 0x40000000; 2222 private static final int CANON_HAS_SET = 0x200000; 2223 private static final int CANON_VALUE_MASK = 0x1fffff; 2224} 2225