1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 2009-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10package com.ibm.icu.impl; 11 12import java.io.IOException; 13import java.nio.ByteBuffer; 14import java.util.ArrayList; 15import java.util.Iterator; 16 17import com.ibm.icu.text.UTF16; 18import com.ibm.icu.text.UnicodeSet; 19import com.ibm.icu.util.ICUUncheckedIOException; 20import com.ibm.icu.util.VersionInfo; 21 22public final class Normalizer2Impl { 23 public static final class Hangul { 24 /* Korean Hangul and Jamo constants */ 25 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 26 public static final int JAMO_L_END=0x1112; 27 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 28 public static final int JAMO_V_END=0x1175; 29 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 30 public static final int JAMO_T_END=0x11c2; 31 32 public static final int HANGUL_BASE=0xac00; 33 public static final int HANGUL_END=0xd7a3; 34 35 public static final int JAMO_L_COUNT=19; 36 public static final int JAMO_V_COUNT=21; 37 public static final int JAMO_T_COUNT=28; 38 39 public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; 40 public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; 41 42 public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; 43 44 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 45 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 46 47 public static boolean isHangul(int c) { 48 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 49 } 50 public static boolean isHangulWithoutJamoT(char c) { 51 c-=HANGUL_BASE; 52 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 53 } 54 public static boolean isJamoL(int c) { 55 return JAMO_L_BASE<=c && c<JAMO_L_LIMIT; 56 } 57 public static boolean isJamoV(int c) { 58 return JAMO_V_BASE<=c && c<JAMO_V_LIMIT; 59 } 60 61 /** 62 * Decomposes c, which must be a Hangul syllable, into buffer 63 * and returns the length of the decomposition (2 or 3). 64 */ 65 public static int decompose(int c, Appendable buffer) { 66 try { 67 c-=HANGUL_BASE; 68 int c2=c%JAMO_T_COUNT; 69 c/=JAMO_T_COUNT; 70 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 71 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 72 if(c2==0) { 73 return 2; 74 } else { 75 buffer.append((char)(JAMO_T_BASE+c2)); 76 return 3; 77 } 78 } catch(IOException e) { 79 // Will not occur because we do not write to I/O. 80 throw new ICUUncheckedIOException(e); 81 } 82 } 83 84 /** 85 * Decomposes c, which must be a Hangul syllable, into buffer. 86 * This is the raw, not recursive, decomposition. Its length is always 2. 87 */ 88 public static void getRawDecomposition(int c, Appendable buffer) { 89 try { 90 int orig=c; 91 c-=HANGUL_BASE; 92 int c2=c%JAMO_T_COUNT; 93 if(c2==0) { 94 c/=JAMO_T_COUNT; 95 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 96 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 97 } else { 98 buffer.append((char)(orig-c2)); // LV syllable 99 buffer.append((char)(JAMO_T_BASE+c2)); 100 } 101 } catch(IOException e) { 102 // Will not occur because we do not write to I/O. 103 throw new ICUUncheckedIOException(e); 104 } 105 } 106 } 107 108 /** 109 * Writable buffer that takes care of canonical ordering. 110 * Its Appendable methods behave like the C++ implementation's 111 * appendZeroCC() methods. 112 * <p> 113 * If dest is a StringBuilder, then the buffer writes directly to it. 114 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 115 * until no further changes are necessary and whole segments are appended. 116 * append() methods that take combining-class values always write to the StringBuilder. 117 * Other append() methods flush and append to the Appendable. 118 */ 119 public static final class ReorderingBuffer implements Appendable { 120 public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { 121 impl=ni; 122 app=dest; 123 if(app instanceof StringBuilder) { 124 appIsStringBuilder=true; 125 str=(StringBuilder)dest; 126 // In Java, the constructor subsumes public void init(int destCapacity) { 127 str.ensureCapacity(destCapacity); 128 reorderStart=0; 129 if(str.length()==0) { 130 lastCC=0; 131 } else { 132 setIterator(); 133 lastCC=previousCC(); 134 // Set reorderStart after the last code point with cc<=1 if there is one. 135 if(lastCC>1) { 136 while(previousCC()>1) {} 137 } 138 reorderStart=codePointLimit; 139 } 140 } else { 141 appIsStringBuilder=false; 142 str=new StringBuilder(); 143 reorderStart=0; 144 lastCC=0; 145 } 146 } 147 148 public boolean isEmpty() { return str.length()==0; } 149 public int length() { return str.length(); } 150 public int getLastCC() { return lastCC; } 151 152 public StringBuilder getStringBuilder() { return str; } 153 154 public boolean equals(CharSequence s, int start, int limit) { 155 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 156 } 157 158 // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 159 public void setLastChar(char c) { 160 str.setCharAt(str.length()-1, c); 161 } 162 163 public void append(int c, int cc) { 164 if(lastCC<=cc || cc==0) { 165 str.appendCodePoint(c); 166 lastCC=cc; 167 if(cc<=1) { 168 reorderStart=str.length(); 169 } 170 } else { 171 insert(c, cc); 172 } 173 } 174 // s must be in NFD, otherwise change the implementation. 175 public void append(CharSequence s, int start, int limit, 176 int leadCC, int trailCC) { 177 if(start==limit) { 178 return; 179 } 180 if(lastCC<=leadCC || leadCC==0) { 181 if(trailCC<=1) { 182 reorderStart=str.length()+(limit-start); 183 } else if(leadCC<=1) { 184 reorderStart=str.length()+1; // Ok if not a code point boundary. 185 } 186 str.append(s, start, limit); 187 lastCC=trailCC; 188 } else { 189 int c=Character.codePointAt(s, start); 190 start+=Character.charCount(c); 191 insert(c, leadCC); // insert first code point 192 while(start<limit) { 193 c=Character.codePointAt(s, start); 194 start+=Character.charCount(c); 195 if(start<limit) { 196 // s must be in NFD, otherwise we need to use getCC(). 197 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 198 } else { 199 leadCC=trailCC; 200 } 201 append(c, leadCC); 202 } 203 } 204 } 205 // The following append() methods work like C++ appendZeroCC(). 206 // They assume that the cc or trailCC of their input is 0. 207 // Most of them implement Appendable interface methods. 208 // @Override when we switch to Java 6 209 @Override 210 public ReorderingBuffer append(char c) { 211 str.append(c); 212 lastCC=0; 213 reorderStart=str.length(); 214 return this; 215 } 216 public void appendZeroCC(int c) { 217 str.appendCodePoint(c); 218 lastCC=0; 219 reorderStart=str.length(); 220 } 221 // @Override when we switch to Java 6 222 @Override 223 public ReorderingBuffer append(CharSequence s) { 224 if(s.length()!=0) { 225 str.append(s); 226 lastCC=0; 227 reorderStart=str.length(); 228 } 229 return this; 230 } 231 // @Override when we switch to Java 6 232 @Override 233 public ReorderingBuffer append(CharSequence s, int start, int limit) { 234 if(start!=limit) { 235 str.append(s, start, limit); 236 lastCC=0; 237 reorderStart=str.length(); 238 } 239 return this; 240 } 241 /** 242 * Flushes from the intermediate StringBuilder to the Appendable, 243 * if they are different objects. 244 * Used after recomposition. 245 * Must be called at the end when writing to a non-StringBuilder Appendable. 246 */ 247 public void flush() { 248 if(appIsStringBuilder) { 249 reorderStart=str.length(); 250 } else { 251 try { 252 app.append(str); 253 str.setLength(0); 254 reorderStart=0; 255 } catch(IOException e) { 256 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 257 } 258 } 259 lastCC=0; 260 } 261 /** 262 * Flushes from the intermediate StringBuilder to the Appendable, 263 * if they are different objects. 264 * Then appends the new text to the Appendable or StringBuilder. 265 * Normally used after quick check loops find a non-empty sequence. 266 */ 267 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 268 if(appIsStringBuilder) { 269 str.append(s, start, limit); 270 reorderStart=str.length(); 271 } else { 272 try { 273 app.append(str).append(s, start, limit); 274 str.setLength(0); 275 reorderStart=0; 276 } catch(IOException e) { 277 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 278 } 279 } 280 lastCC=0; 281 return this; 282 } 283 public void remove() { 284 str.setLength(0); 285 lastCC=0; 286 reorderStart=0; 287 } 288 public void removeSuffix(int suffixLength) { 289 int oldLength=str.length(); 290 str.delete(oldLength-suffixLength, oldLength); 291 lastCC=0; 292 reorderStart=str.length(); 293 } 294 295 /* 296 * TODO: Revisit whether it makes sense to track reorderStart. 297 * It is set to after the last known character with cc<=1, 298 * which stops previousCC() before it reads that character and looks up its cc. 299 * previousCC() is normally only called from insert(). 300 * In other words, reorderStart speeds up the insertion of a combining mark 301 * into a multi-combining mark sequence where it does not belong at the end. 302 * This might not be worth the trouble. 303 * On the other hand, it's not a huge amount of trouble. 304 * 305 * We probably need it for UNORM_SIMPLE_APPEND. 306 */ 307 308 // Inserts c somewhere before the last character. 309 // Requires 0<cc<lastCC which implies reorderStart<limit. 310 private void insert(int c, int cc) { 311 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 312 // insert c at codePointLimit, after the character with prevCC<=cc 313 if(c<=0xffff) { 314 str.insert(codePointLimit, (char)c); 315 if(cc<=1) { 316 reorderStart=codePointLimit+1; 317 } 318 } else { 319 str.insert(codePointLimit, Character.toChars(c)); 320 if(cc<=1) { 321 reorderStart=codePointLimit+2; 322 } 323 } 324 } 325 326 private final Normalizer2Impl impl; 327 private final Appendable app; 328 private final StringBuilder str; 329 private final boolean appIsStringBuilder; 330 private int reorderStart; 331 private int lastCC; 332 333 // private backward iterator 334 private void setIterator() { codePointStart=str.length(); } 335 private void skipPrevious() { // Requires 0<codePointStart. 336 codePointLimit=codePointStart; 337 codePointStart=str.offsetByCodePoints(codePointStart, -1); 338 } 339 private int previousCC() { // Returns 0 if there is no previous character. 340 codePointLimit=codePointStart; 341 if(reorderStart>=codePointStart) { 342 return 0; 343 } 344 int c=str.codePointBefore(codePointStart); 345 codePointStart-=Character.charCount(c); 346 if(c<MIN_CCC_LCCC_CP) { 347 return 0; 348 } 349 return getCCFromYesOrMaybe(impl.getNorm16(c)); 350 } 351 352 private int codePointStart, codePointLimit; 353 } 354 355 // TODO: Propose as public API on the UTF16 class. 356 // TODO: Propose widening UTF16 methods that take char to take int. 357 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 358 public static final class UTF16Plus { 359 /** 360 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 361 * is it a lead surrogate? 362 * @param c code unit or code point 363 * @return true or false 364 */ 365 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 366 /** 367 * Compares two CharSequence objects for binary equality. 368 * @param s1 first sequence 369 * @param s2 second sequence 370 * @return true if s1 contains the same text as s2 371 */ 372 public static boolean equal(CharSequence s1, CharSequence s2) { 373 if(s1==s2) { 374 return true; 375 } 376 int length=s1.length(); 377 if(length!=s2.length()) { 378 return false; 379 } 380 for(int i=0; i<length; ++i) { 381 if(s1.charAt(i)!=s2.charAt(i)) { 382 return false; 383 } 384 } 385 return true; 386 } 387 /** 388 * Compares two CharSequence subsequences for binary equality. 389 * @param s1 first sequence 390 * @param start1 start offset in first sequence 391 * @param limit1 limit offset in first sequence 392 * @param s2 second sequence 393 * @param start2 start offset in second sequence 394 * @param limit2 limit offset in second sequence 395 * @return true if s1.subSequence(start1, limit1) contains the same text 396 * as s2.subSequence(start2, limit2) 397 */ 398 public static boolean equal(CharSequence s1, int start1, int limit1, 399 CharSequence s2, int start2, int limit2) { 400 if((limit1-start1)!=(limit2-start2)) { 401 return false; 402 } 403 if(s1==s2 && start1==start2) { 404 return true; 405 } 406 while(start1<limit1) { 407 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 408 return false; 409 } 410 } 411 return true; 412 } 413 } 414 415 public Normalizer2Impl() {} 416 417 private static final class IsAcceptable implements ICUBinary.Authenticate { 418 // @Override when we switch to Java 6 419 @Override 420 public boolean isDataVersionAcceptable(byte version[]) { 421 return version[0]==2; 422 } 423 } 424 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 425 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 426 427 public Normalizer2Impl load(ByteBuffer bytes) { 428 try { 429 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 430 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 431 if(indexesLength<=IX_MIN_MAYBE_YES) { 432 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes"); 433 } 434 int[] inIndexes=new int[indexesLength]; 435 inIndexes[0]=indexesLength*4; 436 for(int i=1; i<indexesLength; ++i) { 437 inIndexes[i]=bytes.getInt(); 438 } 439 440 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 441 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 442 443 minYesNo=inIndexes[IX_MIN_YES_NO]; 444 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 445 minNoNo=inIndexes[IX_MIN_NO_NO]; 446 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 447 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 448 449 // Read the normTrie. 450 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 451 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 452 normTrie=Trie2_16.createFromSerialized(bytes); 453 int trieLength=normTrie.getSerializedLength(); 454 if(trieLength>(nextOffset-offset)) { 455 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); 456 } 457 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 458 459 // Read the composition and mapping data. 460 offset=nextOffset; 461 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 462 int numChars=(nextOffset-offset)/2; 463 if(numChars!=0) { 464 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 465 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); 466 } 467 468 // smallFCD: new in formatVersion 2 469 offset=nextOffset; 470 smallFCD=new byte[0x100]; 471 bytes.get(smallFCD); 472 473 // Build tccc180[]. 474 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 475 tccc180=new int[0x180]; 476 int bits=0; 477 for(int c=0; c<0x180; bits>>=1) { 478 if((c&0xff)==0) { 479 bits=smallFCD[c>>8]; // one byte per 0x100 code points 480 } 481 if((bits&1)!=0) { 482 for(int i=0; i<0x20; ++i, ++c) { 483 tccc180[c]=getFCD16FromNormData(c)&0xff; 484 } 485 } else { 486 c+=0x20; 487 } 488 } 489 490 return this; 491 } catch(IOException e) { 492 throw new ICUUncheckedIOException(e); 493 } 494 } 495 public Normalizer2Impl load(String name) { 496 return load(ICUBinary.getRequiredData(name)); 497 } 498 499 private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) { 500 if(isAlgorithmicNoNo(norm16)) { 501 // Range of code points with same-norm16-value algorithmic decompositions. 502 // They might have different non-zero FCD16 values. 503 do { 504 int fcd16=getFCD16(start); 505 if(fcd16>0xff) { set.add(start); } 506 } while(++start<=end); 507 } else { 508 int fcd16=getFCD16(start); 509 if(fcd16>0xff) { set.add(start, end); } 510 } 511 } 512 513 private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) { 514 /* add the start code point to the USet */ 515 set.add(start); 516 if(start!=end && isAlgorithmicNoNo(value)) { 517 // Range of code points with same-norm16-value algorithmic decompositions. 518 // They might have different non-zero FCD16 values. 519 int prevFCD16=getFCD16(start); 520 while(++start<=end) { 521 int fcd16=getFCD16(start); 522 if(fcd16!=prevFCD16) { 523 set.add(start); 524 prevFCD16=fcd16; 525 } 526 } 527 } 528 } 529 530 public void addLcccChars(UnicodeSet set) { 531 /* add the start code point of each same-value range of each trie */ 532 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 533 Trie2.Range range; 534 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 535 enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set); 536 } 537 } 538 539 public void addPropertyStarts(UnicodeSet set) { 540 /* add the start code point of each same-value range of each trie */ 541 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 542 Trie2.Range range; 543 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 544 enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set); 545 } 546 547 /* add Hangul LV syllables and LV+1 because of skippables */ 548 for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) { 549 set.add(c); 550 set.add(c+1); 551 } 552 set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 553 } 554 555 public void addCanonIterPropertyStarts(UnicodeSet set) { 556 /* add the start code point of each same-value range of the canonical iterator data trie */ 557 ensureCanonIterData(); 558 // currently only used for the SEGMENT_STARTER property 559 Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper); 560 Trie2.Range range; 561 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 562 /* add the start code point to the USet */ 563 set.add(range.startCodePoint); 564 } 565 } 566 private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() { 567 @Override 568 public int map(int in) { 569 return in&CANON_NOT_SEGMENT_STARTER; 570 } 571 }; 572 573 // low-level properties ------------------------------------------------ *** 574 575 public Trie2_16 getNormTrie() { return normTrie; } 576 577 // Note: Normalizer2Impl.java r30983 (2011-nov-27) 578 // still had getFCDTrie() which built and cached an FCD trie. 579 // That provided faster access to FCD data than getFCD16FromNormData() 580 // but required synchronization and consumed some 10kB of heap memory 581 // in any process that uses FCD (e.g., via collation). 582 // tccc180[] and smallFCD[] are intended to help with any loss of performance, 583 // at least for Latin & CJK. 584 585 /** 586 * Builds the canonical-iterator data for this instance. 587 * This is required before any of {@link #isCanonSegmentStarter(int)} or 588 * {@link #getCanonStartSet(int, UnicodeSet)} are called, 589 * or else they crash. 590 * @return this 591 */ 592 public synchronized Normalizer2Impl ensureCanonIterData() { 593 if(canonIterData==null) { 594 Trie2Writable newData=new Trie2Writable(0, 0); 595 canonStartSets=new ArrayList<UnicodeSet>(); 596 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 597 Trie2.Range range; 598 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 599 final int norm16=range.value; 600 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 601 // Inert, or 2-way mapping (including Hangul syllable). 602 // We do not write a canonStartSet for any yesNo character. 603 // Composites from 2-way mappings are added at runtime from the 604 // starter's compositions list, and the other characters in 605 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 606 // "maybe" characters. 607 continue; 608 } 609 for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) { 610 final int oldValue=newData.get(c); 611 int newValue=oldValue; 612 if(norm16>=minMaybeYes) { 613 // not a segment starter if it occurs in a decomposition or has cc!=0 614 newValue|=CANON_NOT_SEGMENT_STARTER; 615 if(norm16<MIN_NORMAL_MAYBE_YES) { 616 newValue|=CANON_HAS_COMPOSITIONS; 617 } 618 } else if(norm16<minYesNo) { 619 newValue|=CANON_HAS_COMPOSITIONS; 620 } else { 621 // c has a one-way decomposition 622 int c2=c; 623 int norm16_2=norm16; 624 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 625 c2=this.mapAlgorithmic(c2, norm16_2); 626 norm16_2=getNorm16(c2); 627 } 628 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 629 // c decomposes, get everything from the variable-length extra data 630 int firstUnit=extraData.charAt(norm16_2); 631 int length=firstUnit&MAPPING_LENGTH_MASK; 632 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 633 if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) { 634 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 635 } 636 } 637 // Skip empty mappings (no characters in the decomposition). 638 if(length!=0) { 639 ++norm16_2; // skip over the firstUnit 640 // add c to first code point's start set 641 int limit=norm16_2+length; 642 c2=extraData.codePointAt(norm16_2); 643 addToStartSet(newData, c, c2); 644 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 645 // one-way mapping. A 2-way mapping is possible here after 646 // intermediate algorithmic mapping. 647 if(norm16_2>=minNoNo) { 648 while((norm16_2+=Character.charCount(c2))<limit) { 649 c2=extraData.codePointAt(norm16_2); 650 int c2Value=newData.get(c2); 651 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 652 newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER); 653 } 654 } 655 } 656 } 657 } else { 658 // c decomposed to c2 algorithmically; c has cc==0 659 addToStartSet(newData, c, c2); 660 } 661 } 662 if(newValue!=oldValue) { 663 newData.set(c, newValue); 664 } 665 } 666 } 667 canonIterData=newData.toTrie2_32(); 668 } 669 return this; 670 } 671 672 public int getNorm16(int c) { return normTrie.get(c); } 673 674 public int getCompQuickCheck(int norm16) { 675 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 676 return 1; // yes 677 } else if(minMaybeYes<=norm16) { 678 return 2; // maybe 679 } else { 680 return 0; // no 681 } 682 } 683 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 684 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 685 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 686 687 public int getCC(int norm16) { 688 if(norm16>=MIN_NORMAL_MAYBE_YES) { 689 return norm16&0xff; 690 } 691 if(norm16<minNoNo || limitNoNo<=norm16) { 692 return 0; 693 } 694 return getCCFromNoNo(norm16); 695 } 696 public static int getCCFromYesOrMaybe(int norm16) { 697 return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; 698 } 699 700 /** 701 * Returns the FCD data for code point c. 702 * @param c A Unicode code point. 703 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 704 */ 705 public int getFCD16(int c) { 706 if(c<0) { 707 return 0; 708 } else if(c<0x180) { 709 return tccc180[c]; 710 } else if(c<=0xffff) { 711 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 712 } 713 return getFCD16FromNormData(c); 714 } 715 /** Returns the FCD data for U+0000<=c<U+0180. */ 716 public int getFCD16FromBelow180(int c) { return tccc180[c]; } 717 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 718 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 719 // 0<=lead<=0xffff 720 byte bits=smallFCD[lead>>8]; 721 if(bits==0) { return false; } 722 return ((bits>>((lead>>5)&7))&1)!=0; 723 } 724 725 /** Gets the FCD value from the regular normalization data. */ 726 public int getFCD16FromNormData(int c) { 727 // Only loops for 1:1 algorithmic mappings. 728 for(;;) { 729 int norm16=getNorm16(c); 730 if(norm16<=minYesNo) { 731 // no decomposition or Hangul syllable, all zeros 732 return 0; 733 } else if(norm16>=MIN_NORMAL_MAYBE_YES) { 734 // combining mark 735 norm16&=0xff; 736 return norm16|(norm16<<8); 737 } else if(norm16>=minMaybeYes) { 738 return 0; 739 } else if(isDecompNoAlgorithmic(norm16)) { 740 c=mapAlgorithmic(c, norm16); 741 } else { 742 // c decomposes, get everything from the variable-length extra data 743 int firstUnit=extraData.charAt(norm16); 744 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 745 // A character that is deleted (maps to an empty string) must 746 // get the worst-case lccc and tccc values because arbitrary 747 // characters on both sides will become adjacent. 748 return 0x1ff; 749 } else { 750 int fcd16=firstUnit>>8; // tccc 751 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 752 fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc 753 } 754 return fcd16; 755 } 756 } 757 } 758 } 759 760 /** 761 * Gets the decomposition for one code point. 762 * @param c code point 763 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 764 */ 765 public String getDecomposition(int c) { 766 int decomp=-1; 767 int norm16; 768 for(;;) { 769 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 770 // c does not decompose 771 } else if(isHangul(norm16)) { 772 // Hangul syllable: decompose algorithmically 773 StringBuilder buffer=new StringBuilder(); 774 Hangul.decompose(c, buffer); 775 return buffer.toString(); 776 } else if(isDecompNoAlgorithmic(norm16)) { 777 decomp=c=mapAlgorithmic(c, norm16); 778 continue; 779 } else { 780 // c decomposes, get everything from the variable-length extra data 781 int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; 782 return extraData.substring(norm16, norm16+length); 783 } 784 if(decomp<0) { 785 return null; 786 } else { 787 return UTF16.valueOf(decomp); 788 } 789 } 790 } 791 792 /** 793 * Gets the raw decomposition for one code point. 794 * @param c code point 795 * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition 796 */ 797 public String getRawDecomposition(int c) { 798 // We do not loop in this method because an algorithmic mapping itself 799 // becomes a final result rather than having to be decomposed recursively. 800 int norm16; 801 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 802 // c does not decompose 803 return null; 804 } else if(isHangul(norm16)) { 805 // Hangul syllable: decompose algorithmically 806 StringBuilder buffer=new StringBuilder(); 807 Hangul.getRawDecomposition(c, buffer); 808 return buffer.toString(); 809 } else if(isDecompNoAlgorithmic(norm16)) { 810 return UTF16.valueOf(mapAlgorithmic(c, norm16)); 811 } else { 812 // c decomposes, get everything from the variable-length extra data 813 int firstUnit=extraData.charAt(norm16); 814 int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 815 if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { 816 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 817 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 818 int rawMapping=norm16-((firstUnit>>7)&1)-1; 819 char rm0=extraData.charAt(rawMapping); 820 if(rm0<=MAPPING_LENGTH_MASK) { 821 return extraData.substring(rawMapping-rm0, rawMapping); 822 } else { 823 // Copy the normal mapping and replace its first two code units with rm0. 824 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); 825 norm16+=1+2; // skip over the firstUnit and the first two mapping code units 826 return buffer.append(extraData, norm16, norm16+mLength-2).toString(); 827 } 828 } else { 829 norm16+=1; // skip over the firstUnit 830 return extraData.substring(norm16, norm16+mLength); 831 } 832 } 833 } 834 835 /** 836 * Returns true if code point c starts a canonical-iterator string segment. 837 * <b>{@link #ensureCanonIterData()} must have been called before this method, 838 * or else this method will crash.</b> 839 * @param c A Unicode code point. 840 * @return true if c starts a canonical-iterator string segment. 841 */ 842 public boolean isCanonSegmentStarter(int c) { 843 return canonIterData.get(c)>=0; 844 } 845 /** 846 * Returns true if there are characters whose decomposition starts with c. 847 * If so, then the set is cleared and then filled with those characters. 848 * <b>{@link #ensureCanonIterData()} must have been called before this method, 849 * or else this method will crash.</b> 850 * @param c A Unicode code point. 851 * @param set A UnicodeSet to receive the characters whose decompositions 852 * start with c, if there are any. 853 * @return true if there are characters whose decomposition starts with c. 854 */ 855 public boolean getCanonStartSet(int c, UnicodeSet set) { 856 int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; 857 if(canonValue==0) { 858 return false; 859 } 860 set.clear(); 861 int value=canonValue&CANON_VALUE_MASK; 862 if((canonValue&CANON_HAS_SET)!=0) { 863 set.addAll(canonStartSets.get(value)); 864 } else if(value!=0) { 865 set.add(value); 866 } 867 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 868 int norm16=getNorm16(c); 869 if(norm16==JAMO_L) { 870 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; 871 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); 872 } else { 873 addComposites(getCompositionsList(norm16), set); 874 } 875 } 876 return true; 877 } 878 879 public static final int MIN_CCC_LCCC_CP=0x300; 880 881 public static final int MIN_YES_YES_WITH_CC=0xff01; 882 public static final int JAMO_VT=0xff00; 883 public static final int MIN_NORMAL_MAYBE_YES=0xfe00; 884 public static final int JAMO_L=1; 885 public static final int MAX_DELTA=0x40; 886 887 // Byte offsets from the start of the data, after the generic header. 888 public static final int IX_NORM_TRIE_OFFSET=0; 889 public static final int IX_EXTRA_DATA_OFFSET=1; 890 public static final int IX_SMALL_FCD_OFFSET=2; 891 public static final int IX_RESERVED3_OFFSET=3; 892 public static final int IX_TOTAL_SIZE=7; 893 894 // Code point thresholds for quick check codes. 895 public static final int IX_MIN_DECOMP_NO_CP=8; 896 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 897 898 // Norm16 value thresholds for quick check combinations and types of extra data. 899 // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 900 public static final int IX_MIN_YES_NO=10; 901 public static final int IX_MIN_NO_NO=11; 902 public static final int IX_LIMIT_NO_NO=12; 903 public static final int IX_MIN_MAYBE_YES=13; 904 905 // Mappings only in [minYesNoMappingsOnly..minNoNo[. 906 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 907 908 public static final int IX_COUNT=16; 909 910 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 911 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 912 public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20; 913 public static final int MAPPING_LENGTH_MASK=0x1f; 914 915 public static final int COMP_1_LAST_TUPLE=0x8000; 916 public static final int COMP_1_TRIPLE=1; 917 public static final int COMP_1_TRAIL_LIMIT=0x3400; 918 public static final int COMP_1_TRAIL_MASK=0x7ffe; 919 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 920 public static final int COMP_2_TRAIL_SHIFT=6; 921 public static final int COMP_2_TRAIL_MASK=0xffc0; 922 923 // higher-level functionality ------------------------------------------ *** 924 925 // NFD without an NFD Normalizer2 instance. 926 public Appendable decompose(CharSequence s, StringBuilder dest) { 927 decompose(s, 0, s.length(), dest, s.length()); 928 return dest; 929 } 930 /** 931 * Decomposes s[src, limit[ and writes the result to dest. 932 * limit can be NULL if src is NUL-terminated. 933 * destLengthEstimate is the initial dest buffer capacity and can be -1. 934 */ 935 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 936 int destLengthEstimate) { 937 if(destLengthEstimate<0) { 938 destLengthEstimate=limit-src; 939 } 940 dest.setLength(0); 941 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 942 decompose(s, src, limit, buffer); 943 } 944 945 // Dual functionality: 946 // buffer!=NULL: normalize 947 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 948 public int decompose(CharSequence s, int src, int limit, 949 ReorderingBuffer buffer) { 950 int minNoCP=minDecompNoCP; 951 952 int prevSrc; 953 int c=0; 954 int norm16=0; 955 956 // only for quick check 957 int prevBoundary=src; 958 int prevCC=0; 959 960 for(;;) { 961 // count code units below the minimum or with irrelevant data for the quick check 962 for(prevSrc=src; src!=limit;) { 963 if( (c=s.charAt(src))<minNoCP || 964 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 965 ) { 966 ++src; 967 } else if(!UTF16.isSurrogate((char)c)) { 968 break; 969 } else { 970 char c2; 971 if(UTF16Plus.isSurrogateLead(c)) { 972 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 973 c=Character.toCodePoint((char)c, c2); 974 } 975 } else /* trail surrogate */ { 976 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 977 --src; 978 c=Character.toCodePoint(c2, (char)c); 979 } 980 } 981 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 982 src+=Character.charCount(c); 983 } else { 984 break; 985 } 986 } 987 } 988 // copy these code units all at once 989 if(src!=prevSrc) { 990 if(buffer!=null) { 991 buffer.flushAndAppendZeroCC(s, prevSrc, src); 992 } else { 993 prevCC=0; 994 prevBoundary=src; 995 } 996 } 997 if(src==limit) { 998 break; 999 } 1000 1001 // Check one above-minimum, relevant code point. 1002 src+=Character.charCount(c); 1003 if(buffer!=null) { 1004 decompose(c, norm16, buffer); 1005 } else { 1006 if(isDecompYes(norm16)) { 1007 int cc=getCCFromYesOrMaybe(norm16); 1008 if(prevCC<=cc || cc==0) { 1009 prevCC=cc; 1010 if(cc<=1) { 1011 prevBoundary=src; 1012 } 1013 continue; 1014 } 1015 } 1016 return prevBoundary; // "no" or cc out of order 1017 } 1018 } 1019 return src; 1020 } 1021 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 1022 int limit=s.length(); 1023 if(limit==0) { 1024 return; 1025 } 1026 if(doDecompose) { 1027 decompose(s, 0, limit, buffer); 1028 return; 1029 } 1030 // Just merge the strings at the boundary. 1031 int c=Character.codePointAt(s, 0); 1032 int src=0; 1033 int firstCC, prevCC, cc; 1034 firstCC=prevCC=cc=getCC(getNorm16(c)); 1035 while(cc!=0) { 1036 prevCC=cc; 1037 src+=Character.charCount(c); 1038 if(src>=limit) { 1039 break; 1040 } 1041 c=Character.codePointAt(s, src); 1042 cc=getCC(getNorm16(c)); 1043 }; 1044 buffer.append(s, 0, src, firstCC, prevCC); 1045 buffer.append(s, src, limit); 1046 } 1047 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 1048 // doCompose: normalize 1049 // !doCompose: isNormalized (buffer must be empty and initialized) 1050 public boolean compose(CharSequence s, int src, int limit, 1051 boolean onlyContiguous, 1052 boolean doCompose, 1053 ReorderingBuffer buffer) { 1054 int minNoMaybeCP=minCompNoMaybeCP; 1055 1056 /* 1057 * prevBoundary points to the last character before the current one 1058 * that has a composition boundary before it with ccc==0 and quick check "yes". 1059 * Keeping track of prevBoundary saves us looking for a composition boundary 1060 * when we find a "no" or "maybe". 1061 * 1062 * When we back out from prevSrc back to prevBoundary, 1063 * then we also remove those same characters (which had been simply copied 1064 * or canonically-order-inserted) from the ReorderingBuffer. 1065 * Therefore, at all times, the [prevBoundary..prevSrc[ source units 1066 * must correspond 1:1 to destination units at the end of the destination buffer. 1067 */ 1068 int prevBoundary=src; 1069 int prevSrc; 1070 int c=0; 1071 int norm16=0; 1072 1073 // only for isNormalized 1074 int prevCC=0; 1075 1076 for(;;) { 1077 // count code units below the minimum or with irrelevant data for the quick check 1078 for(prevSrc=src; src!=limit;) { 1079 if( (c=s.charAt(src))<minNoMaybeCP || 1080 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1081 ) { 1082 ++src; 1083 } else if(!UTF16.isSurrogate((char)c)) { 1084 break; 1085 } else { 1086 char c2; 1087 if(UTF16Plus.isSurrogateLead(c)) { 1088 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1089 c=Character.toCodePoint((char)c, c2); 1090 } 1091 } else /* trail surrogate */ { 1092 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1093 --src; 1094 c=Character.toCodePoint(c2, (char)c); 1095 } 1096 } 1097 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1098 src+=Character.charCount(c); 1099 } else { 1100 break; 1101 } 1102 } 1103 } 1104 // copy these code units all at once 1105 if(src!=prevSrc) { 1106 if(src==limit) { 1107 if(doCompose) { 1108 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1109 } 1110 break; 1111 } 1112 // Set prevBoundary to the last character in the quick check loop. 1113 prevBoundary=src-1; 1114 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 1115 Character.isHighSurrogate(s.charAt(prevBoundary-1)) 1116 ) { 1117 --prevBoundary; 1118 } 1119 if(doCompose) { 1120 // The last "quick check yes" character is excluded from the 1121 // flush-and-append call in case it needs to be modified. 1122 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1123 buffer.append(s, prevBoundary, src); 1124 } else { 1125 prevCC=0; 1126 } 1127 // The start of the current character (c). 1128 prevSrc=src; 1129 } else if(src==limit) { 1130 break; 1131 } 1132 1133 src+=Character.charCount(c); 1134 /* 1135 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1136 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1137 * or has ccc!=0. 1138 * Check for Jamo V/T, then for regular characters. 1139 * c is not a Hangul syllable or Jamo L because those have "yes" properties. 1140 */ 1141 if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 1142 char prev=s.charAt(prevSrc-1); 1143 boolean needToDecompose=false; 1144 if(c<Hangul.JAMO_T_BASE) { 1145 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1146 prev-=Hangul.JAMO_L_BASE; 1147 if(prev<Hangul.JAMO_L_COUNT) { 1148 if(!doCompose) { 1149 return false; 1150 } 1151 char syllable=(char) 1152 (Hangul.HANGUL_BASE+ 1153 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1154 Hangul.JAMO_T_COUNT); 1155 char t; 1156 if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1157 ++src; 1158 syllable+=t; // The next character was a Jamo T. 1159 prevBoundary=src; 1160 buffer.setLastChar(syllable); 1161 continue; 1162 } 1163 // If we see L+V+x where x!=T then we drop to the slow path, 1164 // decompose and recompose. 1165 // This is to deal with NFKC finding normal L and V but a 1166 // compatibility variant of a T. We need to either fully compose that 1167 // combination here (which would complicate the code and may not work 1168 // with strange custom data) or use the slow path -- or else our replacing 1169 // two input characters (L+V) with one output character (LV syllable) 1170 // would violate the invariant that [prevBoundary..prevSrc[ has the same 1171 // length as what we appended to the buffer since prevBoundary. 1172 needToDecompose=true; 1173 } 1174 } else if(Hangul.isHangulWithoutJamoT(prev)) { 1175 // c is a Jamo Trailing consonant, 1176 // compose with previous Hangul LV that does not contain a Jamo T. 1177 if(!doCompose) { 1178 return false; 1179 } 1180 buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); 1181 prevBoundary=src; 1182 continue; 1183 } 1184 if(!needToDecompose) { 1185 // The Jamo V/T did not compose into a Hangul syllable. 1186 if(doCompose) { 1187 buffer.append((char)c); 1188 } else { 1189 prevCC=0; 1190 } 1191 continue; 1192 } 1193 } 1194 /* 1195 * Source buffer pointers: 1196 * 1197 * all done quick check current char not yet 1198 * "yes" but (c) processed 1199 * may combine 1200 * forward 1201 * [-------------[-------------[-------------[-------------[ 1202 * | | | | | 1203 * orig. src prevBoundary prevSrc src limit 1204 * 1205 * 1206 * Destination buffer pointers inside the ReorderingBuffer: 1207 * 1208 * all done might take not filled yet 1209 * characters for 1210 * reordering 1211 * [-------------[-------------[-------------[ 1212 * | | | | 1213 * start reorderStart limit | 1214 * +remainingCap.+ 1215 */ 1216 if(norm16>=MIN_YES_YES_WITH_CC) { 1217 int cc=norm16&0xff; // cc!=0 1218 if( onlyContiguous && // FCC 1219 (doCompose ? buffer.getLastCC() : prevCC)==0 && 1220 prevBoundary<prevSrc && 1221 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 1222 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1223 // passed the quick check "yes && ccc==0" test. 1224 // Check whether the last character was a "yesYes" or a "yesNo". 1225 // If a "yesNo", then we get its trailing ccc from its 1226 // mapping and check for canonical order. 1227 // All other cases are ok. 1228 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 1229 ) { 1230 // Fails FCD test, need to decompose and contiguously recompose. 1231 if(!doCompose) { 1232 return false; 1233 } 1234 } else if(doCompose) { 1235 buffer.append(c, cc); 1236 continue; 1237 } else if(prevCC<=cc) { 1238 prevCC=cc; 1239 continue; 1240 } else { 1241 return false; 1242 } 1243 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 1244 return false; 1245 } 1246 1247 /* 1248 * Find appropriate boundaries around this character, 1249 * decompose the source text from between the boundaries, 1250 * and recompose it. 1251 * 1252 * We may need to remove the last few characters from the ReorderingBuffer 1253 * to account for source text that was copied or appended 1254 * but needs to take part in the recomposition. 1255 */ 1256 1257 /* 1258 * Find the last composition boundary in [prevBoundary..src[. 1259 * It is either the decomposition of the current character (at prevSrc), 1260 * or prevBoundary. 1261 */ 1262 if(hasCompBoundaryBefore(c, norm16)) { 1263 prevBoundary=prevSrc; 1264 } else if(doCompose) { 1265 buffer.removeSuffix(prevSrc-prevBoundary); 1266 } 1267 1268 // Find the next composition boundary in [src..limit[ - 1269 // modifies src to point to the next starter. 1270 src=findNextCompBoundary(s, src, limit); 1271 1272 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 1273 int recomposeStartIndex=buffer.length(); 1274 decomposeShort(s, prevBoundary, src, buffer); 1275 recompose(buffer, recomposeStartIndex, onlyContiguous); 1276 if(!doCompose) { 1277 if(!buffer.equals(s, prevBoundary, src)) { 1278 return false; 1279 } 1280 buffer.remove(); 1281 prevCC=0; 1282 } 1283 1284 // Move to the next starter. We never need to look back before this point again. 1285 prevBoundary=src; 1286 } 1287 return true; 1288 } 1289 /** 1290 * Very similar to compose(): Make the same changes in both places if relevant. 1291 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 1292 * !doSpan: quickCheck 1293 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 1294 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 1295 * then the quick check result is "no" 1296 */ 1297 public int composeQuickCheck(CharSequence s, int src, int limit, 1298 boolean onlyContiguous, boolean doSpan) { 1299 int qcResult=0; 1300 int minNoMaybeCP=minCompNoMaybeCP; 1301 1302 /* 1303 * prevBoundary points to the last character before the current one 1304 * that has a composition boundary before it with ccc==0 and quick check "yes". 1305 */ 1306 int prevBoundary=src; 1307 int prevSrc; 1308 int c=0; 1309 int norm16=0; 1310 int prevCC=0; 1311 1312 for(;;) { 1313 // count code units below the minimum or with irrelevant data for the quick check 1314 for(prevSrc=src;;) { 1315 if(src==limit) { 1316 return (src<<1)|qcResult; // "yes" or "maybe" 1317 } 1318 if( (c=s.charAt(src))<minNoMaybeCP || 1319 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1320 ) { 1321 ++src; 1322 } else if(!UTF16.isSurrogate((char)c)) { 1323 break; 1324 } else { 1325 char c2; 1326 if(UTF16Plus.isSurrogateLead(c)) { 1327 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1328 c=Character.toCodePoint((char)c, c2); 1329 } 1330 } else /* trail surrogate */ { 1331 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1332 --src; 1333 c=Character.toCodePoint(c2, (char)c); 1334 } 1335 } 1336 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1337 src+=Character.charCount(c); 1338 } else { 1339 break; 1340 } 1341 } 1342 } 1343 if(src!=prevSrc) { 1344 // Set prevBoundary to the last character in the quick check loop. 1345 prevBoundary=src-1; 1346 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 1347 Character.isHighSurrogate(s.charAt(prevBoundary-1)) 1348 ) { 1349 --prevBoundary; 1350 } 1351 prevCC=0; 1352 // The start of the current character (c). 1353 prevSrc=src; 1354 } 1355 1356 src+=Character.charCount(c); 1357 /* 1358 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1359 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1360 * or has ccc!=0. 1361 */ 1362 if(isMaybeOrNonZeroCC(norm16)) { 1363 int cc=getCCFromYesOrMaybe(norm16); 1364 if( onlyContiguous && // FCC 1365 cc!=0 && 1366 prevCC==0 && 1367 prevBoundary<prevSrc && 1368 // prevCC==0 && prevBoundary<prevSrc tell us that 1369 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1370 // passed the quick check "yes && ccc==0" test. 1371 // Check whether the last character was a "yesYes" or a "yesNo". 1372 // If a "yesNo", then we get its trailing ccc from its 1373 // mapping and check for canonical order. 1374 // All other cases are ok. 1375 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 1376 ) { 1377 // Fails FCD test. 1378 } else if(prevCC<=cc || cc==0) { 1379 prevCC=cc; 1380 if(norm16<MIN_YES_YES_WITH_CC) { 1381 if(!doSpan) { 1382 qcResult=1; 1383 } else { 1384 return prevBoundary<<1; // spanYes does not care to know it's "maybe" 1385 } 1386 } 1387 continue; 1388 } 1389 } 1390 return prevBoundary<<1; // "no" 1391 } 1392 } 1393 public void composeAndAppend(CharSequence s, 1394 boolean doCompose, 1395 boolean onlyContiguous, 1396 ReorderingBuffer buffer) { 1397 int src=0, limit=s.length(); 1398 if(!buffer.isEmpty()) { 1399 int firstStarterInSrc=findNextCompBoundary(s, 0, limit); 1400 if(0!=firstStarterInSrc) { 1401 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1402 buffer.length()); 1403 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1404 firstStarterInSrc+16); 1405 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1406 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1407 middle.append(s, 0, firstStarterInSrc); 1408 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1409 src=firstStarterInSrc; 1410 } 1411 } 1412 if(doCompose) { 1413 compose(s, src, limit, onlyContiguous, true, buffer); 1414 } else { 1415 buffer.append(s, src, limit); 1416 } 1417 } 1418 // Dual functionality: 1419 // buffer!=NULL: normalize 1420 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1421 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1422 // Note: In this function we use buffer->appendZeroCC() because we track 1423 // the lead and trail combining classes here, rather than leaving it to 1424 // the ReorderingBuffer. 1425 // The exception is the call to decomposeShort() which uses the buffer 1426 // in the normal way. 1427 1428 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1429 // Similar to the prevBoundary in the compose() implementation. 1430 int prevBoundary=src; 1431 int prevSrc; 1432 int c=0; 1433 int prevFCD16=0; 1434 int fcd16=0; 1435 1436 for(;;) { 1437 // count code units with lccc==0 1438 for(prevSrc=src; src!=limit;) { 1439 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { 1440 prevFCD16=~c; 1441 ++src; 1442 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1443 prevFCD16=0; 1444 ++src; 1445 } else { 1446 if(UTF16.isSurrogate((char)c)) { 1447 char c2; 1448 if(UTF16Plus.isSurrogateLead(c)) { 1449 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1450 c=Character.toCodePoint((char)c, c2); 1451 } 1452 } else /* trail surrogate */ { 1453 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1454 --src; 1455 c=Character.toCodePoint(c2, (char)c); 1456 } 1457 } 1458 } 1459 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1460 prevFCD16=fcd16; 1461 src+=Character.charCount(c); 1462 } else { 1463 break; 1464 } 1465 } 1466 } 1467 // copy these code units all at once 1468 if(src!=prevSrc) { 1469 if(src==limit) { 1470 if(buffer!=null) { 1471 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1472 } 1473 break; 1474 } 1475 prevBoundary=src; 1476 // We know that the previous character's lccc==0. 1477 if(prevFCD16<0) { 1478 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1479 int prev=~prevFCD16; 1480 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); 1481 if(prevFCD16>1) { 1482 --prevBoundary; 1483 } 1484 } else { 1485 int p=src-1; 1486 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1487 Character.isHighSurrogate(s.charAt(p-1)) 1488 ) { 1489 --p; 1490 // Need to fetch the previous character's FCD value because 1491 // prevFCD16 was just for the trail surrogate code point. 1492 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1493 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1494 } 1495 if(prevFCD16>1) { 1496 prevBoundary=p; 1497 } 1498 } 1499 if(buffer!=null) { 1500 // The last lccc==0 character is excluded from the 1501 // flush-and-append call in case it needs to be modified. 1502 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1503 buffer.append(s, prevBoundary, src); 1504 } 1505 // The start of the current character (c). 1506 prevSrc=src; 1507 } else if(src==limit) { 1508 break; 1509 } 1510 1511 src+=Character.charCount(c); 1512 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1513 // Check for proper order, and decompose locally if necessary. 1514 if((prevFCD16&0xff)<=(fcd16>>8)) { 1515 // proper order: prev tccc <= current lccc 1516 if((fcd16&0xff)<=1) { 1517 prevBoundary=src; 1518 } 1519 if(buffer!=null) { 1520 buffer.appendZeroCC(c); 1521 } 1522 prevFCD16=fcd16; 1523 continue; 1524 } else if(buffer==null) { 1525 return prevBoundary; // quick check "no" 1526 } else { 1527 /* 1528 * Back out the part of the source that we copied or appended 1529 * already but is now going to be decomposed. 1530 * prevSrc is set to after what was copied/appended. 1531 */ 1532 buffer.removeSuffix(prevSrc-prevBoundary); 1533 /* 1534 * Find the part of the source that needs to be decomposed, 1535 * up to the next safe boundary. 1536 */ 1537 src=findNextFCDBoundary(s, src, limit); 1538 /* 1539 * The source text does not fulfill the conditions for FCD. 1540 * Decompose and reorder a limited piece of the text. 1541 */ 1542 decomposeShort(s, prevBoundary, src, buffer); 1543 prevBoundary=src; 1544 prevFCD16=0; 1545 } 1546 } 1547 return src; 1548 } 1549 public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { 1550 int src=0, limit=s.length(); 1551 if(!buffer.isEmpty()) { 1552 int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); 1553 if(0!=firstBoundaryInSrc) { 1554 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), 1555 buffer.length()); 1556 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ 1557 firstBoundaryInSrc+16); 1558 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); 1559 buffer.removeSuffix(buffer.length()-lastBoundaryInDest); 1560 middle.append(s, 0, firstBoundaryInSrc); 1561 makeFCD(middle, 0, middle.length(), buffer); 1562 src=firstBoundaryInSrc; 1563 } 1564 } 1565 if(doMakeFCD) { 1566 makeFCD(s, src, limit, buffer); 1567 } else { 1568 buffer.append(s, src, limit); 1569 } 1570 } 1571 1572 // Note: hasDecompBoundary() could be implemented as aliases to 1573 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 1574 // at the cost of building the FCD trie for a decomposition normalizer. 1575 public boolean hasDecompBoundary(int c, boolean before) { 1576 for(;;) { 1577 if(c<minDecompNoCP) { 1578 return true; 1579 } 1580 int norm16=getNorm16(c); 1581 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 1582 return true; 1583 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 1584 return false; // ccc!=0 1585 } else if(isDecompNoAlgorithmic(norm16)) { 1586 c=mapAlgorithmic(c, norm16); 1587 } else { 1588 // c decomposes, get everything from the variable-length extra data 1589 int firstUnit=extraData.charAt(norm16); 1590 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1591 return false; 1592 } 1593 if(!before) { 1594 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1595 // fcd16<=1 || trailCC==0 1596 if(firstUnit>0x1ff) { 1597 return false; // trailCC>1 1598 } 1599 if(firstUnit<=0xff) { 1600 return true; // trailCC==0 1601 } 1602 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1603 } 1604 // true if leadCC==0 (hasFCDBoundaryBefore()) 1605 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; 1606 } 1607 } 1608 } 1609 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1610 1611 public boolean hasCompBoundaryBefore(int c) { 1612 return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 1613 } 1614 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) { 1615 for(;;) { 1616 int norm16=getNorm16(c); 1617 if(isInert(norm16)) { 1618 return true; 1619 } else if(norm16<=minYesNo) { 1620 // Hangul: norm16==minYesNo 1621 // Hangul LVT has a boundary after it. 1622 // Hangul LV and non-inert yesYes characters combine forward. 1623 return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c); 1624 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 1625 return false; 1626 } else if(isDecompNoAlgorithmic(norm16)) { 1627 c=mapAlgorithmic(c, norm16); 1628 } else { 1629 // c decomposes, get everything from the variable-length extra data. 1630 // If testInert, then c must be a yesNo character which has lccc=0, 1631 // otherwise it could be a noNo. 1632 int firstUnit=extraData.charAt(norm16); 1633 // true if 1634 // not MAPPING_NO_COMP_BOUNDARY_AFTER 1635 // (which is set if 1636 // c is not deleted, and 1637 // it and its decomposition do not combine forward, and it has a starter) 1638 // and if FCC then trailCC<=1 1639 return 1640 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && 1641 (!onlyContiguous || firstUnit<=0x1ff); 1642 } 1643 } 1644 } 1645 1646 public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; } 1647 public boolean hasFCDBoundaryAfter(int c) { 1648 int fcd16=getFCD16(c); 1649 return fcd16<=1 || (fcd16&0xff)==0; 1650 } 1651 public boolean isFCDInert(int c) { return getFCD16(c)<=1; } 1652 1653 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1654 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1655 private static boolean isInert(int norm16) { return norm16==0; } 1656 private static boolean isJamoL(int norm16) { return norm16==1; } 1657 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1658 private boolean isHangul(int norm16) { return norm16==minYesNo; } 1659 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1660 // UBool isCompYes(uint16_t norm16) const { 1661 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1662 // } 1663 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1664 // return norm16<minNoNo || minMaybeYes<=norm16; 1665 // } 1666 // private boolean hasZeroCCFromDecompYes(int norm16) { 1667 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1668 // } 1669 private boolean isDecompYesAndZeroCC(int norm16) { 1670 return norm16<minYesNo || 1671 norm16==JAMO_VT || 1672 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1673 } 1674 /** 1675 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1676 * the MaybeYes which combine-forward and have ccc=0. 1677 * (Standard Unicode 5.2 normalization does not have such characters.) 1678 */ 1679 private boolean isMostDecompYesAndZeroCC(int norm16) { 1680 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1681 } 1682 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1683 1684 // For use with isCompYes(). 1685 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1686 // static uint8_t getCCFromYes(uint16_t norm16) { 1687 // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 1688 // } 1689 private int getCCFromNoNo(int norm16) { 1690 if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1691 return extraData.charAt(norm16-1)&0xff; 1692 } else { 1693 return 0; 1694 } 1695 } 1696 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 1697 int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { 1698 int c; 1699 if(cpStart==(cpLimit-1)) { 1700 c=s.charAt(cpStart); 1701 } else { 1702 c=Character.codePointAt(s, cpStart); 1703 } 1704 int prevNorm16=getNorm16(c); 1705 if(prevNorm16<=minYesNo) { 1706 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 1707 } else { 1708 return extraData.charAt(prevNorm16)>>8; // tccc from yesNo 1709 } 1710 } 1711 1712 // Requires algorithmic-NoNo. 1713 private int mapAlgorithmic(int c, int norm16) { 1714 return c+norm16-(minMaybeYes-MAX_DELTA-1); 1715 } 1716 1717 // Requires minYesNo<norm16<limitNoNo. 1718 // private int getMapping(int norm16) { return /*extraData+*/norm16; } 1719 1720 /** 1721 * @return index into maybeYesCompositions, or -1 1722 */ 1723 private int getCompositionsListForDecompYes(int norm16) { 1724 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 1725 return -1; 1726 } else { 1727 if((norm16-=minMaybeYes)<0) { 1728 // norm16<minMaybeYes: index into extraData which is a substring at 1729 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1730 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1731 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1732 } 1733 return norm16; 1734 } 1735 } 1736 /** 1737 * @return index into maybeYesCompositions 1738 */ 1739 private int getCompositionsListForComposite(int norm16) { 1740 // composite has both mapping & compositions list 1741 int firstUnit=extraData.charAt(norm16); 1742 return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions 1743 1+ // +1 to skip the first unit with the mapping lenth 1744 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1745 } 1746 /** 1747 * @param c code point must have compositions 1748 * @return index into maybeYesCompositions 1749 */ 1750 private int getCompositionsList(int norm16) { 1751 return isDecompYes(norm16) ? 1752 getCompositionsListForDecompYes(norm16) : 1753 getCompositionsListForComposite(norm16); 1754 } 1755 1756 // Decompose a short piece of text which is likely to contain characters that 1757 // fail the quick check loop and/or where the quick check loop's overhead 1758 // is unlikely to be amortized. 1759 // Called by the compose() and makeFCD() implementations. 1760 // Public in Java for collation implementation code. 1761 public void decomposeShort(CharSequence s, int src, int limit, 1762 ReorderingBuffer buffer) { 1763 while(src<limit) { 1764 int c=Character.codePointAt(s, src); 1765 src+=Character.charCount(c); 1766 decompose(c, getNorm16(c), buffer); 1767 } 1768 } 1769 private void decompose(int c, int norm16, 1770 ReorderingBuffer buffer) { 1771 // Only loops for 1:1 algorithmic mappings. 1772 for(;;) { 1773 // get the decomposition and the lead and trail cc's 1774 if(isDecompYes(norm16)) { 1775 // c does not decompose 1776 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1777 } else if(isHangul(norm16)) { 1778 // Hangul syllable: decompose algorithmically 1779 Hangul.decompose(c, buffer); 1780 } else if(isDecompNoAlgorithmic(norm16)) { 1781 c=mapAlgorithmic(c, norm16); 1782 norm16=getNorm16(c); 1783 continue; 1784 } else { 1785 // c decomposes, get everything from the variable-length extra data 1786 int firstUnit=extraData.charAt(norm16); 1787 int length=firstUnit&MAPPING_LENGTH_MASK; 1788 int leadCC, trailCC; 1789 trailCC=firstUnit>>8; 1790 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1791 leadCC=extraData.charAt(norm16-1)>>8; 1792 } else { 1793 leadCC=0; 1794 } 1795 ++norm16; // skip over the firstUnit 1796 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); 1797 } 1798 return; 1799 } 1800 } 1801 1802 /** 1803 * Finds the recomposition result for 1804 * a forward-combining "lead" character, 1805 * specified with a pointer to its compositions list, 1806 * and a backward-combining "trail" character. 1807 * 1808 * <p>If the lead and trail characters combine, then this function returns 1809 * the following "compositeAndFwd" value: 1810 * <pre> 1811 * Bits 21..1 composite character 1812 * Bit 0 set if the composite is a forward-combining starter 1813 * </pre> 1814 * otherwise it returns -1. 1815 * 1816 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1817 * encoded as either pairs or triples of 16-bit units. 1818 * The last entry has the high bit of its first unit set. 1819 * 1820 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1821 * A linear search is used. 1822 * 1823 * <p>See normalizer2impl.h for a more detailed description 1824 * of the compositions list format. 1825 */ 1826 private static int combine(String compositions, int list, int trail) { 1827 int key1, firstUnit; 1828 if(trail<COMP_1_TRAIL_LIMIT) { 1829 // trail character is 0..33FF 1830 // result entry may have 2 or 3 units 1831 key1=(trail<<1); 1832 while(key1>(firstUnit=compositions.charAt(list))) { 1833 list+=2+(firstUnit&COMP_1_TRIPLE); 1834 } 1835 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1836 if((firstUnit&COMP_1_TRIPLE)!=0) { 1837 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1838 } else { 1839 return compositions.charAt(list+1); 1840 } 1841 } 1842 } else { 1843 // trail character is 3400..10FFFF 1844 // result entry has 3 units 1845 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1846 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1847 int secondUnit; 1848 for(;;) { 1849 if(key1>(firstUnit=compositions.charAt(list))) { 1850 list+=2+(firstUnit&COMP_1_TRIPLE); 1851 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1852 if(key2>(secondUnit=compositions.charAt(list+1))) { 1853 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1854 break; 1855 } else { 1856 list+=3; 1857 } 1858 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1859 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1860 } else { 1861 break; 1862 } 1863 } else { 1864 break; 1865 } 1866 } 1867 } 1868 return -1; 1869 } 1870 /** 1871 * @param list some character's compositions list 1872 * @param set recursively receives the composites from these compositions 1873 */ 1874 private void addComposites(int list, UnicodeSet set) { 1875 int firstUnit, compositeAndFwd; 1876 do { 1877 firstUnit=maybeYesCompositions.charAt(list); 1878 if((firstUnit&COMP_1_TRIPLE)==0) { 1879 compositeAndFwd=maybeYesCompositions.charAt(list+1); 1880 list+=2; 1881 } else { 1882 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| 1883 maybeYesCompositions.charAt(list+2); 1884 list+=3; 1885 } 1886 int composite=compositeAndFwd>>1; 1887 if((compositeAndFwd&1)!=0) { 1888 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 1889 } 1890 set.add(composite); 1891 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 1892 } 1893 /* 1894 * Recomposes the buffer text starting at recomposeStartIndex 1895 * (which is in NFD - decomposed and canonically ordered), 1896 * and truncates the buffer contents. 1897 * 1898 * Note that recomposition never lengthens the text: 1899 * Any character consists of either one or two code units; 1900 * a composition may contain at most one more code unit than the original starter, 1901 * while the combining mark that is removed has at least one code unit. 1902 */ 1903 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1904 boolean onlyContiguous) { 1905 StringBuilder sb=buffer.getStringBuilder(); 1906 int p=recomposeStartIndex; 1907 if(p==sb.length()) { 1908 return; 1909 } 1910 1911 int starter, pRemove; 1912 int compositionsList; 1913 int c, compositeAndFwd; 1914 int norm16; 1915 int cc, prevCC; 1916 boolean starterIsSupplementary; 1917 1918 // Some of the following variables are not used until we have a forward-combining starter 1919 // and are only initialized now to avoid compiler warnings. 1920 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1921 starter=-1; 1922 starterIsSupplementary=false; 1923 prevCC=0; 1924 1925 for(;;) { 1926 c=sb.codePointAt(p); 1927 p+=Character.charCount(c); 1928 norm16=getNorm16(c); 1929 cc=getCCFromYesOrMaybe(norm16); 1930 if( // this character combines backward and 1931 isMaybe(norm16) && 1932 // we have seen a starter that combines forward and 1933 compositionsList>=0 && 1934 // the backward-combining character is not blocked 1935 (prevCC<cc || prevCC==0) 1936 ) { 1937 if(isJamoVT(norm16)) { 1938 // c is a Jamo V/T, see if we can compose it with the previous character. 1939 if(c<Hangul.JAMO_T_BASE) { 1940 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1941 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1942 if(prev<Hangul.JAMO_L_COUNT) { 1943 pRemove=p-1; 1944 char syllable=(char) 1945 (Hangul.HANGUL_BASE+ 1946 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1947 Hangul.JAMO_T_COUNT); 1948 char t; 1949 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1950 ++p; 1951 syllable+=t; // The next character was a Jamo T. 1952 } 1953 sb.setCharAt(starter, syllable); 1954 // remove the Jamo V/T 1955 sb.delete(pRemove, p); 1956 p=pRemove; 1957 } 1958 } 1959 /* 1960 * No "else" for Jamo T: 1961 * Since the input is in NFD, there are no Hangul LV syllables that 1962 * a Jamo T could combine with. 1963 * All Jamo Ts are combined above when handling Jamo Vs. 1964 */ 1965 if(p==sb.length()) { 1966 break; 1967 } 1968 compositionsList=-1; 1969 continue; 1970 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1971 // The starter and the combining mark (c) do combine. 1972 int composite=compositeAndFwd>>1; 1973 1974 // Remove the combining mark. 1975 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1976 sb.delete(pRemove, p); 1977 p=pRemove; 1978 // Replace the starter with the composite. 1979 if(starterIsSupplementary) { 1980 if(composite>0xffff) { 1981 // both are supplementary 1982 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1983 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1984 } else { 1985 sb.setCharAt(starter, (char)c); 1986 sb.deleteCharAt(starter+1); 1987 // The composite is shorter than the starter, 1988 // move the intermediate characters forward one. 1989 starterIsSupplementary=false; 1990 --p; 1991 } 1992 } else if(composite>0xffff) { 1993 // The composite is longer than the starter, 1994 // move the intermediate characters back one. 1995 starterIsSupplementary=true; 1996 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1997 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 1998 ++p; 1999 } else { 2000 // both are on the BMP 2001 sb.setCharAt(starter, (char)composite); 2002 } 2003 2004 // Keep prevCC because we removed the combining mark. 2005 2006 if(p==sb.length()) { 2007 break; 2008 } 2009 // Is the composite a starter that combines forward? 2010 if((compositeAndFwd&1)!=0) { 2011 compositionsList= 2012 getCompositionsListForComposite(getNorm16(composite)); 2013 } else { 2014 compositionsList=-1; 2015 } 2016 2017 // We combined; continue with looking for compositions. 2018 continue; 2019 } 2020 } 2021 2022 // no combination this time 2023 prevCC=cc; 2024 if(p==sb.length()) { 2025 break; 2026 } 2027 2028 // If c did not combine, then check if it is a starter. 2029 if(cc==0) { 2030 // Found a new starter. 2031 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 2032 // It may combine with something, prepare for it. 2033 if(c<=0xffff) { 2034 starterIsSupplementary=false; 2035 starter=p-1; 2036 } else { 2037 starterIsSupplementary=true; 2038 starter=p-2; 2039 } 2040 } 2041 } else if(onlyContiguous) { 2042 // FCC: no discontiguous compositions; any intervening character blocks. 2043 compositionsList=-1; 2044 } 2045 } 2046 buffer.flush(); 2047 } 2048 2049 public int composePair(int a, int b) { 2050 int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 2051 int list; 2052 if(isInert(norm16)) { 2053 return -1; 2054 } else if(norm16<minYesNoMappingsOnly) { 2055 if(isJamoL(norm16)) { 2056 b-=Hangul.JAMO_V_BASE; 2057 if(0<=b && b<Hangul.JAMO_V_COUNT) { 2058 return 2059 (Hangul.HANGUL_BASE+ 2060 ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 2061 Hangul.JAMO_T_COUNT); 2062 } else { 2063 return -1; 2064 } 2065 } else if(isHangul(norm16)) { 2066 b-=Hangul.JAMO_T_BASE; 2067 if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 2068 return a+b; 2069 } else { 2070 return -1; 2071 } 2072 } else { 2073 // 'a' has a compositions list in extraData 2074 list=norm16; 2075 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 2076 list+= // mapping pointer 2077 1+ // +1 to skip the first unit with the mapping lenth 2078 (extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 2079 } 2080 // Turn the offset-into-extraData into an offset-into-maybeYesCompositions. 2081 list+=MIN_NORMAL_MAYBE_YES-minMaybeYes; 2082 } 2083 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 2084 return -1; 2085 } else { 2086 list=norm16-minMaybeYes; // offset into maybeYesCompositions 2087 } 2088 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 2089 return -1; 2090 } 2091 return combine(maybeYesCompositions, list, b)>>1; 2092 } 2093 2094 /** 2095 * Does c have a composition boundary before it? 2096 * True if its decomposition begins with a character that has 2097 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 2098 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 2099 * (isCompYesAndZeroCC()) so we need not decompose. 2100 */ 2101 private boolean hasCompBoundaryBefore(int c, int norm16) { 2102 for(;;) { 2103 if(isCompYesAndZeroCC(norm16)) { 2104 return true; 2105 } else if(isMaybeOrNonZeroCC(norm16)) { 2106 return false; 2107 } else if(isDecompNoAlgorithmic(norm16)) { 2108 c=mapAlgorithmic(c, norm16); 2109 norm16=getNorm16(c); 2110 } else { 2111 // c decomposes, get everything from the variable-length extra data 2112 int firstUnit=extraData.charAt(norm16); 2113 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 2114 return false; 2115 } 2116 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { 2117 return false; // non-zero leadCC 2118 } 2119 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); 2120 } 2121 } 2122 } 2123 private int findPreviousCompBoundary(CharSequence s, int p) { 2124 while(p>0) { 2125 int c=Character.codePointBefore(s, p); 2126 p-=Character.charCount(c); 2127 if(hasCompBoundaryBefore(c)) { 2128 break; 2129 } 2130 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 2131 // but that's probably not worth the extra cost. 2132 } 2133 return p; 2134 } 2135 private int findNextCompBoundary(CharSequence s, int p, int limit) { 2136 while(p<limit) { 2137 int c=Character.codePointAt(s, p); 2138 int norm16=normTrie.get(c); 2139 if(hasCompBoundaryBefore(c, norm16)) { 2140 break; 2141 } 2142 p+=Character.charCount(c); 2143 } 2144 return p; 2145 } 2146 2147 private int findPreviousFCDBoundary(CharSequence s, int p) { 2148 while(p>0) { 2149 int c=Character.codePointBefore(s, p); 2150 p-=Character.charCount(c); 2151 if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { 2152 break; 2153 } 2154 } 2155 return p; 2156 } 2157 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 2158 while(p<limit) { 2159 int c=Character.codePointAt(s, p); 2160 if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { 2161 break; 2162 } 2163 p+=Character.charCount(c); 2164 } 2165 return p; 2166 } 2167 2168 private void addToStartSet(Trie2Writable newData, int origin, int decompLead) { 2169 int canonValue=newData.get(decompLead); 2170 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 2171 // origin is the first character whose decomposition starts with 2172 // the character for which we are setting the value. 2173 newData.set(decompLead, canonValue|origin); 2174 } else { 2175 // origin is not the first character, or it is U+0000. 2176 UnicodeSet set; 2177 if((canonValue&CANON_HAS_SET)==0) { 2178 int firstOrigin=canonValue&CANON_VALUE_MASK; 2179 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size(); 2180 newData.set(decompLead, canonValue); 2181 canonStartSets.add(set=new UnicodeSet()); 2182 if(firstOrigin!=0) { 2183 set.add(firstOrigin); 2184 } 2185 } else { 2186 set=canonStartSets.get(canonValue&CANON_VALUE_MASK); 2187 } 2188 set.add(origin); 2189 } 2190 } 2191 2192 @SuppressWarnings("unused") 2193 private VersionInfo dataVersion; 2194 2195 // Code point thresholds for quick check codes. 2196 private int minDecompNoCP; 2197 private int minCompNoMaybeCP; 2198 2199 // Norm16 value thresholds for quick check combinations and types of extra data. 2200 private int minYesNo; 2201 private int minYesNoMappingsOnly; 2202 private int minNoNo; 2203 private int limitNoNo; 2204 private int minMaybeYes; 2205 2206 private Trie2_16 normTrie; 2207 private String maybeYesCompositions; 2208 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2209 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2210 private int[] tccc180; // [0x180] tccc values for U+0000..U+017F 2211 2212 private Trie2_32 canonIterData; 2213 private ArrayList<UnicodeSet> canonStartSets; 2214 2215 // bits in canonIterData 2216 private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; 2217 private static final int CANON_HAS_COMPOSITIONS = 0x40000000; 2218 private static final int CANON_HAS_SET = 0x200000; 2219 private static final int CANON_VALUE_MASK = 0x1fffff; 2220} 2221