1/* 2 ******************************************************************************* 3 * Copyright (C) 2009-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8package com.ibm.icu.impl; 9 10import java.io.IOException; 11import java.nio.ByteBuffer; 12import java.util.ArrayList; 13import java.util.Iterator; 14 15import com.ibm.icu.text.UTF16; 16import com.ibm.icu.text.UnicodeSet; 17import com.ibm.icu.util.ICUUncheckedIOException; 18import com.ibm.icu.util.VersionInfo; 19 20public final class Normalizer2Impl { 21 public static final class Hangul { 22 /* Korean Hangul and Jamo constants */ 23 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 24 public static final int JAMO_L_END=0x1112; 25 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 26 public static final int JAMO_V_END=0x1175; 27 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 28 public static final int JAMO_T_END=0x11c2; 29 30 public static final int HANGUL_BASE=0xac00; 31 public static final int HANGUL_END=0xd7a3; 32 33 public static final int JAMO_L_COUNT=19; 34 public static final int JAMO_V_COUNT=21; 35 public static final int JAMO_T_COUNT=28; 36 37 public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; 38 public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; 39 40 public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; 41 42 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 43 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 44 45 public static boolean isHangul(int c) { 46 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 47 } 48 public static boolean isHangulWithoutJamoT(char c) { 49 c-=HANGUL_BASE; 50 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 51 } 52 public static boolean isJamoL(int c) { 53 return JAMO_L_BASE<=c && c<JAMO_L_LIMIT; 54 } 55 public static boolean isJamoV(int c) { 56 return JAMO_V_BASE<=c && c<JAMO_V_LIMIT; 57 } 58 59 /** 60 * Decomposes c, which must be a Hangul syllable, into buffer 61 * and returns the length of the decomposition (2 or 3). 62 */ 63 public static int decompose(int c, Appendable buffer) { 64 try { 65 c-=HANGUL_BASE; 66 int c2=c%JAMO_T_COUNT; 67 c/=JAMO_T_COUNT; 68 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 69 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 70 if(c2==0) { 71 return 2; 72 } else { 73 buffer.append((char)(JAMO_T_BASE+c2)); 74 return 3; 75 } 76 } catch(IOException e) { 77 // Will not occur because we do not write to I/O. 78 throw new ICUUncheckedIOException(e); 79 } 80 } 81 82 /** 83 * Decomposes c, which must be a Hangul syllable, into buffer. 84 * This is the raw, not recursive, decomposition. Its length is always 2. 85 */ 86 public static void getRawDecomposition(int c, Appendable buffer) { 87 try { 88 int orig=c; 89 c-=HANGUL_BASE; 90 int c2=c%JAMO_T_COUNT; 91 if(c2==0) { 92 c/=JAMO_T_COUNT; 93 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 94 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 95 } else { 96 buffer.append((char)(orig-c2)); // LV syllable 97 buffer.append((char)(JAMO_T_BASE+c2)); 98 } 99 } catch(IOException e) { 100 // Will not occur because we do not write to I/O. 101 throw new ICUUncheckedIOException(e); 102 } 103 } 104 } 105 106 /** 107 * Writable buffer that takes care of canonical ordering. 108 * Its Appendable methods behave like the C++ implementation's 109 * appendZeroCC() methods. 110 * <p> 111 * If dest is a StringBuilder, then the buffer writes directly to it. 112 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 113 * until no further changes are necessary and whole segments are appended. 114 * append() methods that take combining-class values always write to the StringBuilder. 115 * Other append() methods flush and append to the Appendable. 116 */ 117 public static final class ReorderingBuffer implements Appendable { 118 public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { 119 impl=ni; 120 app=dest; 121 if(app instanceof StringBuilder) { 122 appIsStringBuilder=true; 123 str=(StringBuilder)dest; 124 // In Java, the constructor subsumes public void init(int destCapacity) { 125 str.ensureCapacity(destCapacity); 126 reorderStart=0; 127 if(str.length()==0) { 128 lastCC=0; 129 } else { 130 setIterator(); 131 lastCC=previousCC(); 132 // Set reorderStart after the last code point with cc<=1 if there is one. 133 if(lastCC>1) { 134 while(previousCC()>1) {} 135 } 136 reorderStart=codePointLimit; 137 } 138 } else { 139 appIsStringBuilder=false; 140 str=new StringBuilder(); 141 reorderStart=0; 142 lastCC=0; 143 } 144 } 145 146 public boolean isEmpty() { return str.length()==0; } 147 public int length() { return str.length(); } 148 public int getLastCC() { return lastCC; } 149 150 public StringBuilder getStringBuilder() { return str; } 151 152 public boolean equals(CharSequence s, int start, int limit) { 153 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 154 } 155 156 // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 157 public void setLastChar(char c) { 158 str.setCharAt(str.length()-1, c); 159 } 160 161 public void append(int c, int cc) { 162 if(lastCC<=cc || cc==0) { 163 str.appendCodePoint(c); 164 lastCC=cc; 165 if(cc<=1) { 166 reorderStart=str.length(); 167 } 168 } else { 169 insert(c, cc); 170 } 171 } 172 // s must be in NFD, otherwise change the implementation. 173 public void append(CharSequence s, int start, int limit, 174 int leadCC, int trailCC) { 175 if(start==limit) { 176 return; 177 } 178 if(lastCC<=leadCC || leadCC==0) { 179 if(trailCC<=1) { 180 reorderStart=str.length()+(limit-start); 181 } else if(leadCC<=1) { 182 reorderStart=str.length()+1; // Ok if not a code point boundary. 183 } 184 str.append(s, start, limit); 185 lastCC=trailCC; 186 } else { 187 int c=Character.codePointAt(s, start); 188 start+=Character.charCount(c); 189 insert(c, leadCC); // insert first code point 190 while(start<limit) { 191 c=Character.codePointAt(s, start); 192 start+=Character.charCount(c); 193 if(start<limit) { 194 // s must be in NFD, otherwise we need to use getCC(). 195 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 196 } else { 197 leadCC=trailCC; 198 } 199 append(c, leadCC); 200 } 201 } 202 } 203 // The following append() methods work like C++ appendZeroCC(). 204 // They assume that the cc or trailCC of their input is 0. 205 // Most of them implement Appendable interface methods. 206 // @Override when we switch to Java 6 207 public ReorderingBuffer append(char c) { 208 str.append(c); 209 lastCC=0; 210 reorderStart=str.length(); 211 return this; 212 } 213 public void appendZeroCC(int c) { 214 str.appendCodePoint(c); 215 lastCC=0; 216 reorderStart=str.length(); 217 } 218 // @Override when we switch to Java 6 219 public ReorderingBuffer append(CharSequence s) { 220 if(s.length()!=0) { 221 str.append(s); 222 lastCC=0; 223 reorderStart=str.length(); 224 } 225 return this; 226 } 227 // @Override when we switch to Java 6 228 public ReorderingBuffer append(CharSequence s, int start, int limit) { 229 if(start!=limit) { 230 str.append(s, start, limit); 231 lastCC=0; 232 reorderStart=str.length(); 233 } 234 return this; 235 } 236 /** 237 * Flushes from the intermediate StringBuilder to the Appendable, 238 * if they are different objects. 239 * Used after recomposition. 240 * Must be called at the end when writing to a non-StringBuilder Appendable. 241 */ 242 public void flush() { 243 if(appIsStringBuilder) { 244 reorderStart=str.length(); 245 } else { 246 try { 247 app.append(str); 248 str.setLength(0); 249 reorderStart=0; 250 } catch(IOException e) { 251 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 252 } 253 } 254 lastCC=0; 255 } 256 /** 257 * Flushes from the intermediate StringBuilder to the Appendable, 258 * if they are different objects. 259 * Then appends the new text to the Appendable or StringBuilder. 260 * Normally used after quick check loops find a non-empty sequence. 261 */ 262 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 263 if(appIsStringBuilder) { 264 str.append(s, start, limit); 265 reorderStart=str.length(); 266 } else { 267 try { 268 app.append(str).append(s, start, limit); 269 str.setLength(0); 270 reorderStart=0; 271 } catch(IOException e) { 272 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 273 } 274 } 275 lastCC=0; 276 return this; 277 } 278 public void remove() { 279 str.setLength(0); 280 lastCC=0; 281 reorderStart=0; 282 } 283 public void removeSuffix(int suffixLength) { 284 int oldLength=str.length(); 285 str.delete(oldLength-suffixLength, oldLength); 286 lastCC=0; 287 reorderStart=str.length(); 288 } 289 290 /* 291 * TODO: Revisit whether it makes sense to track reorderStart. 292 * It is set to after the last known character with cc<=1, 293 * which stops previousCC() before it reads that character and looks up its cc. 294 * previousCC() is normally only called from insert(). 295 * In other words, reorderStart speeds up the insertion of a combining mark 296 * into a multi-combining mark sequence where it does not belong at the end. 297 * This might not be worth the trouble. 298 * On the other hand, it's not a huge amount of trouble. 299 * 300 * We probably need it for UNORM_SIMPLE_APPEND. 301 */ 302 303 // Inserts c somewhere before the last character. 304 // Requires 0<cc<lastCC which implies reorderStart<limit. 305 private void insert(int c, int cc) { 306 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 307 // insert c at codePointLimit, after the character with prevCC<=cc 308 if(c<=0xffff) { 309 str.insert(codePointLimit, (char)c); 310 if(cc<=1) { 311 reorderStart=codePointLimit+1; 312 } 313 } else { 314 str.insert(codePointLimit, Character.toChars(c)); 315 if(cc<=1) { 316 reorderStart=codePointLimit+2; 317 } 318 } 319 } 320 321 private final Normalizer2Impl impl; 322 private final Appendable app; 323 private final StringBuilder str; 324 private final boolean appIsStringBuilder; 325 private int reorderStart; 326 private int lastCC; 327 328 // private backward iterator 329 private void setIterator() { codePointStart=str.length(); } 330 private void skipPrevious() { // Requires 0<codePointStart. 331 codePointLimit=codePointStart; 332 codePointStart=str.offsetByCodePoints(codePointStart, -1); 333 } 334 private int previousCC() { // Returns 0 if there is no previous character. 335 codePointLimit=codePointStart; 336 if(reorderStart>=codePointStart) { 337 return 0; 338 } 339 int c=str.codePointBefore(codePointStart); 340 codePointStart-=Character.charCount(c); 341 if(c<MIN_CCC_LCCC_CP) { 342 return 0; 343 } 344 return getCCFromYesOrMaybe(impl.getNorm16(c)); 345 } 346 347 private int codePointStart, codePointLimit; 348 } 349 350 // TODO: Propose as public API on the UTF16 class. 351 // TODO: Propose widening UTF16 methods that take char to take int. 352 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 353 public static final class UTF16Plus { 354 /** 355 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 356 * is it a lead surrogate? 357 * @param c code unit or code point 358 * @return true or false 359 */ 360 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 361 /** 362 * Compares two CharSequence objects for binary equality. 363 * @param s1 first sequence 364 * @param s2 second sequence 365 * @return true if s1 contains the same text as s2 366 */ 367 public static boolean equal(CharSequence s1, CharSequence s2) { 368 if(s1==s2) { 369 return true; 370 } 371 int length=s1.length(); 372 if(length!=s2.length()) { 373 return false; 374 } 375 for(int i=0; i<length; ++i) { 376 if(s1.charAt(i)!=s2.charAt(i)) { 377 return false; 378 } 379 } 380 return true; 381 } 382 /** 383 * Compares two CharSequence subsequences for binary equality. 384 * @param s1 first sequence 385 * @param start1 start offset in first sequence 386 * @param limit1 limit offset in first sequence 387 * @param s2 second sequence 388 * @param start2 start offset in second sequence 389 * @param limit2 limit offset in second sequence 390 * @return true if s1.subSequence(start1, limit1) contains the same text 391 * as s2.subSequence(start2, limit2) 392 */ 393 public static boolean equal(CharSequence s1, int start1, int limit1, 394 CharSequence s2, int start2, int limit2) { 395 if((limit1-start1)!=(limit2-start2)) { 396 return false; 397 } 398 if(s1==s2 && start1==start2) { 399 return true; 400 } 401 while(start1<limit1) { 402 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 403 return false; 404 } 405 } 406 return true; 407 } 408 } 409 410 public Normalizer2Impl() {} 411 412 private static final class IsAcceptable implements ICUBinary.Authenticate { 413 // @Override when we switch to Java 6 414 public boolean isDataVersionAcceptable(byte version[]) { 415 return version[0]==2; 416 } 417 } 418 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 419 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 420 421 public Normalizer2Impl load(ByteBuffer bytes) { 422 try { 423 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 424 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 425 if(indexesLength<=IX_MIN_MAYBE_YES) { 426 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes"); 427 } 428 int[] inIndexes=new int[indexesLength]; 429 inIndexes[0]=indexesLength*4; 430 for(int i=1; i<indexesLength; ++i) { 431 inIndexes[i]=bytes.getInt(); 432 } 433 434 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 435 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 436 437 minYesNo=inIndexes[IX_MIN_YES_NO]; 438 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 439 minNoNo=inIndexes[IX_MIN_NO_NO]; 440 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 441 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 442 443 // Read the normTrie. 444 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 445 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 446 normTrie=Trie2_16.createFromSerialized(bytes); 447 int trieLength=normTrie.getSerializedLength(); 448 if(trieLength>(nextOffset-offset)) { 449 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); 450 } 451 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 452 453 // Read the composition and mapping data. 454 offset=nextOffset; 455 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 456 int numChars=(nextOffset-offset)/2; 457 char[] chars; 458 if(numChars!=0) { 459 chars=new char[numChars]; 460 for(int i=0; i<numChars; ++i) { 461 chars[i]=bytes.getChar(); 462 } 463 maybeYesCompositions=new String(chars); 464 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); 465 } 466 467 // smallFCD: new in formatVersion 2 468 offset=nextOffset; 469 smallFCD=new byte[0x100]; 470 for(int i=0; i<0x100; ++i) { 471 smallFCD[i]=bytes.get(); 472 } 473 474 // Build tccc180[]. 475 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 476 tccc180=new int[0x180]; 477 int bits=0; 478 for(int c=0; c<0x180; bits>>=1) { 479 if((c&0xff)==0) { 480 bits=smallFCD[c>>8]; // one byte per 0x100 code points 481 } 482 if((bits&1)!=0) { 483 for(int i=0; i<0x20; ++i, ++c) { 484 tccc180[c]=getFCD16FromNormData(c)&0xff; 485 } 486 } else { 487 c+=0x20; 488 } 489 } 490 491 return this; 492 } catch(IOException e) { 493 throw new ICUUncheckedIOException(e); 494 } 495 } 496 public Normalizer2Impl load(String name) { 497 return load(ICUBinary.getRequiredData(name)); 498 } 499 500 private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) { 501 if(isAlgorithmicNoNo(norm16)) { 502 // Range of code points with same-norm16-value algorithmic decompositions. 503 // They might have different non-zero FCD16 values. 504 do { 505 int fcd16=getFCD16(start); 506 if(fcd16>0xff) { set.add(start); } 507 } while(++start<=end); 508 } else { 509 int fcd16=getFCD16(start); 510 if(fcd16>0xff) { set.add(start, end); } 511 } 512 } 513 514 private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) { 515 /* add the start code point to the USet */ 516 set.add(start); 517 if(start!=end && isAlgorithmicNoNo(value)) { 518 // Range of code points with same-norm16-value algorithmic decompositions. 519 // They might have different non-zero FCD16 values. 520 int prevFCD16=getFCD16(start); 521 while(++start<=end) { 522 int fcd16=getFCD16(start); 523 if(fcd16!=prevFCD16) { 524 set.add(start); 525 prevFCD16=fcd16; 526 } 527 } 528 } 529 } 530 531 public void addLcccChars(UnicodeSet set) { 532 /* add the start code point of each same-value range of each trie */ 533 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 534 Trie2.Range range; 535 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 536 enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set); 537 } 538 } 539 540 public void addPropertyStarts(UnicodeSet set) { 541 /* add the start code point of each same-value range of each trie */ 542 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 543 Trie2.Range range; 544 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 545 enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set); 546 } 547 548 /* add Hangul LV syllables and LV+1 because of skippables */ 549 for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) { 550 set.add(c); 551 set.add(c+1); 552 } 553 set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 554 } 555 556 public void addCanonIterPropertyStarts(UnicodeSet set) { 557 /* add the start code point of each same-value range of the canonical iterator data trie */ 558 ensureCanonIterData(); 559 // currently only used for the SEGMENT_STARTER property 560 Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper); 561 Trie2.Range range; 562 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 563 /* add the start code point to the USet */ 564 set.add(range.startCodePoint); 565 } 566 } 567 private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() { 568 public int map(int in) { 569 return in&CANON_NOT_SEGMENT_STARTER; 570 } 571 }; 572 573 // low-level properties ------------------------------------------------ *** 574 575 public Trie2_16 getNormTrie() { return normTrie; } 576 577 // Note: Normalizer2Impl.java r30983 (2011-nov-27) 578 // still had getFCDTrie() which built and cached an FCD trie. 579 // That provided faster access to FCD data than getFCD16FromNormData() 580 // but required synchronization and consumed some 10kB of heap memory 581 // in any process that uses FCD (e.g., via collation). 582 // tccc180[] and smallFCD[] are intended to help with any loss of performance, 583 // at least for Latin & CJK. 584 585 /** 586 * Builds the canonical-iterator data for this instance. 587 * This is required before any of {@link #isCanonSegmentStarter(int)} or 588 * {@link #getCanonStartSet(int, UnicodeSet)} are called, 589 * or else they crash. 590 * @return this 591 */ 592 public synchronized Normalizer2Impl ensureCanonIterData() { 593 if(canonIterData==null) { 594 Trie2Writable newData=new Trie2Writable(0, 0); 595 canonStartSets=new ArrayList<UnicodeSet>(); 596 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 597 Trie2.Range range; 598 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 599 final int norm16=range.value; 600 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 601 // Inert, or 2-way mapping (including Hangul syllable). 602 // We do not write a canonStartSet for any yesNo character. 603 // Composites from 2-way mappings are added at runtime from the 604 // starter's compositions list, and the other characters in 605 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 606 // "maybe" characters. 607 continue; 608 } 609 for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) { 610 final int oldValue=newData.get(c); 611 int newValue=oldValue; 612 if(norm16>=minMaybeYes) { 613 // not a segment starter if it occurs in a decomposition or has cc!=0 614 newValue|=CANON_NOT_SEGMENT_STARTER; 615 if(norm16<MIN_NORMAL_MAYBE_YES) { 616 newValue|=CANON_HAS_COMPOSITIONS; 617 } 618 } else if(norm16<minYesNo) { 619 newValue|=CANON_HAS_COMPOSITIONS; 620 } else { 621 // c has a one-way decomposition 622 int c2=c; 623 int norm16_2=norm16; 624 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 625 c2=this.mapAlgorithmic(c2, norm16_2); 626 norm16_2=getNorm16(c2); 627 } 628 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 629 // c decomposes, get everything from the variable-length extra data 630 int firstUnit=extraData.charAt(norm16_2); 631 int length=firstUnit&MAPPING_LENGTH_MASK; 632 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 633 if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) { 634 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 635 } 636 } 637 // Skip empty mappings (no characters in the decomposition). 638 if(length!=0) { 639 ++norm16_2; // skip over the firstUnit 640 // add c to first code point's start set 641 int limit=norm16_2+length; 642 c2=extraData.codePointAt(norm16_2); 643 addToStartSet(newData, c, c2); 644 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 645 // one-way mapping. A 2-way mapping is possible here after 646 // intermediate algorithmic mapping. 647 if(norm16_2>=minNoNo) { 648 while((norm16_2+=Character.charCount(c2))<limit) { 649 c2=extraData.codePointAt(norm16_2); 650 int c2Value=newData.get(c2); 651 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 652 newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER); 653 } 654 } 655 } 656 } 657 } else { 658 // c decomposed to c2 algorithmically; c has cc==0 659 addToStartSet(newData, c, c2); 660 } 661 } 662 if(newValue!=oldValue) { 663 newData.set(c, newValue); 664 } 665 } 666 } 667 canonIterData=newData.toTrie2_32(); 668 } 669 return this; 670 } 671 672 public int getNorm16(int c) { return normTrie.get(c); } 673 674 public int getCompQuickCheck(int norm16) { 675 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 676 return 1; // yes 677 } else if(minMaybeYes<=norm16) { 678 return 2; // maybe 679 } else { 680 return 0; // no 681 } 682 } 683 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 684 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 685 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 686 687 public int getCC(int norm16) { 688 if(norm16>=MIN_NORMAL_MAYBE_YES) { 689 return norm16&0xff; 690 } 691 if(norm16<minNoNo || limitNoNo<=norm16) { 692 return 0; 693 } 694 return getCCFromNoNo(norm16); 695 } 696 public static int getCCFromYesOrMaybe(int norm16) { 697 return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; 698 } 699 700 /** 701 * Returns the FCD data for code point c. 702 * @param c A Unicode code point. 703 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 704 */ 705 public int getFCD16(int c) { 706 if(c<0) { 707 return 0; 708 } else if(c<0x180) { 709 return tccc180[c]; 710 } else if(c<=0xffff) { 711 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 712 } 713 return getFCD16FromNormData(c); 714 } 715 /** Returns the FCD data for U+0000<=c<U+0180. */ 716 public int getFCD16FromBelow180(int c) { return tccc180[c]; } 717 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 718 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 719 // 0<=lead<=0xffff 720 byte bits=smallFCD[lead>>8]; 721 if(bits==0) { return false; } 722 return ((bits>>((lead>>5)&7))&1)!=0; 723 } 724 725 /** Gets the FCD value from the regular normalization data. */ 726 public int getFCD16FromNormData(int c) { 727 // Only loops for 1:1 algorithmic mappings. 728 for(;;) { 729 int norm16=getNorm16(c); 730 if(norm16<=minYesNo) { 731 // no decomposition or Hangul syllable, all zeros 732 return 0; 733 } else if(norm16>=MIN_NORMAL_MAYBE_YES) { 734 // combining mark 735 norm16&=0xff; 736 return norm16|(norm16<<8); 737 } else if(norm16>=minMaybeYes) { 738 return 0; 739 } else if(isDecompNoAlgorithmic(norm16)) { 740 c=mapAlgorithmic(c, norm16); 741 } else { 742 // c decomposes, get everything from the variable-length extra data 743 int firstUnit=extraData.charAt(norm16); 744 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 745 // A character that is deleted (maps to an empty string) must 746 // get the worst-case lccc and tccc values because arbitrary 747 // characters on both sides will become adjacent. 748 return 0x1ff; 749 } else { 750 int fcd16=firstUnit>>8; // tccc 751 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 752 fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc 753 } 754 return fcd16; 755 } 756 } 757 } 758 } 759 760 /** 761 * Gets the decomposition for one code point. 762 * @param c code point 763 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 764 */ 765 public String getDecomposition(int c) { 766 int decomp=-1; 767 int norm16; 768 for(;;) { 769 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 770 // c does not decompose 771 } else if(isHangul(norm16)) { 772 // Hangul syllable: decompose algorithmically 773 StringBuilder buffer=new StringBuilder(); 774 Hangul.decompose(c, buffer); 775 return buffer.toString(); 776 } else if(isDecompNoAlgorithmic(norm16)) { 777 decomp=c=mapAlgorithmic(c, norm16); 778 continue; 779 } else { 780 // c decomposes, get everything from the variable-length extra data 781 int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; 782 return extraData.substring(norm16, norm16+length); 783 } 784 if(decomp<0) { 785 return null; 786 } else { 787 return UTF16.valueOf(decomp); 788 } 789 } 790 } 791 792 /** 793 * Gets the raw decomposition for one code point. 794 * @param c code point 795 * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition 796 */ 797 public String getRawDecomposition(int c) { 798 // We do not loop in this method because an algorithmic mapping itself 799 // becomes a final result rather than having to be decomposed recursively. 800 int norm16; 801 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 802 // c does not decompose 803 return null; 804 } else if(isHangul(norm16)) { 805 // Hangul syllable: decompose algorithmically 806 StringBuilder buffer=new StringBuilder(); 807 Hangul.getRawDecomposition(c, buffer); 808 return buffer.toString(); 809 } else if(isDecompNoAlgorithmic(norm16)) { 810 return UTF16.valueOf(mapAlgorithmic(c, norm16)); 811 } else { 812 // c decomposes, get everything from the variable-length extra data 813 int firstUnit=extraData.charAt(norm16); 814 int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 815 if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { 816 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 817 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 818 int rawMapping=norm16-((firstUnit>>7)&1)-1; 819 char rm0=extraData.charAt(rawMapping); 820 if(rm0<=MAPPING_LENGTH_MASK) { 821 return extraData.substring(rawMapping-rm0, rawMapping); 822 } else { 823 // Copy the normal mapping and replace its first two code units with rm0. 824 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); 825 norm16+=1+2; // skip over the firstUnit and the first two mapping code units 826 return buffer.append(extraData, norm16, norm16+mLength-2).toString(); 827 } 828 } else { 829 norm16+=1; // skip over the firstUnit 830 return extraData.substring(norm16, norm16+mLength); 831 } 832 } 833 } 834 835 /** 836 * Returns true if code point c starts a canonical-iterator string segment. 837 * <b>{@link #ensureCanonIterData()} must have been called before this method, 838 * or else this method will crash.</b> 839 * @param c A Unicode code point. 840 * @return true if c starts a canonical-iterator string segment. 841 */ 842 public boolean isCanonSegmentStarter(int c) { 843 return canonIterData.get(c)>=0; 844 } 845 /** 846 * Returns true if there are characters whose decomposition starts with c. 847 * If so, then the set is cleared and then filled with those characters. 848 * <b>{@link #ensureCanonIterData()} must have been called before this method, 849 * or else this method will crash.</b> 850 * @param c A Unicode code point. 851 * @param set A UnicodeSet to receive the characters whose decompositions 852 * start with c, if there are any. 853 * @return true if there are characters whose decomposition starts with c. 854 */ 855 public boolean getCanonStartSet(int c, UnicodeSet set) { 856 int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; 857 if(canonValue==0) { 858 return false; 859 } 860 set.clear(); 861 int value=canonValue&CANON_VALUE_MASK; 862 if((canonValue&CANON_HAS_SET)!=0) { 863 set.addAll(canonStartSets.get(value)); 864 } else if(value!=0) { 865 set.add(value); 866 } 867 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 868 int norm16=getNorm16(c); 869 if(norm16==JAMO_L) { 870 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; 871 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); 872 } else { 873 addComposites(getCompositionsList(norm16), set); 874 } 875 } 876 return true; 877 } 878 879 public static final int MIN_CCC_LCCC_CP=0x300; 880 881 public static final int MIN_YES_YES_WITH_CC=0xff01; 882 public static final int JAMO_VT=0xff00; 883 public static final int MIN_NORMAL_MAYBE_YES=0xfe00; 884 public static final int JAMO_L=1; 885 public static final int MAX_DELTA=0x40; 886 887 // Byte offsets from the start of the data, after the generic header. 888 public static final int IX_NORM_TRIE_OFFSET=0; 889 public static final int IX_EXTRA_DATA_OFFSET=1; 890 public static final int IX_SMALL_FCD_OFFSET=2; 891 public static final int IX_RESERVED3_OFFSET=3; 892 public static final int IX_TOTAL_SIZE=7; 893 894 // Code point thresholds for quick check codes. 895 public static final int IX_MIN_DECOMP_NO_CP=8; 896 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 897 898 // Norm16 value thresholds for quick check combinations and types of extra data. 899 // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 900 public static final int IX_MIN_YES_NO=10; 901 public static final int IX_MIN_NO_NO=11; 902 public static final int IX_LIMIT_NO_NO=12; 903 public static final int IX_MIN_MAYBE_YES=13; 904 905 // Mappings only in [minYesNoMappingsOnly..minNoNo[. 906 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 907 908 public static final int IX_COUNT=16; 909 910 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 911 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 912 public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20; 913 public static final int MAPPING_LENGTH_MASK=0x1f; 914 915 public static final int COMP_1_LAST_TUPLE=0x8000; 916 public static final int COMP_1_TRIPLE=1; 917 public static final int COMP_1_TRAIL_LIMIT=0x3400; 918 public static final int COMP_1_TRAIL_MASK=0x7ffe; 919 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 920 public static final int COMP_2_TRAIL_SHIFT=6; 921 public static final int COMP_2_TRAIL_MASK=0xffc0; 922 923 // higher-level functionality ------------------------------------------ *** 924 925 // NFD without an NFD Normalizer2 instance. 926 public Appendable decompose(CharSequence s, StringBuilder dest) { 927 decompose(s, 0, s.length(), dest, s.length()); 928 return dest; 929 } 930 /** 931 * Decomposes s[src, limit[ and writes the result to dest. 932 * limit can be NULL if src is NUL-terminated. 933 * destLengthEstimate is the initial dest buffer capacity and can be -1. 934 */ 935 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 936 int destLengthEstimate) { 937 if(destLengthEstimate<0) { 938 destLengthEstimate=limit-src; 939 } 940 dest.setLength(0); 941 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 942 decompose(s, src, limit, buffer); 943 } 944 945 // Dual functionality: 946 // buffer!=NULL: normalize 947 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 948 public int decompose(CharSequence s, int src, int limit, 949 ReorderingBuffer buffer) { 950 int minNoCP=minDecompNoCP; 951 952 int prevSrc; 953 int c=0; 954 int norm16=0; 955 956 // only for quick check 957 int prevBoundary=src; 958 int prevCC=0; 959 960 for(;;) { 961 // count code units below the minimum or with irrelevant data for the quick check 962 for(prevSrc=src; src!=limit;) { 963 if( (c=s.charAt(src))<minNoCP || 964 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 965 ) { 966 ++src; 967 } else if(!UTF16.isSurrogate((char)c)) { 968 break; 969 } else { 970 char c2; 971 if(UTF16Plus.isSurrogateLead(c)) { 972 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 973 c=Character.toCodePoint((char)c, c2); 974 } 975 } else /* trail surrogate */ { 976 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 977 --src; 978 c=Character.toCodePoint(c2, (char)c); 979 } 980 } 981 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 982 src+=Character.charCount(c); 983 } else { 984 break; 985 } 986 } 987 } 988 // copy these code units all at once 989 if(src!=prevSrc) { 990 if(buffer!=null) { 991 buffer.flushAndAppendZeroCC(s, prevSrc, src); 992 } else { 993 prevCC=0; 994 prevBoundary=src; 995 } 996 } 997 if(src==limit) { 998 break; 999 } 1000 1001 // Check one above-minimum, relevant code point. 1002 src+=Character.charCount(c); 1003 if(buffer!=null) { 1004 decompose(c, norm16, buffer); 1005 } else { 1006 if(isDecompYes(norm16)) { 1007 int cc=getCCFromYesOrMaybe(norm16); 1008 if(prevCC<=cc || cc==0) { 1009 prevCC=cc; 1010 if(cc<=1) { 1011 prevBoundary=src; 1012 } 1013 continue; 1014 } 1015 } 1016 return prevBoundary; // "no" or cc out of order 1017 } 1018 } 1019 return src; 1020 } 1021 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 1022 int limit=s.length(); 1023 if(limit==0) { 1024 return; 1025 } 1026 if(doDecompose) { 1027 decompose(s, 0, limit, buffer); 1028 return; 1029 } 1030 // Just merge the strings at the boundary. 1031 int c=Character.codePointAt(s, 0); 1032 int src=0; 1033 int firstCC, prevCC, cc; 1034 firstCC=prevCC=cc=getCC(getNorm16(c)); 1035 while(cc!=0) { 1036 prevCC=cc; 1037 src+=Character.charCount(c); 1038 if(src>=limit) { 1039 break; 1040 } 1041 c=Character.codePointAt(s, src); 1042 cc=getCC(getNorm16(c)); 1043 }; 1044 buffer.append(s, 0, src, firstCC, prevCC); 1045 buffer.append(s, src, limit); 1046 } 1047 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 1048 // doCompose: normalize 1049 // !doCompose: isNormalized (buffer must be empty and initialized) 1050 public boolean compose(CharSequence s, int src, int limit, 1051 boolean onlyContiguous, 1052 boolean doCompose, 1053 ReorderingBuffer buffer) { 1054 int minNoMaybeCP=minCompNoMaybeCP; 1055 1056 /* 1057 * prevBoundary points to the last character before the current one 1058 * that has a composition boundary before it with ccc==0 and quick check "yes". 1059 * Keeping track of prevBoundary saves us looking for a composition boundary 1060 * when we find a "no" or "maybe". 1061 * 1062 * When we back out from prevSrc back to prevBoundary, 1063 * then we also remove those same characters (which had been simply copied 1064 * or canonically-order-inserted) from the ReorderingBuffer. 1065 * Therefore, at all times, the [prevBoundary..prevSrc[ source units 1066 * must correspond 1:1 to destination units at the end of the destination buffer. 1067 */ 1068 int prevBoundary=src; 1069 int prevSrc; 1070 int c=0; 1071 int norm16=0; 1072 1073 // only for isNormalized 1074 int prevCC=0; 1075 1076 for(;;) { 1077 // count code units below the minimum or with irrelevant data for the quick check 1078 for(prevSrc=src; src!=limit;) { 1079 if( (c=s.charAt(src))<minNoMaybeCP || 1080 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1081 ) { 1082 ++src; 1083 } else if(!UTF16.isSurrogate((char)c)) { 1084 break; 1085 } else { 1086 char c2; 1087 if(UTF16Plus.isSurrogateLead(c)) { 1088 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1089 c=Character.toCodePoint((char)c, c2); 1090 } 1091 } else /* trail surrogate */ { 1092 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1093 --src; 1094 c=Character.toCodePoint(c2, (char)c); 1095 } 1096 } 1097 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1098 src+=Character.charCount(c); 1099 } else { 1100 break; 1101 } 1102 } 1103 } 1104 // copy these code units all at once 1105 if(src!=prevSrc) { 1106 if(src==limit) { 1107 if(doCompose) { 1108 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1109 } 1110 break; 1111 } 1112 // Set prevBoundary to the last character in the quick check loop. 1113 prevBoundary=src-1; 1114 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 1115 Character.isHighSurrogate(s.charAt(prevBoundary-1)) 1116 ) { 1117 --prevBoundary; 1118 } 1119 if(doCompose) { 1120 // The last "quick check yes" character is excluded from the 1121 // flush-and-append call in case it needs to be modified. 1122 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1123 buffer.append(s, prevBoundary, src); 1124 } else { 1125 prevCC=0; 1126 } 1127 // The start of the current character (c). 1128 prevSrc=src; 1129 } else if(src==limit) { 1130 break; 1131 } 1132 1133 src+=Character.charCount(c); 1134 /* 1135 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1136 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1137 * or has ccc!=0. 1138 * Check for Jamo V/T, then for regular characters. 1139 * c is not a Hangul syllable or Jamo L because those have "yes" properties. 1140 */ 1141 if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 1142 char prev=s.charAt(prevSrc-1); 1143 boolean needToDecompose=false; 1144 if(c<Hangul.JAMO_T_BASE) { 1145 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1146 prev-=Hangul.JAMO_L_BASE; 1147 if(prev<Hangul.JAMO_L_COUNT) { 1148 if(!doCompose) { 1149 return false; 1150 } 1151 char syllable=(char) 1152 (Hangul.HANGUL_BASE+ 1153 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1154 Hangul.JAMO_T_COUNT); 1155 char t; 1156 if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1157 ++src; 1158 syllable+=t; // The next character was a Jamo T. 1159 prevBoundary=src; 1160 buffer.setLastChar(syllable); 1161 continue; 1162 } 1163 // If we see L+V+x where x!=T then we drop to the slow path, 1164 // decompose and recompose. 1165 // This is to deal with NFKC finding normal L and V but a 1166 // compatibility variant of a T. We need to either fully compose that 1167 // combination here (which would complicate the code and may not work 1168 // with strange custom data) or use the slow path -- or else our replacing 1169 // two input characters (L+V) with one output character (LV syllable) 1170 // would violate the invariant that [prevBoundary..prevSrc[ has the same 1171 // length as what we appended to the buffer since prevBoundary. 1172 needToDecompose=true; 1173 } 1174 } else if(Hangul.isHangulWithoutJamoT(prev)) { 1175 // c is a Jamo Trailing consonant, 1176 // compose with previous Hangul LV that does not contain a Jamo T. 1177 if(!doCompose) { 1178 return false; 1179 } 1180 buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); 1181 prevBoundary=src; 1182 continue; 1183 } 1184 if(!needToDecompose) { 1185 // The Jamo V/T did not compose into a Hangul syllable. 1186 if(doCompose) { 1187 buffer.append((char)c); 1188 } else { 1189 prevCC=0; 1190 } 1191 continue; 1192 } 1193 } 1194 /* 1195 * Source buffer pointers: 1196 * 1197 * all done quick check current char not yet 1198 * "yes" but (c) processed 1199 * may combine 1200 * forward 1201 * [-------------[-------------[-------------[-------------[ 1202 * | | | | | 1203 * orig. src prevBoundary prevSrc src limit 1204 * 1205 * 1206 * Destination buffer pointers inside the ReorderingBuffer: 1207 * 1208 * all done might take not filled yet 1209 * characters for 1210 * reordering 1211 * [-------------[-------------[-------------[ 1212 * | | | | 1213 * start reorderStart limit | 1214 * +remainingCap.+ 1215 */ 1216 if(norm16>=MIN_YES_YES_WITH_CC) { 1217 int cc=norm16&0xff; // cc!=0 1218 if( onlyContiguous && // FCC 1219 (doCompose ? buffer.getLastCC() : prevCC)==0 && 1220 prevBoundary<prevSrc && 1221 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 1222 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1223 // passed the quick check "yes && ccc==0" test. 1224 // Check whether the last character was a "yesYes" or a "yesNo". 1225 // If a "yesNo", then we get its trailing ccc from its 1226 // mapping and check for canonical order. 1227 // All other cases are ok. 1228 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 1229 ) { 1230 // Fails FCD test, need to decompose and contiguously recompose. 1231 if(!doCompose) { 1232 return false; 1233 } 1234 } else if(doCompose) { 1235 buffer.append(c, cc); 1236 continue; 1237 } else if(prevCC<=cc) { 1238 prevCC=cc; 1239 continue; 1240 } else { 1241 return false; 1242 } 1243 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 1244 return false; 1245 } 1246 1247 /* 1248 * Find appropriate boundaries around this character, 1249 * decompose the source text from between the boundaries, 1250 * and recompose it. 1251 * 1252 * We may need to remove the last few characters from the ReorderingBuffer 1253 * to account for source text that was copied or appended 1254 * but needs to take part in the recomposition. 1255 */ 1256 1257 /* 1258 * Find the last composition boundary in [prevBoundary..src[. 1259 * It is either the decomposition of the current character (at prevSrc), 1260 * or prevBoundary. 1261 */ 1262 if(hasCompBoundaryBefore(c, norm16)) { 1263 prevBoundary=prevSrc; 1264 } else if(doCompose) { 1265 buffer.removeSuffix(prevSrc-prevBoundary); 1266 } 1267 1268 // Find the next composition boundary in [src..limit[ - 1269 // modifies src to point to the next starter. 1270 src=findNextCompBoundary(s, src, limit); 1271 1272 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 1273 int recomposeStartIndex=buffer.length(); 1274 decomposeShort(s, prevBoundary, src, buffer); 1275 recompose(buffer, recomposeStartIndex, onlyContiguous); 1276 if(!doCompose) { 1277 if(!buffer.equals(s, prevBoundary, src)) { 1278 return false; 1279 } 1280 buffer.remove(); 1281 prevCC=0; 1282 } 1283 1284 // Move to the next starter. We never need to look back before this point again. 1285 prevBoundary=src; 1286 } 1287 return true; 1288 } 1289 /** 1290 * Very similar to compose(): Make the same changes in both places if relevant. 1291 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 1292 * !doSpan: quickCheck 1293 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 1294 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 1295 * then the quick check result is "no" 1296 */ 1297 public int composeQuickCheck(CharSequence s, int src, int limit, 1298 boolean onlyContiguous, boolean doSpan) { 1299 int qcResult=0; 1300 int minNoMaybeCP=minCompNoMaybeCP; 1301 1302 /* 1303 * prevBoundary points to the last character before the current one 1304 * that has a composition boundary before it with ccc==0 and quick check "yes". 1305 */ 1306 int prevBoundary=src; 1307 int prevSrc; 1308 int c=0; 1309 int norm16=0; 1310 int prevCC=0; 1311 1312 for(;;) { 1313 // count code units below the minimum or with irrelevant data for the quick check 1314 for(prevSrc=src;;) { 1315 if(src==limit) { 1316 return (src<<1)|qcResult; // "yes" or "maybe" 1317 } 1318 if( (c=s.charAt(src))<minNoMaybeCP || 1319 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1320 ) { 1321 ++src; 1322 } else if(!UTF16.isSurrogate((char)c)) { 1323 break; 1324 } else { 1325 char c2; 1326 if(UTF16Plus.isSurrogateLead(c)) { 1327 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1328 c=Character.toCodePoint((char)c, c2); 1329 } 1330 } else /* trail surrogate */ { 1331 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1332 --src; 1333 c=Character.toCodePoint(c2, (char)c); 1334 } 1335 } 1336 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1337 src+=Character.charCount(c); 1338 } else { 1339 break; 1340 } 1341 } 1342 } 1343 if(src!=prevSrc) { 1344 // Set prevBoundary to the last character in the quick check loop. 1345 prevBoundary=src-1; 1346 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 1347 Character.isHighSurrogate(s.charAt(prevBoundary-1)) 1348 ) { 1349 --prevBoundary; 1350 } 1351 prevCC=0; 1352 // The start of the current character (c). 1353 prevSrc=src; 1354 } 1355 1356 src+=Character.charCount(c); 1357 /* 1358 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1359 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1360 * or has ccc!=0. 1361 */ 1362 if(isMaybeOrNonZeroCC(norm16)) { 1363 int cc=getCCFromYesOrMaybe(norm16); 1364 if( onlyContiguous && // FCC 1365 cc!=0 && 1366 prevCC==0 && 1367 prevBoundary<prevSrc && 1368 // prevCC==0 && prevBoundary<prevSrc tell us that 1369 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1370 // passed the quick check "yes && ccc==0" test. 1371 // Check whether the last character was a "yesYes" or a "yesNo". 1372 // If a "yesNo", then we get its trailing ccc from its 1373 // mapping and check for canonical order. 1374 // All other cases are ok. 1375 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 1376 ) { 1377 // Fails FCD test. 1378 } else if(prevCC<=cc || cc==0) { 1379 prevCC=cc; 1380 if(norm16<MIN_YES_YES_WITH_CC) { 1381 if(!doSpan) { 1382 qcResult=1; 1383 } else { 1384 return prevBoundary<<1; // spanYes does not care to know it's "maybe" 1385 } 1386 } 1387 continue; 1388 } 1389 } 1390 return prevBoundary<<1; // "no" 1391 } 1392 } 1393 public void composeAndAppend(CharSequence s, 1394 boolean doCompose, 1395 boolean onlyContiguous, 1396 ReorderingBuffer buffer) { 1397 int src=0, limit=s.length(); 1398 if(!buffer.isEmpty()) { 1399 int firstStarterInSrc=findNextCompBoundary(s, 0, limit); 1400 if(0!=firstStarterInSrc) { 1401 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1402 buffer.length()); 1403 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1404 firstStarterInSrc+16); 1405 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1406 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1407 middle.append(s, 0, firstStarterInSrc); 1408 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1409 src=firstStarterInSrc; 1410 } 1411 } 1412 if(doCompose) { 1413 compose(s, src, limit, onlyContiguous, true, buffer); 1414 } else { 1415 buffer.append(s, src, limit); 1416 } 1417 } 1418 // Dual functionality: 1419 // buffer!=NULL: normalize 1420 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1421 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1422 // Note: In this function we use buffer->appendZeroCC() because we track 1423 // the lead and trail combining classes here, rather than leaving it to 1424 // the ReorderingBuffer. 1425 // The exception is the call to decomposeShort() which uses the buffer 1426 // in the normal way. 1427 1428 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1429 // Similar to the prevBoundary in the compose() implementation. 1430 int prevBoundary=src; 1431 int prevSrc; 1432 int c=0; 1433 int prevFCD16=0; 1434 int fcd16=0; 1435 1436 for(;;) { 1437 // count code units with lccc==0 1438 for(prevSrc=src; src!=limit;) { 1439 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { 1440 prevFCD16=~c; 1441 ++src; 1442 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1443 prevFCD16=0; 1444 ++src; 1445 } else { 1446 if(UTF16.isSurrogate((char)c)) { 1447 char c2; 1448 if(UTF16Plus.isSurrogateLead(c)) { 1449 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1450 c=Character.toCodePoint((char)c, c2); 1451 } 1452 } else /* trail surrogate */ { 1453 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1454 --src; 1455 c=Character.toCodePoint(c2, (char)c); 1456 } 1457 } 1458 } 1459 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1460 prevFCD16=fcd16; 1461 src+=Character.charCount(c); 1462 } else { 1463 break; 1464 } 1465 } 1466 } 1467 // copy these code units all at once 1468 if(src!=prevSrc) { 1469 if(src==limit) { 1470 if(buffer!=null) { 1471 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1472 } 1473 break; 1474 } 1475 prevBoundary=src; 1476 // We know that the previous character's lccc==0. 1477 if(prevFCD16<0) { 1478 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1479 int prev=~prevFCD16; 1480 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); 1481 if(prevFCD16>1) { 1482 --prevBoundary; 1483 } 1484 } else { 1485 int p=src-1; 1486 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1487 Character.isHighSurrogate(s.charAt(p-1)) 1488 ) { 1489 --p; 1490 // Need to fetch the previous character's FCD value because 1491 // prevFCD16 was just for the trail surrogate code point. 1492 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1493 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1494 } 1495 if(prevFCD16>1) { 1496 prevBoundary=p; 1497 } 1498 } 1499 if(buffer!=null) { 1500 // The last lccc==0 character is excluded from the 1501 // flush-and-append call in case it needs to be modified. 1502 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1503 buffer.append(s, prevBoundary, src); 1504 } 1505 // The start of the current character (c). 1506 prevSrc=src; 1507 } else if(src==limit) { 1508 break; 1509 } 1510 1511 src+=Character.charCount(c); 1512 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1513 // Check for proper order, and decompose locally if necessary. 1514 if((prevFCD16&0xff)<=(fcd16>>8)) { 1515 // proper order: prev tccc <= current lccc 1516 if((fcd16&0xff)<=1) { 1517 prevBoundary=src; 1518 } 1519 if(buffer!=null) { 1520 buffer.appendZeroCC(c); 1521 } 1522 prevFCD16=fcd16; 1523 continue; 1524 } else if(buffer==null) { 1525 return prevBoundary; // quick check "no" 1526 } else { 1527 /* 1528 * Back out the part of the source that we copied or appended 1529 * already but is now going to be decomposed. 1530 * prevSrc is set to after what was copied/appended. 1531 */ 1532 buffer.removeSuffix(prevSrc-prevBoundary); 1533 /* 1534 * Find the part of the source that needs to be decomposed, 1535 * up to the next safe boundary. 1536 */ 1537 src=findNextFCDBoundary(s, src, limit); 1538 /* 1539 * The source text does not fulfill the conditions for FCD. 1540 * Decompose and reorder a limited piece of the text. 1541 */ 1542 decomposeShort(s, prevBoundary, src, buffer); 1543 prevBoundary=src; 1544 prevFCD16=0; 1545 } 1546 } 1547 return src; 1548 } 1549 public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { 1550 int src=0, limit=s.length(); 1551 if(!buffer.isEmpty()) { 1552 int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); 1553 if(0!=firstBoundaryInSrc) { 1554 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), 1555 buffer.length()); 1556 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ 1557 firstBoundaryInSrc+16); 1558 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); 1559 buffer.removeSuffix(buffer.length()-lastBoundaryInDest); 1560 middle.append(s, 0, firstBoundaryInSrc); 1561 makeFCD(middle, 0, middle.length(), buffer); 1562 src=firstBoundaryInSrc; 1563 } 1564 } 1565 if(doMakeFCD) { 1566 makeFCD(s, src, limit, buffer); 1567 } else { 1568 buffer.append(s, src, limit); 1569 } 1570 } 1571 1572 // Note: hasDecompBoundary() could be implemented as aliases to 1573 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 1574 // at the cost of building the FCD trie for a decomposition normalizer. 1575 public boolean hasDecompBoundary(int c, boolean before) { 1576 for(;;) { 1577 if(c<minDecompNoCP) { 1578 return true; 1579 } 1580 int norm16=getNorm16(c); 1581 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 1582 return true; 1583 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 1584 return false; // ccc!=0 1585 } else if(isDecompNoAlgorithmic(norm16)) { 1586 c=mapAlgorithmic(c, norm16); 1587 } else { 1588 // c decomposes, get everything from the variable-length extra data 1589 int firstUnit=extraData.charAt(norm16); 1590 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1591 return false; 1592 } 1593 if(!before) { 1594 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1595 // fcd16<=1 || trailCC==0 1596 if(firstUnit>0x1ff) { 1597 return false; // trailCC>1 1598 } 1599 if(firstUnit<=0xff) { 1600 return true; // trailCC==0 1601 } 1602 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1603 } 1604 // true if leadCC==0 (hasFCDBoundaryBefore()) 1605 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; 1606 } 1607 } 1608 } 1609 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1610 1611 public boolean hasCompBoundaryBefore(int c) { 1612 return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 1613 } 1614 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) { 1615 for(;;) { 1616 int norm16=getNorm16(c); 1617 if(isInert(norm16)) { 1618 return true; 1619 } else if(norm16<=minYesNo) { 1620 // Hangul: norm16==minYesNo 1621 // Hangul LVT has a boundary after it. 1622 // Hangul LV and non-inert yesYes characters combine forward. 1623 return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c); 1624 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 1625 return false; 1626 } else if(isDecompNoAlgorithmic(norm16)) { 1627 c=mapAlgorithmic(c, norm16); 1628 } else { 1629 // c decomposes, get everything from the variable-length extra data. 1630 // If testInert, then c must be a yesNo character which has lccc=0, 1631 // otherwise it could be a noNo. 1632 int firstUnit=extraData.charAt(norm16); 1633 // true if 1634 // not MAPPING_NO_COMP_BOUNDARY_AFTER 1635 // (which is set if 1636 // c is not deleted, and 1637 // it and its decomposition do not combine forward, and it has a starter) 1638 // and if FCC then trailCC<=1 1639 return 1640 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && 1641 (!onlyContiguous || firstUnit<=0x1ff); 1642 } 1643 } 1644 } 1645 1646 public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; } 1647 public boolean hasFCDBoundaryAfter(int c) { 1648 int fcd16=getFCD16(c); 1649 return fcd16<=1 || (fcd16&0xff)==0; 1650 } 1651 public boolean isFCDInert(int c) { return getFCD16(c)<=1; } 1652 1653 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1654 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1655 private static boolean isInert(int norm16) { return norm16==0; } 1656 private static boolean isJamoL(int norm16) { return norm16==1; } 1657 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1658 private boolean isHangul(int norm16) { return norm16==minYesNo; } 1659 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1660 // UBool isCompYes(uint16_t norm16) const { 1661 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1662 // } 1663 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1664 // return norm16<minNoNo || minMaybeYes<=norm16; 1665 // } 1666 // private boolean hasZeroCCFromDecompYes(int norm16) { 1667 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1668 // } 1669 private boolean isDecompYesAndZeroCC(int norm16) { 1670 return norm16<minYesNo || 1671 norm16==JAMO_VT || 1672 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1673 } 1674 /** 1675 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1676 * the MaybeYes which combine-forward and have ccc=0. 1677 * (Standard Unicode 5.2 normalization does not have such characters.) 1678 */ 1679 private boolean isMostDecompYesAndZeroCC(int norm16) { 1680 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1681 } 1682 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1683 1684 // For use with isCompYes(). 1685 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1686 // static uint8_t getCCFromYes(uint16_t norm16) { 1687 // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 1688 // } 1689 private int getCCFromNoNo(int norm16) { 1690 if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1691 return extraData.charAt(norm16-1)&0xff; 1692 } else { 1693 return 0; 1694 } 1695 } 1696 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 1697 int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { 1698 int c; 1699 if(cpStart==(cpLimit-1)) { 1700 c=s.charAt(cpStart); 1701 } else { 1702 c=Character.codePointAt(s, cpStart); 1703 } 1704 int prevNorm16=getNorm16(c); 1705 if(prevNorm16<=minYesNo) { 1706 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 1707 } else { 1708 return extraData.charAt(prevNorm16)>>8; // tccc from yesNo 1709 } 1710 } 1711 1712 // Requires algorithmic-NoNo. 1713 private int mapAlgorithmic(int c, int norm16) { 1714 return c+norm16-(minMaybeYes-MAX_DELTA-1); 1715 } 1716 1717 // Requires minYesNo<norm16<limitNoNo. 1718 // private int getMapping(int norm16) { return /*extraData+*/norm16; } 1719 1720 /** 1721 * @return index into maybeYesCompositions, or -1 1722 */ 1723 private int getCompositionsListForDecompYes(int norm16) { 1724 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 1725 return -1; 1726 } else { 1727 if((norm16-=minMaybeYes)<0) { 1728 // norm16<minMaybeYes: index into extraData which is a substring at 1729 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1730 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1731 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1732 } 1733 return norm16; 1734 } 1735 } 1736 /** 1737 * @return index into maybeYesCompositions 1738 */ 1739 private int getCompositionsListForComposite(int norm16) { 1740 // composite has both mapping & compositions list 1741 int firstUnit=extraData.charAt(norm16); 1742 return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions 1743 1+ // +1 to skip the first unit with the mapping lenth 1744 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1745 } 1746 /** 1747 * @param c code point must have compositions 1748 * @return index into maybeYesCompositions 1749 */ 1750 private int getCompositionsList(int norm16) { 1751 return isDecompYes(norm16) ? 1752 getCompositionsListForDecompYes(norm16) : 1753 getCompositionsListForComposite(norm16); 1754 } 1755 1756 // Decompose a short piece of text which is likely to contain characters that 1757 // fail the quick check loop and/or where the quick check loop's overhead 1758 // is unlikely to be amortized. 1759 // Called by the compose() and makeFCD() implementations. 1760 // Public in Java for collation implementation code. 1761 public void decomposeShort(CharSequence s, int src, int limit, 1762 ReorderingBuffer buffer) { 1763 while(src<limit) { 1764 int c=Character.codePointAt(s, src); 1765 src+=Character.charCount(c); 1766 decompose(c, getNorm16(c), buffer); 1767 } 1768 } 1769 private void decompose(int c, int norm16, 1770 ReorderingBuffer buffer) { 1771 // Only loops for 1:1 algorithmic mappings. 1772 for(;;) { 1773 // get the decomposition and the lead and trail cc's 1774 if(isDecompYes(norm16)) { 1775 // c does not decompose 1776 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1777 } else if(isHangul(norm16)) { 1778 // Hangul syllable: decompose algorithmically 1779 Hangul.decompose(c, buffer); 1780 } else if(isDecompNoAlgorithmic(norm16)) { 1781 c=mapAlgorithmic(c, norm16); 1782 norm16=getNorm16(c); 1783 continue; 1784 } else { 1785 // c decomposes, get everything from the variable-length extra data 1786 int firstUnit=extraData.charAt(norm16); 1787 int length=firstUnit&MAPPING_LENGTH_MASK; 1788 int leadCC, trailCC; 1789 trailCC=firstUnit>>8; 1790 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1791 leadCC=extraData.charAt(norm16-1)>>8; 1792 } else { 1793 leadCC=0; 1794 } 1795 ++norm16; // skip over the firstUnit 1796 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); 1797 } 1798 return; 1799 } 1800 } 1801 1802 /** 1803 * Finds the recomposition result for 1804 * a forward-combining "lead" character, 1805 * specified with a pointer to its compositions list, 1806 * and a backward-combining "trail" character. 1807 * 1808 * <p>If the lead and trail characters combine, then this function returns 1809 * the following "compositeAndFwd" value: 1810 * <pre> 1811 * Bits 21..1 composite character 1812 * Bit 0 set if the composite is a forward-combining starter 1813 * </pre> 1814 * otherwise it returns -1. 1815 * 1816 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1817 * encoded as either pairs or triples of 16-bit units. 1818 * The last entry has the high bit of its first unit set. 1819 * 1820 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1821 * A linear search is used. 1822 * 1823 * <p>See normalizer2impl.h for a more detailed description 1824 * of the compositions list format. 1825 */ 1826 private static int combine(String compositions, int list, int trail) { 1827 int key1, firstUnit; 1828 if(trail<COMP_1_TRAIL_LIMIT) { 1829 // trail character is 0..33FF 1830 // result entry may have 2 or 3 units 1831 key1=(trail<<1); 1832 while(key1>(firstUnit=compositions.charAt(list))) { 1833 list+=2+(firstUnit&COMP_1_TRIPLE); 1834 } 1835 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1836 if((firstUnit&COMP_1_TRIPLE)!=0) { 1837 return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1838 } else { 1839 return compositions.charAt(list+1); 1840 } 1841 } 1842 } else { 1843 // trail character is 3400..10FFFF 1844 // result entry has 3 units 1845 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1846 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1847 int secondUnit; 1848 for(;;) { 1849 if(key1>(firstUnit=compositions.charAt(list))) { 1850 list+=2+(firstUnit&COMP_1_TRIPLE); 1851 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1852 if(key2>(secondUnit=compositions.charAt(list+1))) { 1853 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1854 break; 1855 } else { 1856 list+=3; 1857 } 1858 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1859 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1860 } else { 1861 break; 1862 } 1863 } else { 1864 break; 1865 } 1866 } 1867 } 1868 return -1; 1869 } 1870 /** 1871 * @param list some character's compositions list 1872 * @param set recursively receives the composites from these compositions 1873 */ 1874 private void addComposites(int list, UnicodeSet set) { 1875 int firstUnit, compositeAndFwd; 1876 do { 1877 firstUnit=maybeYesCompositions.charAt(list); 1878 if((firstUnit&COMP_1_TRIPLE)==0) { 1879 compositeAndFwd=maybeYesCompositions.charAt(list+1); 1880 list+=2; 1881 } else { 1882 compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| 1883 maybeYesCompositions.charAt(list+2); 1884 list+=3; 1885 } 1886 int composite=compositeAndFwd>>1; 1887 if((compositeAndFwd&1)!=0) { 1888 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 1889 } 1890 set.add(composite); 1891 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 1892 } 1893 /* 1894 * Recomposes the buffer text starting at recomposeStartIndex 1895 * (which is in NFD - decomposed and canonically ordered), 1896 * and truncates the buffer contents. 1897 * 1898 * Note that recomposition never lengthens the text: 1899 * Any character consists of either one or two code units; 1900 * a composition may contain at most one more code unit than the original starter, 1901 * while the combining mark that is removed has at least one code unit. 1902 */ 1903 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1904 boolean onlyContiguous) { 1905 StringBuilder sb=buffer.getStringBuilder(); 1906 int p=recomposeStartIndex; 1907 if(p==sb.length()) { 1908 return; 1909 } 1910 1911 int starter, pRemove; 1912 int compositionsList; 1913 int c, compositeAndFwd; 1914 int norm16; 1915 int cc, prevCC; 1916 boolean starterIsSupplementary; 1917 1918 // Some of the following variables are not used until we have a forward-combining starter 1919 // and are only initialized now to avoid compiler warnings. 1920 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1921 starter=-1; 1922 starterIsSupplementary=false; 1923 prevCC=0; 1924 1925 for(;;) { 1926 c=sb.codePointAt(p); 1927 p+=Character.charCount(c); 1928 norm16=getNorm16(c); 1929 cc=getCCFromYesOrMaybe(norm16); 1930 if( // this character combines backward and 1931 isMaybe(norm16) && 1932 // we have seen a starter that combines forward and 1933 compositionsList>=0 && 1934 // the backward-combining character is not blocked 1935 (prevCC<cc || prevCC==0) 1936 ) { 1937 if(isJamoVT(norm16)) { 1938 // c is a Jamo V/T, see if we can compose it with the previous character. 1939 if(c<Hangul.JAMO_T_BASE) { 1940 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1941 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1942 if(prev<Hangul.JAMO_L_COUNT) { 1943 pRemove=p-1; 1944 char syllable=(char) 1945 (Hangul.HANGUL_BASE+ 1946 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1947 Hangul.JAMO_T_COUNT); 1948 char t; 1949 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1950 ++p; 1951 syllable+=t; // The next character was a Jamo T. 1952 } 1953 sb.setCharAt(starter, syllable); 1954 // remove the Jamo V/T 1955 sb.delete(pRemove, p); 1956 p=pRemove; 1957 } 1958 } 1959 /* 1960 * No "else" for Jamo T: 1961 * Since the input is in NFD, there are no Hangul LV syllables that 1962 * a Jamo T could combine with. 1963 * All Jamo Ts are combined above when handling Jamo Vs. 1964 */ 1965 if(p==sb.length()) { 1966 break; 1967 } 1968 compositionsList=-1; 1969 continue; 1970 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1971 // The starter and the combining mark (c) do combine. 1972 int composite=compositeAndFwd>>1; 1973 1974 // Remove the combining mark. 1975 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1976 sb.delete(pRemove, p); 1977 p=pRemove; 1978 // Replace the starter with the composite. 1979 if(starterIsSupplementary) { 1980 if(composite>0xffff) { 1981 // both are supplementary 1982 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1983 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1984 } else { 1985 sb.setCharAt(starter, (char)c); 1986 sb.deleteCharAt(starter+1); 1987 // The composite is shorter than the starter, 1988 // move the intermediate characters forward one. 1989 starterIsSupplementary=false; 1990 --p; 1991 } 1992 } else if(composite>0xffff) { 1993 // The composite is longer than the starter, 1994 // move the intermediate characters back one. 1995 starterIsSupplementary=true; 1996 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1997 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 1998 ++p; 1999 } else { 2000 // both are on the BMP 2001 sb.setCharAt(starter, (char)composite); 2002 } 2003 2004 // Keep prevCC because we removed the combining mark. 2005 2006 if(p==sb.length()) { 2007 break; 2008 } 2009 // Is the composite a starter that combines forward? 2010 if((compositeAndFwd&1)!=0) { 2011 compositionsList= 2012 getCompositionsListForComposite(getNorm16(composite)); 2013 } else { 2014 compositionsList=-1; 2015 } 2016 2017 // We combined; continue with looking for compositions. 2018 continue; 2019 } 2020 } 2021 2022 // no combination this time 2023 prevCC=cc; 2024 if(p==sb.length()) { 2025 break; 2026 } 2027 2028 // If c did not combine, then check if it is a starter. 2029 if(cc==0) { 2030 // Found a new starter. 2031 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 2032 // It may combine with something, prepare for it. 2033 if(c<=0xffff) { 2034 starterIsSupplementary=false; 2035 starter=p-1; 2036 } else { 2037 starterIsSupplementary=true; 2038 starter=p-2; 2039 } 2040 } 2041 } else if(onlyContiguous) { 2042 // FCC: no discontiguous compositions; any intervening character blocks. 2043 compositionsList=-1; 2044 } 2045 } 2046 buffer.flush(); 2047 } 2048 2049 public int composePair(int a, int b) { 2050 int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 2051 int list; 2052 if(isInert(norm16)) { 2053 return -1; 2054 } else if(norm16<minYesNoMappingsOnly) { 2055 if(isJamoL(norm16)) { 2056 b-=Hangul.JAMO_V_BASE; 2057 if(0<=b && b<Hangul.JAMO_V_COUNT) { 2058 return 2059 (Hangul.HANGUL_BASE+ 2060 ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 2061 Hangul.JAMO_T_COUNT); 2062 } else { 2063 return -1; 2064 } 2065 } else if(isHangul(norm16)) { 2066 b-=Hangul.JAMO_T_BASE; 2067 if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 2068 return a+b; 2069 } else { 2070 return -1; 2071 } 2072 } else { 2073 // 'a' has a compositions list in extraData 2074 list=norm16; 2075 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 2076 list+= // mapping pointer 2077 1+ // +1 to skip the first unit with the mapping lenth 2078 (extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 2079 } 2080 // Turn the offset-into-extraData into an offset-into-maybeYesCompositions. 2081 list+=MIN_NORMAL_MAYBE_YES-minMaybeYes; 2082 } 2083 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 2084 return -1; 2085 } else { 2086 list=norm16-minMaybeYes; // offset into maybeYesCompositions 2087 } 2088 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 2089 return -1; 2090 } 2091 return combine(maybeYesCompositions, list, b)>>1; 2092 } 2093 2094 /** 2095 * Does c have a composition boundary before it? 2096 * True if its decomposition begins with a character that has 2097 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 2098 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 2099 * (isCompYesAndZeroCC()) so we need not decompose. 2100 */ 2101 private boolean hasCompBoundaryBefore(int c, int norm16) { 2102 for(;;) { 2103 if(isCompYesAndZeroCC(norm16)) { 2104 return true; 2105 } else if(isMaybeOrNonZeroCC(norm16)) { 2106 return false; 2107 } else if(isDecompNoAlgorithmic(norm16)) { 2108 c=mapAlgorithmic(c, norm16); 2109 norm16=getNorm16(c); 2110 } else { 2111 // c decomposes, get everything from the variable-length extra data 2112 int firstUnit=extraData.charAt(norm16); 2113 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 2114 return false; 2115 } 2116 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { 2117 return false; // non-zero leadCC 2118 } 2119 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); 2120 } 2121 } 2122 } 2123 private int findPreviousCompBoundary(CharSequence s, int p) { 2124 while(p>0) { 2125 int c=Character.codePointBefore(s, p); 2126 p-=Character.charCount(c); 2127 if(hasCompBoundaryBefore(c)) { 2128 break; 2129 } 2130 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 2131 // but that's probably not worth the extra cost. 2132 } 2133 return p; 2134 } 2135 private int findNextCompBoundary(CharSequence s, int p, int limit) { 2136 while(p<limit) { 2137 int c=Character.codePointAt(s, p); 2138 int norm16=normTrie.get(c); 2139 if(hasCompBoundaryBefore(c, norm16)) { 2140 break; 2141 } 2142 p+=Character.charCount(c); 2143 } 2144 return p; 2145 } 2146 2147 private int findPreviousFCDBoundary(CharSequence s, int p) { 2148 while(p>0) { 2149 int c=Character.codePointBefore(s, p); 2150 p-=Character.charCount(c); 2151 if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { 2152 break; 2153 } 2154 } 2155 return p; 2156 } 2157 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 2158 while(p<limit) { 2159 int c=Character.codePointAt(s, p); 2160 if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { 2161 break; 2162 } 2163 p+=Character.charCount(c); 2164 } 2165 return p; 2166 } 2167 2168 private void addToStartSet(Trie2Writable newData, int origin, int decompLead) { 2169 int canonValue=newData.get(decompLead); 2170 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 2171 // origin is the first character whose decomposition starts with 2172 // the character for which we are setting the value. 2173 newData.set(decompLead, canonValue|origin); 2174 } else { 2175 // origin is not the first character, or it is U+0000. 2176 UnicodeSet set; 2177 if((canonValue&CANON_HAS_SET)==0) { 2178 int firstOrigin=canonValue&CANON_VALUE_MASK; 2179 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size(); 2180 newData.set(decompLead, canonValue); 2181 canonStartSets.add(set=new UnicodeSet()); 2182 if(firstOrigin!=0) { 2183 set.add(firstOrigin); 2184 } 2185 } else { 2186 set=canonStartSets.get(canonValue&CANON_VALUE_MASK); 2187 } 2188 set.add(origin); 2189 } 2190 } 2191 2192 @SuppressWarnings("unused") 2193 private VersionInfo dataVersion; 2194 2195 // Code point thresholds for quick check codes. 2196 private int minDecompNoCP; 2197 private int minCompNoMaybeCP; 2198 2199 // Norm16 value thresholds for quick check combinations and types of extra data. 2200 private int minYesNo; 2201 private int minYesNoMappingsOnly; 2202 private int minNoNo; 2203 private int limitNoNo; 2204 private int minMaybeYes; 2205 2206 private Trie2_16 normTrie; 2207 private String maybeYesCompositions; 2208 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2209 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2210 private int[] tccc180; // [0x180] tccc values for U+0000..U+017F 2211 2212 private Trie2_32 canonIterData; 2213 private ArrayList<UnicodeSet> canonStartSets; 2214 2215 // bits in canonIterData 2216 private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; 2217 private static final int CANON_HAS_COMPOSITIONS = 0x40000000; 2218 private static final int CANON_HAS_SET = 0x200000; 2219 private static final int CANON_VALUE_MASK = 0x1fffff; 2220} 2221