/*
*******************************************************************************
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
*
* C++ version created on: 2012apr01
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;

import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Normalizer2Impl.Hangul;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.impl.Trie2Writable;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrieBuilder;
import com.ibm.icu.util.StringTrieBuilder;

/**
 * Low-level CollationData builder.
 * Takes (character, CE) pairs and builds them into runtime data structures.
 * Supports characters with context prefixes and contraction suffixes.
 */
final class CollationDataBuilder {  // not final in C++
    /**
     * Collation element modifier. Interface class for a modifier
     * that changes a tailoring builder's temporary CEs to final CEs.
     * Called for every non-special CE32 and every expansion CE.
     */
    interface CEModifier {
        /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */
        long modifyCE32(int ce32);
        /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */
        long modifyCE(long ce);
    }

    /**
     * Creates an empty builder without base data and without a trie.
     * The trie is allocated by initForTailoring().
     */
    CollationDataBuilder() {
        nfcImpl = Norm2AllModes.getNFCInstance().impl;
        base = null;
        baseSettings = null;
        trie = null;
        ce32s = new UVector32();
        ce64s = new UVector64();
        conditionalCE32s = new ArrayList<ConditionalCE32>();
        modified = false;
        fastLatinEnabled = false;
        fastLatinBuilder = null;
        collIter = null;
        // Reserve the first CE32 for U+0000.
        ce32s.addElement(0);
    }

    /**
     * Prepares this builder for building a tailoring on top of the base data b.
     * Allocates the trie, pre-sets the Latin-1 letters and Hangul syllable ranges,
     * and copies the contents of the base unsafeBackwardSet.
     *
     * @param b the base data; must not be null
     * @throws IllegalStateException if this builder already has a trie
     * @throws IllegalArgumentException if b is null
     */
    void initForTailoring(CollationData b) {
        if(trie != null) {
            throw new IllegalStateException("attempt to reuse a CollationDataBuilder");
        }
        if(b == null) {
            throw new IllegalArgumentException("null CollationData");
        }
        base = b;

        // For a tailoring, the default is to fall back to the base.
        trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32);

        // Set the Latin-1 letters block so that it is allocated first in the data array,
        // to try to improve locality of reference when sorting Latin-1 text.
        // Do not use utrie2_setRange32() since that will not actually allocate blocks
        // that are filled with the default value.
        // ASCII (0..7F) is already preallocated anyway.
        for(int c = 0xc0; c <= 0xff; ++c) {
            trie.set(c, Collation.FALLBACK_CE32);
        }

        // Hangul syllables are not tailorable (except via tailoring Jamos).
        // Always set the Hangul tag to help performance.
        // Do this here, rather than in buildMappings(),
        // so that we see the HANGUL_TAG in various assertions.
        int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
        trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true);

        // Copy the set contents but don't copy/clone the set as a whole because
        // that would copy the isFrozen state too.
        unsafeBackwardSet.addAll(b.unsafeBackwardSet);
    }

    /** Delegates to the base data's isCompressibleLeadByte(). */
    boolean isCompressibleLeadByte(int b) {
        return base.isCompressibleLeadByte(b);
    }

    /**
     * Tests the lead byte of the primary weight p,
     * that is, bits 31..24 of the low 32 bits of p.
     */
    boolean isCompressiblePrimary(long p) {
        return isCompressibleLeadByte((int)p >>> 24);
    }

    /**
     * @return true if this builder has mappings (e.g., add() has been called)
     */
    boolean hasMappings() { return modified; }

    /**
     * @return true if c has CEs in this builder
     */
    boolean isAssigned(int c) {
        return Collation.isAssignedCE32(trie.get(c));
    }

    /**
     * Adds a mapping from s, in the context of prefix, to the first cesLength CEs of ces.
     * Equivalent to addCE32(prefix, s, encodeCEs(ces, cesLength)).
     */
    void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
        int ce32 = encodeCEs(ces, cesLength);
        addCE32(prefix, s, ce32);
    }

    /**
     * Encodes the ces as either the returned ce32 by itself,
     * or by storing an expansion, with the returned ce32 referring to that.
     *
     * <p>add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
     *
     * @throws IllegalArgumentException if cesLength is negative
     *         or greater than Collation.MAX_EXPANSION_LENGTH
     * @throws IllegalStateException if this builder is no longer mutable
     */
    int encodeCEs(long ces[], int cesLength) {
        if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) {
            throw new IllegalArgumentException("mapping to too many CEs");
        }
        if(!isMutable()) {
            throw new IllegalStateException("attempt to add mappings after build()");
        }
        if(cesLength == 0) {
            // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
            // Do this here so that callers need not do it.
            return encodeOneCEAsCE32(0);
        } else if(cesLength == 1) {
            return encodeOneCE(ces[0]);
        } else if(cesLength == 2) {
            // Try to encode two CEs as one CE32.
            long ce0 = ces[0];
            long ce1 = ces[1];
            long p0 = ce0 >>> 32;
            if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE &&
                    (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE &&
                    p0 != 0) {
                // Latin mini expansion
                return
                    (int)p0 |
                    (((int)ce0 & 0xff00) << 8) |
                    (((int)ce1 >> 16) & 0xff00) |
                    Collation.SPECIAL_CE32_LOW_BYTE |
                    Collation.LATIN_EXPANSION_TAG;
            }
        }
        // Try to encode two or more CEs as CE32s.
        int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH];  // TODO: instance field?
        for(int i = 0;; ++i) {
            if(i == cesLength) {
                // Every CE fit into a CE32: store the sequence of CE32s.
                return encodeExpansion32(newCE32s, 0, cesLength);
            }
            int ce32 = encodeOneCEAsCE32(ces[i]);
            if(ce32 == Collation.NO_CE32) { break; }
            newCE32s[i] = ce32;
        }
        // At least one CE did not fit into a CE32: store the 64-bit CEs.
        return encodeExpansion(ces, 0, cesLength);
    }

    /**
     * Maps the first code point of s, in the context of prefix and of the rest of s
     * (the contraction suffix), to ce32.
     * Without any context, the mapping for the code point is overridden directly,
     * or, if the code point already has a ConditionalCE32 list, the list head's ce32 is replaced.
     * With context, the mapping is inserted into (or overwritten in)
     * the code point's sorted list of ConditionalCE32.
     *
     * @throws IllegalArgumentException if s is empty
     * @throws IllegalStateException if this builder is no longer mutable
     */
    void addCE32(CharSequence prefix, CharSequence s, int ce32) {
        if(s.length() == 0) {
            throw new IllegalArgumentException("mapping from empty string");
        }
        if(!isMutable()) {
            throw new IllegalStateException("attempt to add mappings after build()");
        }
        int c = Character.codePointAt(s, 0);
        int cLength = Character.charCount(c);
        int oldCE32 = trie.get(c);
        boolean hasContext = prefix.length() != 0 || s.length() > cLength;
        if(oldCE32 == Collation.FALLBACK_CE32) {
            // First tailoring for c.
            // If c has contextual base mappings or if we add a contextual mapping,
            // then copy the base mappings.
            // Otherwise we just override the base mapping.
            int baseCE32 = base.getFinalCE32(base.getCE32(c));
            if(hasContext || Collation.ce32HasContext(baseCE32)) {
                oldCE32 = copyFromBaseCE32(c, baseCE32, true);
                trie.set(c, oldCE32);
            }
        }
        if(!hasContext) {
            // No prefix, no contraction.
            if(!isBuilderContextCE32(oldCE32)) {
                trie.set(c, ce32);
            } else {
                ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32);
                // Reset the cached built form because the list is being modified.
                cond.builtCE32 = Collation.NO_CE32;
                cond.ce32 = ce32;
            }
        } else {
            ConditionalCE32 cond;
            if(!isBuilderContextCE32(oldCE32)) {
                // Replace the simple oldCE32 with a builder context CE32
                // pointing to a new ConditionalCE32 list head.
                int index = addConditionalCE32("\0", oldCE32);
                int contextCE32 = makeBuilderContextCE32(index);
                trie.set(c, contextCE32);
                contextChars.add(c);
                cond = getConditionalCE32(index);
            } else {
                cond = getConditionalCE32ForCE32(oldCE32);
                cond.builtCE32 = Collation.NO_CE32;
            }
            CharSequence suffix = s.subSequence(cLength, s.length());
            // Context string: one unit with the prefix length, then the prefix, then the suffix.
            String context = new StringBuilder().append((char)prefix.length()).
                    append(prefix).append(suffix).toString();
            unsafeBackwardSet.addAll(suffix);
            for(;;) {
                // invariant: context > cond.context
                int next = cond.next;
                if(next < 0) {
                    // Append a new ConditionalCE32 after cond.
                    int index = addConditionalCE32(context, ce32);
                    cond.next = index;
                    break;
                }
                ConditionalCE32 nextCond = getConditionalCE32(next);
                int cmp = context.compareTo(nextCond.context);
                if(cmp < 0) {
                    // Insert a new ConditionalCE32 between cond and nextCond.
                    int index = addConditionalCE32(context, ce32);
                    cond.next = index;
                    getConditionalCE32(index).next = next;
                    break;
                } else if(cmp == 0) {
                    // Same context as before, overwrite its ce32.
                    nextCond.ce32 = ce32;
                    break;
                }
                cond = nextCond;
            }
        }
        modified = true;
    }

    /**
     * Copies all mappings from the src builder, with modifications.
     * This builder here must not be built yet, and should be empty.
251 */ 252 void copyFrom(CollationDataBuilder src, CEModifier modifier) { 253 if(!isMutable()) { 254 throw new IllegalStateException("attempt to copyFrom() after build()"); 255 } 256 CopyHelper helper = new CopyHelper(src, this, modifier); 257 Iterator<Trie2.Range> trieIterator = src.trie.iterator(); 258 Trie2.Range range; 259 while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 260 enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper); 261 } 262 // Update the contextChars and the unsafeBackwardSet while copying, 263 // in case a character had conditional mappings in the source builder 264 // and they were removed later. 265 modified |= src.modified; 266 } 267 268 void optimize(UnicodeSet set) { 269 if(set.isEmpty()) { return; } 270 UnicodeSetIterator iter = new UnicodeSetIterator(set); 271 while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { 272 int c = iter.codepoint; 273 int ce32 = trie.get(c); 274 if(ce32 == Collation.FALLBACK_CE32) { 275 ce32 = base.getFinalCE32(base.getCE32(c)); 276 ce32 = copyFromBaseCE32(c, ce32, true); 277 trie.set(c, ce32); 278 } 279 } 280 modified = true; 281 } 282 283 void suppressContractions(UnicodeSet set) { 284 if(set.isEmpty()) { return; } 285 UnicodeSetIterator iter = new UnicodeSetIterator(set); 286 while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { 287 int c = iter.codepoint; 288 int ce32 = trie.get(c); 289 if(ce32 == Collation.FALLBACK_CE32) { 290 ce32 = base.getFinalCE32(base.getCE32(c)); 291 if(Collation.ce32HasContext(ce32)) { 292 ce32 = copyFromBaseCE32(c, ce32, false /* without context */); 293 trie.set(c, ce32); 294 } 295 } else if(isBuilderContextCE32(ce32)) { 296 ce32 = getConditionalCE32ForCE32(ce32).ce32; 297 // Simply abandon the list of ConditionalCE32. 298 // The caller will copy this builder in the end, 299 // eliminating unreachable data. 
300 trie.set(c, ce32); 301 contextChars.remove(c); 302 } 303 } 304 modified = true; 305 } 306 307 void enableFastLatin() { fastLatinEnabled = true; } 308 void build(CollationData data) { 309 buildMappings(data); 310 if(base != null) { 311 data.numericPrimary = base.numericPrimary; 312 data.compressibleBytes = base.compressibleBytes; 313 data.numScripts = base.numScripts; 314 data.scriptsIndex = base.scriptsIndex; 315 data.scriptStarts = base.scriptStarts; 316 } 317 buildFastLatinTable(data); 318 } 319 320 /** 321 * Looks up CEs for s and appends them to the ces array. 322 * Does not handle normalization: s should be in FCD form. 323 * 324 * Does not write completely ignorable CEs. 325 * Does not write beyond Collation.MAX_EXPANSION_LENGTH. 326 * 327 * @return incremented cesLength 328 */ 329 int getCEs(CharSequence s, long ces[], int cesLength) { 330 return getCEs(s, 0, ces, cesLength); 331 } 332 333 int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) { 334 int prefixLength = prefix.length(); 335 if(prefixLength == 0) { 336 return getCEs(s, 0, ces, cesLength); 337 } else { 338 return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength); 339 } 340 } 341 342 /** 343 * Build-time context and CE32 for a code point. 344 * If a code point has contextual mappings, then the default (no-context) mapping 345 * and all conditional mappings are stored in a singly-linked list 346 * of ConditionalCE32, sorted by context strings. 347 * 348 * Context strings sort by prefix length, then by prefix, then by contraction suffix. 349 * Context strings must be unique and in ascending order. 
*/
    private static final class ConditionalCE32 {
        /** Creates a list node for context ct and ce32 ce, with no successor. */
        ConditionalCE32(String ct, int ce) {
            context = ct;
            ce32 = ce;
            defaultCE32 = Collation.NO_CE32;
            builtCE32 = Collation.NO_CE32;
            next = -1;
        }

        /** @return true if the context string has more than its first (prefix length) unit */
        boolean hasContext() { return context.length() > 1; }
        /** @return the prefix length stored in the first unit of the context string */
        int prefixLength() { return context.charAt(0); }

        /**
         * "\0" for the first entry for any code point, with its default CE32.
         *
         * Otherwise one unit with the length of the prefix string,
         * then the prefix string, then the contraction suffix.
         */
        String context;
        /**
         * CE32 for the code point and its context.
         * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
         */
        int ce32;
        /**
         * Default CE32 for all contexts with this same prefix.
         * Initially NO_CE32. Set only while building runtime data structures,
         * and only on one of the nodes of a sub-list with the same prefix.
         */
        int defaultCE32;
        /**
         * CE32 for the built contexts.
         * When fetching CEs from the builder, the contexts are built into their runtime form
         * so that the normal collation implementation can process them.
         * The result is cached in the list head. It is reset when the contexts are modified.
         */
        int builtCE32;
        /**
         * Index of the next ConditionalCE32.
         * Negative for the end of the list.
         */
        int next;
    }

    /**
     * Converts an offset-data ce32 for code point c into a long-primary CE32.
     *
     * @param fromBase true to read the data CE from base.ces,
     *        false to read it from this builder's ce64s
     */
    protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) {
        int i = Collation.indexFromCE32(ce32);
        long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i);
        long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
        return Collation.makeLongPrimaryCE32(p);
    }

    /**
     * Returns the index of ce in ce64s (linear search),
     * appending ce first if it is not stored yet.
     */
    protected int addCE(long ce) {
        int length = ce64s.size();
        for(int i = 0; i < length; ++i) {
            if(ce == ce64s.elementAti(i)) { return i; }
        }
        ce64s.addElement(ce);
        return length;
    }

    /**
     * Returns the index of ce32 in ce32s (linear search),
     * appending ce32 first if it is not stored yet.
     */
    protected int addCE32(int ce32) {
        int length = ce32s.size();
        for(int i = 0; i < length; ++i) {
            if(ce32 == ce32s.elementAti(i)) { return i; }
        }
        ce32s.addElement(ce32);
        return length;
    }

    /**
     * Appends a new ConditionalCE32 with the given context and ce32.
     *
     * @return the index of the new item
     * @throws IndexOutOfBoundsException if the new index would exceed Collation.MAX_INDEX
     */
    protected int addConditionalCE32(String context, int ce32) {
        assert(context.length() != 0);
        int index = conditionalCE32s.size();
        if(index > Collation.MAX_INDEX) {
            throw new IndexOutOfBoundsException("too many context-sensitive mappings");
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
        }
        ConditionalCE32 cond = new ConditionalCE32(context, ce32);
        conditionalCE32s.add(cond);
        return index;
    }

    /** Returns the ConditionalCE32 stored at index. */
    protected ConditionalCE32 getConditionalCE32(int index) {
        return conditionalCE32s.get(index);
    }
    /** Returns the ConditionalCE32 at the index that is encoded in ce32. */
    protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) {
        return getConditionalCE32(Collation.indexFromCE32(ce32));
    }

    /** Makes a BUILDER_DATA_TAG CE32 that carries the ConditionalCE32 index. */
    protected static int makeBuilderContextCE32(int index) {
        return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index);
    }
    /** @return true if ce32 has the BUILDER_DATA_TAG */
    protected static boolean isBuilderContextCE32(int ce32) {
        return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG);
    }

    /**
     * Tries to encode one CE as one CE32 without storing any data.
     *
     * @return the CE32 in normal, long-primary or long-secondary form,
     *         or else Collation.NO_CE32 if the CE fits none of these forms
     */
    protected static int encodeOneCEAsCE32(long ce) {
        long p = ce >>> 32;
        int lower32 = (int)ce;
        int t = lower32 & 0xffff;
        assert((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
        if((ce & 0xffff00ff00ffL) == 0) {
            // normal form ppppsstt
            return (int)p | (lower32 >>> 16) | (t >> 8);
        } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) {
            // long-primary form ppppppC1
            return Collation.makeLongPrimaryCE32(p);
        } else if(p == 0 && (t & 0xff) == 0) {
            // long-secondary form ssssttC2
            return Collation.makeLongSecondaryCE32(lower32);
        }
        return Collation.NO_CE32;
    }

    /**
     * Encodes one CE as one CE32 if possible;
     * otherwise stores the CE in ce64s and returns an EXPANSION_TAG CE32 of length 1.
     *
     * @throws IndexOutOfBoundsException if the stored CE's index exceeds Collation.MAX_INDEX
     */
    protected int encodeOneCE(long ce) {
        // Try to encode one CE as one CE32.
        int ce32 = encodeOneCEAsCE32(ce);
        if(ce32 != Collation.NO_CE32) { return ce32; }
        int index = addCE(ce);
        if(index > Collation.MAX_INDEX) {
            throw new IndexOutOfBoundsException("too many mappings");
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
        }
        return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1);
    }

    /**
     * Returns an EXPANSION_TAG CE32 for the CEs ces[start..start+length-1].
     * Reuses an identical sequence that is already stored in ce64s,
     * or else appends the sequence to ce64s.
     *
     * @throws IndexOutOfBoundsException if the sequence's index exceeds Collation.MAX_INDEX
     */
    protected int encodeExpansion(long ces[], int start, int length) {
        // See if this sequence of CEs has already been stored.
        long first = ces[start];
        int ce64sMax = ce64s.size() - length;
        for(int i = 0; i <= ce64sMax; ++i) {
            if(first == ce64s.elementAti(i)) {
                if(i > Collation.MAX_INDEX) {
                    throw new IndexOutOfBoundsException("too many mappings");
                    // BufferOverflowException is a better fit
                    // but cannot be constructed with a message string.
                }
                for(int j = 1;; ++j) {
                    if(j == length) {
                        return Collation.makeCE32FromTagIndexAndLength(
                                Collation.EXPANSION_TAG, i, length);
                    }
                    if(ce64s.elementAti(i + j) != ces[start + j]) { break; }
                }
            }
        }
        // Store the new sequence.
        int i = ce64s.size();
        if(i > Collation.MAX_INDEX) {
            throw new IndexOutOfBoundsException("too many mappings");
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
        }
        for(int j = 0; j < length; ++j) {
            ce64s.addElement(ces[start + j]);
        }
        return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length);
    }

    /**
     * Returns an EXPANSION32_TAG CE32 for the CE32s newCE32s[start..start+length-1].
     * Reuses an identical sequence that is already stored in ce32s,
     * or else appends the sequence to ce32s.
     *
     * @throws IndexOutOfBoundsException if the sequence's index exceeds Collation.MAX_INDEX
     */
    protected int encodeExpansion32(int newCE32s[], int start, int length) {
        // See if this sequence of CE32s has already been stored.
        int first = newCE32s[start];
        int ce32sMax = ce32s.size() - length;
        for(int i = 0; i <= ce32sMax; ++i) {
            if(first == ce32s.elementAti(i)) {
                if(i > Collation.MAX_INDEX) {
                    throw new IndexOutOfBoundsException("too many mappings");
                    // BufferOverflowException is a better fit
                    // but cannot be constructed with a message string.
                }
                for(int j = 1;; ++j) {
                    if(j == length) {
                        return Collation.makeCE32FromTagIndexAndLength(
                                Collation.EXPANSION32_TAG, i, length);
                    }
                    if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; }
                }
            }
        }
        // Store the new sequence.
        int i = ce32s.size();
        if(i > Collation.MAX_INDEX) {
            throw new IndexOutOfBoundsException("too many mappings");
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
        }
        for(int j = 0; j < length; ++j) {
            ce32s.addElement(newCE32s[start + j]);
        }
        return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length);
    }

    /**
     * Copies the base mapping ce32 for code point c into this builder's data structures
     * and returns the CE32 to use for c in this builder.
     * Non-special CE32s are returned unchanged.
     *
     * @param c the code point
     * @param ce32 the base CE32 for c
     * @param withContext if false, then for prefix and contraction mappings
     *        only the default (no-match) mapping is copied
     * @throws UnsupportedOperationException if ce32 has the HANGUL_TAG
     * @throws AssertionError if ce32 has a tag that is not handled here
     */
    protected int copyFromBaseCE32(int c, int ce32, boolean withContext) {
        if(!Collation.isSpecialCE32(ce32)) { return ce32; }
        switch(Collation.tagFromCE32(ce32)) {
        case Collation.LONG_PRIMARY_TAG:
        case Collation.LONG_SECONDARY_TAG:
        case Collation.LATIN_EXPANSION_TAG:
            // copy as is
            break;
        case Collation.EXPANSION32_TAG: {
            int index = Collation.indexFromCE32(ce32);
            int length = Collation.lengthFromCE32(ce32);
            ce32 = encodeExpansion32(base.ce32s, index, length);
            break;
        }
        case Collation.EXPANSION_TAG: {
            int index = Collation.indexFromCE32(ce32);
            int length = Collation.lengthFromCE32(ce32);
            ce32 = encodeExpansion(base.ces, index, length);
            break;
        }
        case Collation.PREFIX_TAG: {
            // Flatten prefixes and nested suffixes (contractions)
            // into a linear list of ConditionalCE32.
            int trieIndex = Collation.indexFromCE32(ce32);
            ce32 = base.getCE32FromContexts(trieIndex);  // Default if no prefix match.
            if(!withContext) {
                return copyFromBaseCE32(c, ce32, false);
            }
            // Temporary list head that is not stored; only its next index is used.
            ConditionalCE32 head = new ConditionalCE32("", 0);
            StringBuilder context = new StringBuilder("\0");
            int index;
            if(Collation.isContractionCE32(ce32)) {
                index = copyContractionsFromBaseCE32(context, c, ce32, head);
            } else {
                ce32 = copyFromBaseCE32(c, ce32, true);
                head.next = index = addConditionalCE32(context.toString(), ce32);
            }
            ConditionalCE32 cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
            CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
            while(prefixes.hasNext()) {
                CharsTrie.Entry entry = prefixes.next();
                // Build the context string: the prefix length unit, then the reversed entry chars.
                context.setLength(0);
                context.append(entry.chars).reverse().insert(0, (char)entry.chars.length());
                ce32 = entry.value;
                if(Collation.isContractionCE32(ce32)) {
                    index = copyContractionsFromBaseCE32(context, c, ce32, cond);
                } else {
                    ce32 = copyFromBaseCE32(c, ce32, true);
                    cond.next = index = addConditionalCE32(context.toString(), ce32);
                }
                cond = getConditionalCE32(index);
            }
            ce32 = makeBuilderContextCE32(head.next);
            contextChars.add(c);
            break;
        }
        case Collation.CONTRACTION_TAG: {
            if(!withContext) {
                int index = Collation.indexFromCE32(ce32);
                ce32 = base.getCE32FromContexts(index);  // Default if no suffix match.
                return copyFromBaseCE32(c, ce32, false);
            }
            // Temporary list head that is not stored; only its next index is used.
            ConditionalCE32 head = new ConditionalCE32("", 0);
            StringBuilder context = new StringBuilder("\0");
            copyContractionsFromBaseCE32(context, c, ce32, head);
            ce32 = makeBuilderContextCE32(head.next);
            contextChars.add(c);
            break;
        }
        case Collation.HANGUL_TAG:
            throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables.");
        case Collation.OFFSET_TAG:
            ce32 = getCE32FromOffsetCE32(true, c, ce32);
            break;
        case Collation.IMPLICIT_TAG:
            ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c));
            break;
        default:
            throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " +
                    "requires ce32 == base.getFinalCE32(ce32)");
        }
        return ce32;
    }

    /**
     * Copies base contractions to a list of ConditionalCE32.
     * Sets cond.next to the index of the first new item
     * and returns the index of the last new item.
     *
     * @param context the prefix-length unit and prefix so far;
     *        temporarily extended with each suffix and restored before returning
     */
    protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32,
            ConditionalCE32 cond) {
        int trieIndex = Collation.indexFromCE32(ce32);
        int index;
        if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
            // No match on the single code point.
            // We are underneath a prefix, and the default mapping is just
            // a fallback to the mappings for a shorter prefix.
            assert(context.length() > 1);
            index = -1;
        } else {
            ce32 = base.getCE32FromContexts(trieIndex);  // Default if no suffix match.
            assert(!Collation.isContractionCE32(ce32));
            ce32 = copyFromBaseCE32(c, ce32, true);
            cond.next = index = addConditionalCE32(context.toString(), ce32);
            cond = getConditionalCE32(index);
        }

        int suffixStart = context.length();
        CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
        while(suffixes.hasNext()) {
            CharsTrie.Entry entry = suffixes.next();
            context.append(entry.chars);
            ce32 = copyFromBaseCE32(c, entry.value, true);
            cond.next = index = addConditionalCE32(context.toString(), ce32);
            // No need to update the unsafeBackwardSet because the tailoring set
            // is already a copy of the base set.
            cond = getConditionalCE32(index);
            // Remove the suffix again so that context is back to the prefix part.
            context.setLength(suffixStart);
        }
        assert(index >= 0);
        return index;
    }

    /**
     * Helper for copyFrom(): copies CE32s and the data they refer to
     * from the src builder to the dest builder, applying the modifier.
     */
    private static final class CopyHelper {
        CopyHelper(CollationDataBuilder s, CollationDataBuilder d,
                CollationDataBuilder.CEModifier m) {
            src = s;
            dest = d;
            modifier = m;
        }

        /**
         * Copies ce32 and sets the result for start..end in the dest trie.
         * Adds the range to dest.contextChars if the copy is a builder context CE32.
         */
        void copyRangeCE32(int start, int end, int ce32) {
            ce32 = copyCE32(ce32);
            dest.trie.setRange(start, end, ce32, true);
            if(CollationDataBuilder.isBuilderContextCE32(ce32)) {
                dest.contextChars.add(start, end);
            }
        }

        /**
         * Copies one source CE32, with modifications, into the dest builder.
         *
         * @return the CE32 to use in the dest builder
         */
        int copyCE32(int ce32) {
            if(!Collation.isSpecialCE32(ce32)) {
                long ce = modifier.modifyCE32(ce32);
                if(ce != Collation.NO_CE) {
                    ce32 = dest.encodeOneCE(ce);
                }
            } else {
                int tag = Collation.tagFromCE32(ce32);
                if(tag == Collation.EXPANSION32_TAG) {
                    int[] srcCE32s = src.ce32s.getBuffer();
                    int srcIndex = Collation.indexFromCE32(ce32);
                    int length = Collation.lengthFromCE32(ce32);
                    // Inspect the source CE32s. Just copy them if none are modified.
                    // Otherwise copy to modifiedCEs, with modifications.
                    boolean isModified = false;
                    for(int i = 0; i < length; ++i) {
                        ce32 = srcCE32s[srcIndex + i];
                        long ce;
                        if(Collation.isSpecialCE32(ce32) ||
                                (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) {
                            if(isModified) {
                                modifiedCEs[i] = Collation.ceFromCE32(ce32);
                            }
                        } else {
                            if(!isModified) {
                                // First modification: back-fill the unmodified CEs before i.
                                for(int j = 0; j < i; ++j) {
                                    modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]);
                                }
                                isModified = true;
                            }
                            modifiedCEs[i] = ce;
                        }
                    }
                    if(isModified) {
                        ce32 = dest.encodeCEs(modifiedCEs, length);
                    } else {
                        ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length);
                    }
                } else if(tag == Collation.EXPANSION_TAG) {
                    long[] srcCEs = src.ce64s.getBuffer();
                    int srcIndex = Collation.indexFromCE32(ce32);
                    int length = Collation.lengthFromCE32(ce32);
                    // Inspect the source CEs. Just copy them if none are modified.
                    // Otherwise copy to modifiedCEs, with modifications.
                    boolean isModified = false;
                    for(int i = 0; i < length; ++i) {
                        long srcCE = srcCEs[srcIndex + i];
                        long ce = modifier.modifyCE(srcCE);
                        if(ce == Collation.NO_CE) {
                            if(isModified) {
                                modifiedCEs[i] = srcCE;
                            }
                        } else {
                            if(!isModified) {
                                // First modification: back-fill the unmodified CEs before i.
                                for(int j = 0; j < i; ++j) {
                                    modifiedCEs[j] = srcCEs[srcIndex + j];
                                }
                                isModified = true;
                            }
                            modifiedCEs[i] = ce;
                        }
                    }
                    if(isModified) {
                        ce32 = dest.encodeCEs(modifiedCEs, length);
                    } else {
                        ce32 = dest.encodeExpansion(srcCEs, srcIndex, length);
                    }
                } else if(tag == Collation.BUILDER_DATA_TAG) {
                    // Copy the list of ConditionalCE32.
                    ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32);
                    assert(!cond.hasContext());
                    int destIndex = dest.addConditionalCE32(
                            cond.context, copyCE32(cond.ce32));
                    ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex);
                    while(cond.next >= 0) {
                        cond = src.getConditionalCE32(cond.next);
                        ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex);
                        destIndex = dest.addConditionalCE32(
                                cond.context, copyCE32(cond.ce32));
                        // The suffix follows the prefix-length unit and the prefix.
                        int suffixStart = cond.prefixLength() + 1;
                        dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart));
                        prevDestCond.next = destIndex;
                    }
                } else {
                    // Just copy long CEs and Latin mini expansions (and other expected values) as is,
                    // assuming that the modifier would not modify them.
                    assert(tag == Collation.LONG_PRIMARY_TAG ||
                            tag == Collation.LONG_SECONDARY_TAG ||
                            tag == Collation.LATIN_EXPANSION_TAG ||
                            tag == Collation.HANGUL_TAG);
                }
            }
            return ce32;
        }

        CollationDataBuilder src;
        CollationDataBuilder dest;
        CollationDataBuilder.CEModifier modifier;
        /** Scratch buffer for the CEs of one expansion while they are being modified. */
        long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH];
    }

    /**
     * Copies one source trie range via the helper,
     * unless its value is UNASSIGNED_CE32 or FALLBACK_CE32.
     */
    private static void
    enumRangeForCopy(int start, int end, int value, CopyHelper helper) {
        if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) {
            helper.copyRangeCE32(start, end, value);
        }
    }

    /**
     * Fills jamoCE32s with one CE32 per conjoining Jamo.
     * Base mappings that would need to be copied are only copied
     * if any Jamo is assigned in this builder.
     *
     * @param jamoCE32s output array with at least CollationData.JAMO_CE32S_LENGTH elements
     * @return true if there is no base, or if Collation.isAssignedCE32() is true
     *         for the trie value of at least one Jamo
     */
    protected boolean getJamoCE32s(int jamoCE32s[]) {
        boolean anyJamoAssigned = base == null;  // always set jamoCE32s in the base data
        boolean needToCopyFromBase = false;
        for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
            int jamo = jamoCpFromIndex(j);
            boolean fromBase = false;
            int ce32 = trie.get(jamo);
            anyJamoAssigned |= Collation.isAssignedCE32(ce32);
            // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
            // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
            if(ce32 == Collation.FALLBACK_CE32) {
                fromBase = true;
                ce32 = base.getCE32(jamo);
            }
            if(Collation.isSpecialCE32(ce32)) {
                switch(Collation.tagFromCE32(ce32)) {
                case Collation.LONG_PRIMARY_TAG:
                case Collation.LONG_SECONDARY_TAG:
                case Collation.LATIN_EXPANSION_TAG:
                    // Copy the ce32 as-is.
                    break;
                case Collation.EXPANSION32_TAG:
                case Collation.EXPANSION_TAG:
                case Collation.PREFIX_TAG:
                case Collation.CONTRACTION_TAG:
                    if(fromBase) {
                        // Defer copying until we know if anyJamoAssigned.
                        ce32 = Collation.FALLBACK_CE32;
                        needToCopyFromBase = true;
                    }
                    break;
                case Collation.IMPLICIT_TAG:
                    // An unassigned Jamo should only occur in tests with incomplete bases.
                    assert(fromBase);
                    ce32 = Collation.FALLBACK_CE32;
                    needToCopyFromBase = true;
                    break;
                case Collation.OFFSET_TAG:
                    ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
                    break;
                case Collation.FALLBACK_TAG:
                case Collation.RESERVED_TAG_3:
                case Collation.BUILDER_DATA_TAG:
                case Collation.DIGIT_TAG:
                case Collation.U0000_TAG:
                case Collation.HANGUL_TAG:
                case Collation.LEAD_SURROGATE_TAG:
                    throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32));
                }
            }
            jamoCE32s[j] = ce32;
        }
        if(anyJamoAssigned && needToCopyFromBase) {
            // Now copy the base mappings that were deferred above.
            for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {
                if(jamoCE32s[j] == Collation.FALLBACK_CE32) {
                    int jamo = jamoCpFromIndex(j);
                    jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo),
                                                    /*withContext=*/ true);
                }
            }
        }
        return anyJamoAssigned;
    }

    /**
     * For each [:Nd:] digit whose trie value is neither FALLBACK_CE32 nor UNASSIGNED_CE32,
     * stores that value in ce32s and replaces the trie value with a DIGIT_TAG CE32
     * that carries the ce32s index and the digit value.
     *
     * @throws IndexOutOfBoundsException if a ce32s index exceeds Collation.MAX_INDEX
     */
    protected void setDigitTags() {
        UnicodeSet digits = new UnicodeSet("[:Nd:]");
        UnicodeSetIterator iter = new UnicodeSetIterator(digits);
        while(iter.next()) {
            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
            int c = iter.codepoint;
            int ce32 = trie.get(c);
            if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) {
                int index = addCE32(ce32);
                if(index > Collation.MAX_INDEX) {
                    throw new IndexOutOfBoundsException("too many mappings");
                    // BufferOverflowException is a better fit
                    // but cannot be constructed with a message string.
                }
                ce32 = Collation.makeCE32FromTagIndexAndLength(
                        Collation.DIGIT_TAG, index, UCharacter.digit(c));  // u_charDigitValue(c)
                trie.set(c, ce32);
            }
        }
    }

    /**
     * Sets, for each lead surrogate code unit, a LEAD_SURROGATE_TAG CE32 that summarizes
     * the trie ranges for that lead surrogate: LEAD_ALL_UNASSIGNED, LEAD_ALL_FALLBACK,
     * or LEAD_MIXED.
     */
    protected void setLeadSurrogates() {
        for(char lead = 0xd800; lead < 0xdc00; ++lead) {
            int leadValue = -1;
            // utrie2_enumForLeadSurrogate(trie, lead, null, , &value);
            Iterator<Trie2.Range> trieIterator = trie.iteratorForLeadSurrogate(lead);
            while(trieIterator.hasNext()) {
                Trie2.Range range = trieIterator.next();
                // The rest of this loop is equivalent to C++ enumRangeLeadValue().
                int value = range.value;
                if(value == Collation.UNASSIGNED_CE32) {
                    value = Collation.LEAD_ALL_UNASSIGNED;
                } else if(value == Collation.FALLBACK_CE32) {
                    value = Collation.LEAD_ALL_FALLBACK;
                } else {
                    leadValue = Collation.LEAD_MIXED;
                    break;
                }
                if(leadValue < 0) {
                    leadValue = value;
                } else if(leadValue != value) {
                    leadValue = Collation.LEAD_MIXED;
                    break;
                }
            }
            trie.setForLeadSurrogateCodeUnit(lead,
                    Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue);
        }
    }

    /**
     * Builds the contexts, Jamo and Hangul CE32s, digit tags and lead surrogate values,
     * then stores the resulting runtime structures in data.
     * Freezes the unsafeBackwardSet.
     *
     * @throws IllegalStateException if this builder is no longer mutable
     */
    protected void buildMappings(CollationData data) {
        if(!isMutable()) {
            throw new IllegalStateException("attempt to build() after build()");
        }

        buildContexts();

        int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
        int jamoIndex = -1;
        if(getJamoCE32s(jamoCE32s)) {
            jamoIndex = ce32s.size();
            for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
                ce32s.addElement(jamoCE32s[i]);
            }
            // Small optimization: Use a bit in the Hangul ce32
            // to indicate that none of the Jamo CE32s are isSpecialCE32()
            // (as it should be in the root collator).
            // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
            // In order to still have good trie compression and keep this code simple,
            // we only set this flag if a whole block of 588 Hangul syllables starting with
            // a common leading consonant (Jamo L) has this property.
            boolean isAnyJamoVTSpecial = false;
            for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
                if(Collation.isSpecialCE32(jamoCE32s[i])) {
                    isAnyJamoVTSpecial = true;
                    break;
                }
            }
            int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
            int c = Hangul.HANGUL_BASE;
            for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) {  // iterate over the Jamo L
                int ce32 = hangulCE32;
                if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) {
                    ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO;
                }
                int limit = c + Hangul.JAMO_VT_COUNT;
                trie.setRange(c, limit - 1, ce32, true);
                c = limit;
            }
        } else {
            // Copy the Hangul CE32s from the base in blocks per Jamo L,
            // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
            for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) {
                int ce32 = base.getCE32(c);
                assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG));
                int limit = c + Hangul.JAMO_VT_COUNT;
                trie.setRange(c, limit - 1, ce32, true);
                c = limit;
            }
        }

        setDigitTags();
        setLeadSurrogates();

        // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
        ce32s.setElementAt(trie.get(0), 0);
        trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0));

        data.trie = trie.toTrie2_32();

        // Mark each lead surrogate as "unsafe"
        // if any of its 1024 associated supplementary code points is "unsafe".
        int c = 0x10000;
        for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
            if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
                unsafeBackwardSet.add(lead);
            }
        }
        unsafeBackwardSet.freeze();

        data.ce32s = ce32s.getBuffer();
        data.ces = ce64s.getBuffer();
        data.contexts = contexts.toString();

        data.base = base;
        if(jamoIndex >= 0) {
            data.jamoCE32s = jamoCE32s;  // C++: data.ce32s + jamoIndex
        } else {
            data.jamoCE32s = base.jamoCE32s;
        }
        data.unsafeBackwardSet = unsafeBackwardSet;
    }

    /**
     * Empties the built contexts and resets the cached builtCE32
     * in the list head of every character in contextChars.
     */
    protected void clearContexts() {
        contexts.setLength(0);
        UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
        while(iter.next()) {
            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
            int ce32 = trie.get(iter.codepoint);
            assert(isBuilderContextCE32(ce32));
            getConditionalCE32ForCE32(ce32).builtCE32 = Collation.NO_CE32;
        }
    }

    /**
     * Builds the contexts for every character in contextChars
     * and stores the CE32 returned by buildContext() in the trie.
     *
     * @throws AssertionError if a character in contextChars has no builder context CE32
     */
    protected void buildContexts() {
        // Ignore abandoned lists and the cached builtCE32,
        // and build all contexts from scratch.
        contexts.setLength(0);
        UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
        while(iter.next()) {
            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
            int c = iter.codepoint;
            int ce32 = trie.get(c);
            if(!isBuilderContextCE32(ce32)) {
                throw new AssertionError("Impossible: No context data for c in contextChars.");
            }
            ConditionalCE32 cond = getConditionalCE32ForCE32(ce32);
            ce32 = buildContext(cond);
            trie.set(c, ce32);
        }
    }

    protected int buildContext(ConditionalCE32 head) {
        // The list head must have no context.
        assert(!head.hasContext());
        // The list head must be followed by one or more nodes that all do have context.
1021 assert(head.next >= 0); 1022 CharsTrieBuilder prefixBuilder = new CharsTrieBuilder(); 1023 CharsTrieBuilder contractionBuilder = new CharsTrieBuilder(); 1024 for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) { 1025 // After the list head, the prefix or suffix can be empty, but not both. 1026 assert(cond == head || cond.hasContext()); 1027 int prefixLength = cond.prefixLength(); 1028 StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1); 1029 String prefixString = prefix.toString(); 1030 // Collect all contraction suffixes for one prefix. 1031 ConditionalCE32 firstCond = cond; 1032 ConditionalCE32 lastCond = cond; 1033 while(cond.next >= 0 && 1034 (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)) { 1035 lastCond = cond; 1036 } 1037 int ce32; 1038 int suffixStart = prefixLength + 1; // == prefix.length() 1039 if(lastCond.context.length() == suffixStart) { 1040 // One prefix without contraction suffix. 1041 assert(firstCond == lastCond); 1042 ce32 = lastCond.ce32; 1043 cond = lastCond; 1044 } else { 1045 // Build the contractions trie. 1046 contractionBuilder.clear(); 1047 // Entry for an empty suffix, to be stored before the trie. 1048 int emptySuffixCE32 = Collation.NO_CE32; // Will always be set to a real value. 1049 int flags = 0; 1050 if(firstCond.context.length() == suffixStart) { 1051 // There is a mapping for the prefix and the single character c. (p|c) 1052 // If no other suffix matches, then we return this value. 1053 emptySuffixCE32 = firstCond.ce32; 1054 cond = getConditionalCE32(firstCond.next); 1055 } else { 1056 // There is no mapping for the prefix and just the single character. 1057 // (There is no p|c, only p|cd, p|ce etc.) 1058 flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH; 1059 // When the prefix matches but none of the prefix-specific suffixes, 1060 // then we fall back to the mappings with the next-longest prefix, 1061 // and ultimately to mappings with no prefix. 
1062 // Each fallback might be another set of contractions. 1063 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, 1064 // then in text "pch" we find the ch contraction. 1065 for(cond = head;; cond = getConditionalCE32(cond.next)) { 1066 int length = cond.prefixLength(); 1067 if(length == prefixLength) { break; } 1068 if(cond.defaultCE32 != Collation.NO_CE32 && 1069 (length==0 || prefixString.regionMatches( 1070 prefix.length() - length, cond.context, 1, length) 1071 /* C++: prefix.endsWith(cond.context, 1, length) */)) { 1072 emptySuffixCE32 = cond.defaultCE32; 1073 } 1074 } 1075 cond = firstCond; 1076 } 1077 // Optimization: Set a flag when 1078 // the first character of every contraction suffix has lccc!=0. 1079 // Short-circuits contraction matching when a normal letter follows. 1080 flags |= Collation.CONTRACT_NEXT_CCC; 1081 // Add all of the non-empty suffixes into the contraction trie. 1082 for(;;) { 1083 String suffix = cond.context.substring(suffixStart); 1084 int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0)); 1085 if(fcd16 <= 0xff) { 1086 flags &= ~Collation.CONTRACT_NEXT_CCC; 1087 } 1088 fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length())); 1089 if(fcd16 > 0xff) { 1090 // The last suffix character has lccc!=0, allowing for discontiguous contractions. 1091 flags |= Collation.CONTRACT_TRAILING_CCC; 1092 } 1093 contractionBuilder.add(suffix, cond.ce32); 1094 if(cond == lastCond) { break; } 1095 cond = getConditionalCE32(cond.next); 1096 } 1097 int index = addContextTrie(emptySuffixCE32, contractionBuilder); 1098 if(index > Collation.MAX_INDEX) { 1099 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 1100 // BufferOverflowException is a better fit 1101 // but cannot be constructed with a message string. 
1102 } 1103 ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags; 1104 } 1105 assert(cond == lastCond); 1106 firstCond.defaultCE32 = ce32; 1107 if(prefixLength == 0) { 1108 if(cond.next < 0) { 1109 // No non-empty prefixes, only contractions. 1110 return ce32; 1111 } 1112 } else { 1113 prefix.delete(0, 1); // Remove the length unit. 1114 prefix.reverse(); 1115 prefixBuilder.add(prefix, ce32); 1116 if(cond.next < 0) { break; } 1117 } 1118 } 1119 assert(head.defaultCE32 != Collation.NO_CE32); 1120 int index = addContextTrie(head.defaultCE32, prefixBuilder); 1121 if(index > Collation.MAX_INDEX) { 1122 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 1123 // BufferOverflowException is a better fit 1124 // but cannot be constructed with a message string. 1125 } 1126 return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index); 1127 } 1128 1129 protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) { 1130 StringBuilder context = new StringBuilder(); 1131 context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32); 1132 context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL)); 1133 int index = contexts.indexOf(context.toString()); 1134 if(index < 0) { 1135 index = contexts.length(); 1136 contexts.append(context); 1137 } 1138 return index; 1139 } 1140 1141 protected void buildFastLatinTable(CollationData data) { 1142 if(!fastLatinEnabled) { return; } 1143 1144 fastLatinBuilder = new CollationFastLatinBuilder(); 1145 if(fastLatinBuilder.forData(data)) { 1146 char[] header = fastLatinBuilder.getHeader(); 1147 char[] table = fastLatinBuilder.getTable(); 1148 if(base != null && 1149 Arrays.equals(header, base.fastLatinTableHeader) && 1150 Arrays.equals(table, base.fastLatinTable)) { 1151 // Same fast Latin table as in the base, use that one instead. 
1152 fastLatinBuilder = null; 1153 header = base.fastLatinTableHeader; 1154 table = base.fastLatinTable; 1155 } 1156 data.fastLatinTableHeader = header; 1157 data.fastLatinTable = table; 1158 } else { 1159 fastLatinBuilder = null; 1160 } 1161 } 1162 1163 protected int getCEs(CharSequence s, int start, long ces[], int cesLength) { 1164 if(collIter == null) { 1165 collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl)); 1166 if(collIter == null) { return 0; } 1167 } 1168 return collIter.fetchCEs(s, start, ces, cesLength); 1169 } 1170 1171 protected static int jamoCpFromIndex(int i) { 1172 // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27 1173 if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; } 1174 i -= Hangul.JAMO_L_COUNT; 1175 if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; } 1176 i -= Hangul.JAMO_V_COUNT; 1177 // i < 27 1178 return Hangul.JAMO_T_BASE + 1 + i; 1179 } 1180 1181 /** 1182 * Build-time collation element and character iterator. 1183 * Uses the runtime CollationIterator for fetching CEs for a string 1184 * but reads from the builder's unfinished data structures. 1185 * In particular, this class reads from the unfinished trie 1186 * and has to avoid CollationIterator.nextCE() and redirect other 1187 * calls to data.getCE32() and data.getCE32FromSupplementary(). 1188 * 1189 * We do this so that we need not implement the collation algorithm 1190 * again for the builder and make it behave exactly like the runtime code. 1191 * That would be more difficult to test and maintain than this indirection. 1192 * 1193 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, 1194 * so the data accesses from those code paths need not be modified. 1195 * 1196 * This class iterates directly over whole code points 1197 * so that the CollationIterator does not need the finished trie 1198 * for handling the LEAD_SURROGATE_TAG. 
1199 */ 1200 private static final class DataBuilderCollationIterator extends CollationIterator { 1201 DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) { 1202 super(newData, /*numeric=*/ false); 1203 builder = b; 1204 builderData = newData; 1205 builderData.base = builder.base; 1206 // Set all of the jamoCE32s[] to indirection CE32s. 1207 for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 1208 int jamo = CollationDataBuilder.jamoCpFromIndex(j); 1209 jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) | 1210 CollationDataBuilder.IS_BUILDER_JAMO_CE32; 1211 } 1212 builderData.jamoCE32s = jamoCE32s; 1213 } 1214 1215 int fetchCEs(CharSequence str, int start, long ces[], int cesLength) { 1216 // Set the pointers each time, in case they changed due to reallocation. 1217 builderData.ce32s = builder.ce32s.getBuffer(); 1218 builderData.ces = builder.ce64s.getBuffer(); 1219 builderData.contexts = builder.contexts.toString(); 1220 // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32(). 1221 reset(); 1222 s = str; 1223 pos = start; 1224 while(pos < s.length()) { 1225 // No need to keep all CEs in the iterator buffer. 
1226 clearCEs(); 1227 int c = Character.codePointAt(s, pos); 1228 pos += Character.charCount(c); 1229 int ce32 = builder.trie.get(c); 1230 CollationData d; 1231 if(ce32 == Collation.FALLBACK_CE32) { 1232 d = builder.base; 1233 ce32 = builder.base.getCE32(c); 1234 } else { 1235 d = builderData; 1236 } 1237 appendCEsFromCE32(d, c, ce32, /*forward=*/ true); 1238 for(int i = 0; i < getCEsLength(); ++i) { 1239 long ce = getCE(i); 1240 if(ce != 0) { 1241 if(cesLength < Collation.MAX_EXPANSION_LENGTH) { 1242 ces[cesLength] = ce; 1243 } 1244 ++cesLength; 1245 } 1246 } 1247 } 1248 return cesLength; 1249 } 1250 1251 @Override 1252 public void resetToOffset(int newOffset) { 1253 reset(); 1254 pos = newOffset; 1255 } 1256 1257 @Override 1258 public int getOffset() { 1259 return pos; 1260 } 1261 1262 @Override 1263 public int nextCodePoint() { 1264 if(pos == s.length()) { 1265 return Collation.SENTINEL_CP; 1266 } 1267 int c = Character.codePointAt(s, pos); 1268 pos += Character.charCount(c); 1269 return c; 1270 } 1271 1272 @Override 1273 public int previousCodePoint() { 1274 if(pos == 0) { 1275 return Collation.SENTINEL_CP; 1276 } 1277 int c = Character.codePointBefore(s, pos); 1278 pos -= Character.charCount(c); 1279 return c; 1280 } 1281 1282 @Override 1283 protected void forwardNumCodePoints(int num) { 1284 pos = Character.offsetByCodePoints(s, pos, num); 1285 } 1286 1287 @Override 1288 protected void backwardNumCodePoints(int num) { 1289 pos = Character.offsetByCodePoints(s, pos, -num); 1290 } 1291 1292 @Override 1293 protected int getDataCE32(int c) { 1294 return builder.trie.get(c); 1295 } 1296 1297 @Override 1298 protected int getCE32FromBuilderData(int ce32) { 1299 assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG)); 1300 if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) { 1301 int jamo = Collation.indexFromCE32(ce32); 1302 return builder.trie.get(jamo); 1303 } else { 1304 ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32); 1305 
if(cond.builtCE32 == Collation.NO_CE32) { 1306 // Build the context-sensitive mappings into their runtime form and cache the result. 1307 try { 1308 cond.builtCE32 = builder.buildContext(cond); 1309 } catch(IndexOutOfBoundsException e) { 1310 builder.clearContexts(); 1311 cond.builtCE32 = builder.buildContext(cond); 1312 } 1313 builderData.contexts = builder.contexts.toString(); 1314 } 1315 return cond.builtCE32; 1316 } 1317 } 1318 1319 protected final CollationDataBuilder builder; 1320 protected final CollationData builderData; 1321 protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 1322 protected CharSequence s; 1323 protected int pos; 1324 } 1325 1326 protected final boolean isMutable() { 1327 // C++ tests !(trie == NULL || utrie2_isFrozen(trie)) 1328 // but Java Trie2Writable does not have an observable isFrozen() state. 1329 return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen(); 1330 } 1331 1332 /** @see Collation.BUILDER_DATA_TAG */ 1333 private static final int IS_BUILDER_JAMO_CE32 = 0x100; 1334 1335 protected Normalizer2Impl nfcImpl; 1336 protected CollationData base; 1337 protected CollationSettings baseSettings; 1338 protected Trie2Writable trie; 1339 protected UVector32 ce32s; 1340 protected UVector64 ce64s; 1341 protected ArrayList<ConditionalCE32> conditionalCE32s; // vector of ConditionalCE32 1342 // Characters that have context (prefixes or contraction suffixes). 1343 protected UnicodeSet contextChars = new UnicodeSet(); 1344 // Serialized UCharsTrie structures for finalized contexts. 1345 protected StringBuilder contexts = new StringBuilder(); 1346 protected UnicodeSet unsafeBackwardSet = new UnicodeSet(); 1347 protected boolean modified; 1348 1349 protected boolean fastLatinEnabled; 1350 protected CollationFastLatinBuilder fastLatinBuilder; 1351 1352 protected DataBuilderCollationIterator collIter; 1353} 1354