1/* 2******************************************************************************* 3* Copyright (C) 2013-2015, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* CollationDataReader.java, ported from collationdatareader.h/.cpp 7* 8* C++ version created on: 2013feb07 9* created by: Markus W. Scherer 10*/ 11 12package com.ibm.icu.impl.coll; 13 14import java.io.IOException; 15import java.nio.ByteBuffer; 16import java.nio.CharBuffer; 17import java.util.Arrays; 18 19import com.ibm.icu.impl.ICUBinary; 20import com.ibm.icu.impl.Trie2_32; 21import com.ibm.icu.impl.USerializedSet; 22import com.ibm.icu.text.Collator; 23import com.ibm.icu.text.UnicodeSet; 24import com.ibm.icu.util.ICUException; 25 26/** 27 * Collation binary data reader. 28 */ 29final class CollationDataReader /* all static */ { 30 // The following constants are also copied into source/common/ucol_swp.cpp. 31 // Keep them in sync! 32 /** 33 * Number of int indexes. 34 * 35 * Can be 2 if there are only options. 36 * Can be 7 or 8 if there are only options and a script reordering. 37 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. 38 */ 39 static final int IX_INDEXES_LENGTH = 0; 40 /** 41 * Bits 31..24: numericPrimary, for numeric collation 42 * 23..16: fast Latin format version (0 = no fast Latin table) 43 * 15.. 0: options bit set 44 */ 45 static final int IX_OPTIONS = 1; 46 static final int IX_RESERVED2 = 2; 47 static final int IX_RESERVED3 = 3; 48 49 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ 50 static final int IX_JAMO_CE32S_START = 4; 51 52 // Byte offsets from the start of the data, after the generic header. 53 // The indexes[] are at byte offset 0, other data follows. 54 // Each data item is aligned properly. 55 // The data items should be in descending order of unit size, 56 // to minimize the need for padding. 57 // Each item's byte length is given by the difference between its offset and 58 // the next index/offset value. 59 /** Byte offset to int reorderCodes[]. */ 60 static final int IX_REORDER_CODES_OFFSET = 5; 61 /** 62 * Byte offset to uint8_t reorderTable[]. 63 * Empty table if <256 bytes (padding only). 64 * Otherwise 256 bytes or more (with padding). 65 */ 66 static final int IX_REORDER_TABLE_OFFSET = 6; 67 /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ 68 static final int IX_TRIE_OFFSET = 7; 69 70 static final int IX_RESERVED8_OFFSET = 8; 71 /** Byte offset to long ces[]. */ 72 static final int IX_CES_OFFSET = 9; 73 static final int IX_RESERVED10_OFFSET = 10; 74 /** Byte offset to int ce32s[]. */ 75 static final int IX_CE32S_OFFSET = 11; 76 77 /** Byte offset to uint32_t rootElements[]. */ 78 static final int IX_ROOT_ELEMENTS_OFFSET = 12; 79 /** Byte offset to UChar *contexts[]. */ 80 static final int IX_CONTEXTS_OFFSET = 13; 81 /** Byte offset to char [] with serialized unsafeBackwardSet. */ 82 static final int IX_UNSAFE_BWD_OFFSET = 14; 83 /** Byte offset to char fastLatinTable[]. */ 84 static final int IX_FAST_LATIN_TABLE_OFFSET = 15; 85 86 /** Byte offset to char scripts[]. */ 87 static final int IX_SCRIPTS_OFFSET = 16; 88 /** 89 * Byte offset to boolean compressibleBytes[]. 90 * Empty table if <256 bytes (padding only). 91 * Otherwise 256 bytes or more (with padding). 92 */ 93 static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17; 94 static final int IX_RESERVED18_OFFSET = 18; 95 static final int IX_TOTAL_SIZE = 19; 96 97 static void read(CollationTailoring base, ByteBuffer inBytes, 98 CollationTailoring tailoring) throws IOException { 99 tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE); 100 if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) { 101 throw new ICUException("Tailoring UCA version differs from base data UCA version"); 102 } 103 104 int inLength = inBytes.remaining(); 105 if(inLength < 8) { 106 throw new ICUException("not enough bytes"); 107 } 108 int indexesLength = inBytes.getInt(); // inIndexes[IX_INDEXES_LENGTH] 109 if(indexesLength < 2 || inLength < indexesLength * 4) { 110 throw new ICUException("not enough indexes"); 111 } 112 int[] inIndexes = new int[IX_TOTAL_SIZE + 1]; 113 inIndexes[0] = indexesLength; 114 for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) { 115 inIndexes[i] = inBytes.getInt(); 116 } 117 for(int i = indexesLength; i < inIndexes.length; ++i) { 118 inIndexes[i] = -1; 119 } 120 if(indexesLength > inIndexes.length) { 121 ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4); 122 } 123 124 // Assume that the tailoring data is in initial state, 125 // with null pointers and 0 lengths. 126 127 // Set pointers to non-empty data parts. 128 // Do this in order of their byte offsets. (Should help porting to Java.) 129 130 int index; // one of the indexes[] slots 131 int offset; // byte offset for the index part 132 int length; // number of bytes in the index part 133 134 if(indexesLength > IX_TOTAL_SIZE) { 135 length = inIndexes[IX_TOTAL_SIZE]; 136 } else if(indexesLength > IX_REORDER_CODES_OFFSET) { 137 length = inIndexes[indexesLength - 1]; 138 } else { 139 length = 0; // only indexes, and inLength was already checked for them 140 } 141 if(inLength < length) { 142 throw new ICUException("not enough bytes"); 143 } 144 145 CollationData baseData = base == null ? null : base.data; 146 int[] reorderCodes; 147 int reorderCodesLength; 148 index = IX_REORDER_CODES_OFFSET; 149 offset = inIndexes[index]; 150 length = inIndexes[index + 1] - offset; 151 if(length >= 4) { 152 if(baseData == null) { 153 // We assume for collation settings that 154 // the base data does not have a reordering. 155 throw new ICUException("Collation base data must not reorder scripts"); 156 } 157 reorderCodesLength = length / 4; 158 reorderCodes = new int[reorderCodesLength]; 159 for(int i = 0; i < reorderCodesLength; ++i) { 160 reorderCodes[i] = inBytes.getInt(); 161 } 162 length &= 3; 163 164 // The reorderRanges (if any) are the trailing reorderCodes entries. 165 // Split the array at the boundary. 166 // Script or reorder codes do not exceed 16-bit values. 167 // Range limits are stored in the upper 16 bits, and are never 0. 168 int reorderRangesLength = 0; 169 while(reorderRangesLength < reorderCodesLength && 170 (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) { 171 ++reorderRangesLength; 172 } 173 assert(reorderRangesLength < reorderCodesLength); 174 reorderCodesLength -= reorderRangesLength; 175 } else { 176 reorderCodes = new int[0]; 177 reorderCodesLength = 0; 178 } 179 ICUBinary.skipBytes(inBytes, length); 180 181 // There should be a reorder table only if there are reorder codes. 182 // However, when there are reorder codes the reorder table may be omitted to reduce 183 // the data size. 184 byte[] reorderTable = null; 185 index = IX_REORDER_TABLE_OFFSET; 186 offset = inIndexes[index]; 187 length = inIndexes[index + 1] - offset; 188 if(length >= 256) { 189 if(reorderCodesLength == 0) { 190 throw new ICUException("Reordering table without reordering codes"); 191 } 192 reorderTable = new byte[256]; 193 inBytes.get(reorderTable); 194 length -= 256; 195 } else { 196 // If we have reorder codes, then build the reorderTable at the end, 197 // when the CollationData is otherwise complete. 198 } 199 ICUBinary.skipBytes(inBytes, length); 200 201 if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) { 202 throw new ICUException("Tailoring numeric primary weight differs from base data"); 203 } 204 CollationData data = null; // Remains null if there are no mappings. 205 206 index = IX_TRIE_OFFSET; 207 offset = inIndexes[index]; 208 length = inIndexes[index + 1] - offset; 209 if(length >= 8) { 210 tailoring.ensureOwnedData(); 211 data = tailoring.ownedData; 212 data.base = baseData; 213 data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L; 214 data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes); 215 int trieLength = data.trie.getSerializedLength(); 216 if(trieLength > length) { 217 throw new ICUException("Not enough bytes for the mappings trie"); // No mappings. 218 } 219 length -= trieLength; 220 } else if(baseData != null) { 221 // Use the base data. Only the settings are tailored. 222 tailoring.data = baseData; 223 } else { 224 throw new ICUException("Missing collation data mappings"); // No mappings. 225 } 226 ICUBinary.skipBytes(inBytes, length); 227 228 index = IX_RESERVED8_OFFSET; 229 offset = inIndexes[index]; 230 length = inIndexes[index + 1] - offset; 231 ICUBinary.skipBytes(inBytes, length); 232 233 index = IX_CES_OFFSET; 234 offset = inIndexes[index]; 235 length = inIndexes[index + 1] - offset; 236 if(length >= 8) { 237 if(data == null) { 238 throw new ICUException("Tailored ces without tailored trie"); 239 } 240 data.ces = new long[length / 8]; 241 for(int i = 0; i < length / 8; ++i) { 242 data.ces[i] = inBytes.getLong(); 243 } 244 length &= 7; 245 } 246 ICUBinary.skipBytes(inBytes, length); 247 248 index = IX_RESERVED10_OFFSET; 249 offset = inIndexes[index]; 250 length = inIndexes[index + 1] - offset; 251 ICUBinary.skipBytes(inBytes, length); 252 253 index = IX_CE32S_OFFSET; 254 offset = inIndexes[index]; 255 length = inIndexes[index + 1] - offset; 256 if(length >= 4) { 257 if(data == null) { 258 throw new ICUException("Tailored ce32s without tailored trie"); 259 } 260 data.ce32s = new int[length / 4]; 261 for(int i = 0; i < length / 4; ++i) { 262 data.ce32s[i] = inBytes.getInt(); 263 } 264 length &= 3; 265 } 266 ICUBinary.skipBytes(inBytes, length); 267 268 int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START]; 269 if(jamoCE32sStart >= 0) { 270 if(data == null || data.ce32s == null) { 271 throw new ICUException("JamoCE32sStart index into non-existent ce32s[]"); 272 } 273 data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 274 System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH); 275 } else if(data == null) { 276 // Nothing to do. 277 } else if(baseData != null) { 278 data.jamoCE32s = baseData.jamoCE32s; 279 } else { 280 throw new ICUException("Missing Jamo CE32s for Hangul processing"); 281 } 282 283 index = IX_ROOT_ELEMENTS_OFFSET; 284 offset = inIndexes[index]; 285 length = inIndexes[index + 1] - offset; 286 if(length >= 4) { 287 int rootElementsLength = length / 4; 288 if(data == null) { 289 throw new ICUException("Root elements but no mappings"); 290 } 291 if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) { 292 throw new ICUException("Root elements array too short"); 293 } 294 data.rootElements = new long[rootElementsLength]; 295 for(int i = 0; i < rootElementsLength; ++i) { 296 data.rootElements[i] = inBytes.getInt() & 0xffffffffL; // unsigned int -> long 297 } 298 long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE]; 299 if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) { 300 throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value"); 301 } 302 long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES]; 303 if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) { 304 // [fixed last secondary common byte] is too low, 305 // and secondary weights would collide with compressed common secondaries. 306 throw new ICUException("[fixed last secondary common byte] is too low"); 307 } 308 length &= 3; 309 } 310 ICUBinary.skipBytes(inBytes, length); 311 312 index = IX_CONTEXTS_OFFSET; 313 offset = inIndexes[index]; 314 length = inIndexes[index + 1] - offset; 315 if(length >= 2) { 316 if(data == null) { 317 throw new ICUException("Tailored contexts without tailored trie"); 318 } 319 StringBuilder sb = new StringBuilder(length / 2); 320 for(int i = 0; i < length / 2; ++i) { 321 sb.append(inBytes.getChar()); 322 } 323 data.contexts = sb.toString(); 324 length &= 1; 325 } 326 ICUBinary.skipBytes(inBytes, length); 327 328 index = IX_UNSAFE_BWD_OFFSET; 329 offset = inIndexes[index]; 330 length = inIndexes[index + 1] - offset; 331 if(length >= 2) { 332 if(data == null) { 333 throw new ICUException("Unsafe-backward-set but no mappings"); 334 } 335 if(baseData == null) { 336 // Create the unsafe-backward set for the root collator. 337 // Include all non-zero combining marks and trail surrogates. 338 // We do this at load time, rather than at build time, 339 // to simplify Unicode version bootstrapping: 340 // The root data builder only needs the new FractionalUCA.txt data, 341 // but it need not be built with a version of ICU already updated to 342 // the corresponding new Unicode Character Database. 343 // 344 // The following is an optimized version of 345 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). 346 // It is faster and requires fewer code dependencies. 347 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates 348 data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet); 349 } else { 350 // Clone the root collator's set contents. 351 tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed(); 352 } 353 // Add the ranges from the data file to the unsafe-backward set. 354 USerializedSet sset = new USerializedSet(); 355 char[] unsafeData = new char[length / 2]; 356 for(int i = 0; i < length / 2; ++i) { 357 unsafeData[i] = inBytes.getChar(); 358 } 359 length &= 1; 360 sset.getSet(unsafeData, 0); 361 int count = sset.countRanges(); 362 int[] range = new int[2]; 363 for(int i = 0; i < count; ++i) { 364 sset.getRange(i, range); 365 tailoring.unsafeBackwardSet.add(range[0], range[1]); 366 } 367 // Mark each lead surrogate as "unsafe" 368 // if any of its 1024 associated supplementary code points is "unsafe". 369 int c = 0x10000; 370 for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 371 if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) { 372 tailoring.unsafeBackwardSet.add(lead); 373 } 374 } 375 tailoring.unsafeBackwardSet.freeze(); 376 data.unsafeBackwardSet = tailoring.unsafeBackwardSet; 377 } else if(data == null) { 378 // Nothing to do. 379 } else if(baseData != null) { 380 // No tailoring-specific data: Alias the root collator's set. 381 data.unsafeBackwardSet = baseData.unsafeBackwardSet; 382 } else { 383 throw new ICUException("Missing unsafe-backward-set"); 384 } 385 ICUBinary.skipBytes(inBytes, length); 386 387 // If the fast Latin format version is different, 388 // or the version is set to 0 for "no fast Latin table", 389 // then just always use the normal string comparison path. 390 index = IX_FAST_LATIN_TABLE_OFFSET; 391 offset = inIndexes[index]; 392 length = inIndexes[index + 1] - offset; 393 if(data != null) { 394 data.fastLatinTable = null; 395 data.fastLatinTableHeader = null; 396 if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) { 397 if(length >= 2) { 398 char header0 = inBytes.getChar(); 399 int headerLength = header0 & 0xff; 400 data.fastLatinTableHeader = new char[headerLength]; 401 data.fastLatinTableHeader[0] = header0; 402 for(int i = 1; i < headerLength; ++i) { 403 data.fastLatinTableHeader[i] = inBytes.getChar(); 404 } 405 int tableLength = length / 2 - headerLength; 406 data.fastLatinTable = new char[tableLength]; 407 for(int i = 0; i < tableLength; ++i) { 408 data.fastLatinTable[i] = inBytes.getChar(); 409 } 410 length &= 1; 411 if((header0 >> 8) != CollationFastLatin.VERSION) { 412 throw new ICUException("Fast-Latin table version differs from version in data header"); 413 } 414 } else if(baseData != null) { 415 data.fastLatinTable = baseData.fastLatinTable; 416 data.fastLatinTableHeader = baseData.fastLatinTableHeader; 417 } 418 } 419 } 420 ICUBinary.skipBytes(inBytes, length); 421 422 index = IX_SCRIPTS_OFFSET; 423 offset = inIndexes[index]; 424 length = inIndexes[index + 1] - offset; 425 if(length >= 2) { 426 if(data == null) { 427 throw new ICUException("Script order data but no mappings"); 428 } 429 int scriptsLength = length / 2; 430 CharBuffer inChars = inBytes.asCharBuffer(); 431 data.numScripts = inChars.get(); 432 // There must be enough entries for both arrays, including more than two range starts. 433 int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16); 434 if(scriptStartsLength <= 2) { 435 throw new ICUException("Script order data too short"); 436 } 437 inChars.get(data.scriptsIndex = new char[data.numScripts + 16]); 438 inChars.get(data.scriptStarts = new char[scriptStartsLength]); 439 if(!(data.scriptStarts[0] == 0 && 440 data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) && 441 data.scriptStarts[scriptStartsLength - 1] == 442 (Collation.TRAIL_WEIGHT_BYTE << 8))) { 443 throw new ICUException("Script order data not valid"); 444 } 445 } else if(data == null) { 446 // Nothing to do. 447 } else if(baseData != null) { 448 data.numScripts = baseData.numScripts; 449 data.scriptsIndex = baseData.scriptsIndex; 450 data.scriptStarts = baseData.scriptStarts; 451 } 452 ICUBinary.skipBytes(inBytes, length); 453 454 index = IX_COMPRESSIBLE_BYTES_OFFSET; 455 offset = inIndexes[index]; 456 length = inIndexes[index + 1] - offset; 457 if(length >= 256) { 458 if(data == null) { 459 throw new ICUException("Data for compressible primary lead bytes but no mappings"); 460 } 461 data.compressibleBytes = new boolean[256]; 462 for(int i = 0; i < 256; ++i) { 463 data.compressibleBytes[i] = inBytes.get() != 0; 464 } 465 length -= 256; 466 } else if(data == null) { 467 // Nothing to do. 468 } else if(baseData != null) { 469 data.compressibleBytes = baseData.compressibleBytes; 470 } else { 471 throw new ICUException("Missing data for compressible primary lead bytes"); 472 } 473 ICUBinary.skipBytes(inBytes, length); 474 475 index = IX_RESERVED18_OFFSET; 476 offset = inIndexes[index]; 477 length = inIndexes[index + 1] - offset; 478 ICUBinary.skipBytes(inBytes, length); 479 480 CollationSettings ts = tailoring.settings.readOnly(); 481 int options = inIndexes[IX_OPTIONS] & 0xffff; 482 char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT]; 483 int fastLatinOptions = CollationFastLatin.getOptions( 484 tailoring.data, ts, fastLatinPrimaries); 485 if(options == ts.options && ts.variableTop != 0 && 486 Arrays.equals(reorderCodes, ts.reorderCodes) && 487 fastLatinOptions == ts.fastLatinOptions && 488 (fastLatinOptions < 0 || 489 Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) { 490 return; 491 } 492 493 CollationSettings settings = tailoring.settings.copyOnWrite(); 494 settings.options = options; 495 // Set variableTop from options and scripts data. 496 settings.variableTop = tailoring.data.getLastPrimaryForGroup( 497 Collator.ReorderCodes.FIRST + settings.getMaxVariable()); 498 if(settings.variableTop == 0) { 499 throw new ICUException("The maxVariable could not be mapped to a variableTop"); 500 } 501 502 if(reorderCodesLength != 0) { 503 settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable); 504 } 505 506 settings.fastLatinOptions = CollationFastLatin.getOptions( 507 tailoring.data, settings, 508 settings.fastLatinPrimaries); 509 } 510 511 private static final class IsAcceptable implements ICUBinary.Authenticate { 512 // @Override when we switch to Java 6 513 public boolean isDataVersionAcceptable(byte version[]) { 514 return version[0] == 5; 515 } 516 } 517 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 518 private static final int DATA_FORMAT = 0x55436f6c; // "UCol" 519 520 private CollationDataReader() {} // no constructor 521} 522 523/* 524 * Format of collation data (ucadata.icu, binary data in coll/ *.res files): 525 * See ICU4C source/common/collationdatareader.h. 526 */ 527