1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4******************************************************************************* 5* Copyright (C) 2013-2015, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* CollationDataReader.java, ported from collationdatareader.h/.cpp 9* 10* C++ version created on: 2013feb07 11* created by: Markus W. Scherer 12*/ 13 14package com.ibm.icu.impl.coll; 15 16import java.io.IOException; 17import java.nio.ByteBuffer; 18import java.nio.CharBuffer; 19import java.util.Arrays; 20 21import com.ibm.icu.impl.ICUBinary; 22import com.ibm.icu.impl.Trie2_32; 23import com.ibm.icu.impl.USerializedSet; 24import com.ibm.icu.text.Collator; 25import com.ibm.icu.text.UnicodeSet; 26import com.ibm.icu.util.ICUException; 27 28/** 29 * Collation binary data reader. 30 */ 31final class CollationDataReader /* all static */ { 32 // The following constants are also copied into source/common/ucol_swp.cpp. 33 // Keep them in sync! 34 /** 35 * Number of int indexes. 36 * 37 * Can be 2 if there are only options. 38 * Can be 7 or 8 if there are only options and a script reordering. 39 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. 40 */ 41 static final int IX_INDEXES_LENGTH = 0; 42 /** 43 * Bits 31..24: numericPrimary, for numeric collation 44 * 23..16: fast Latin format version (0 = no fast Latin table) 45 * 15.. 0: options bit set 46 */ 47 static final int IX_OPTIONS = 1; 48 static final int IX_RESERVED2 = 2; 49 static final int IX_RESERVED3 = 3; 50 51 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ 52 static final int IX_JAMO_CE32S_START = 4; 53 54 // Byte offsets from the start of the data, after the generic header. 55 // The indexes[] are at byte offset 0, other data follows. 56 // Each data item is aligned properly. 57 // The data items should be in descending order of unit size, 58 // to minimize the need for padding. 59 // Each item's byte length is given by the difference between its offset and 60 // the next index/offset value. 61 /** Byte offset to int reorderCodes[]. */ 62 static final int IX_REORDER_CODES_OFFSET = 5; 63 /** 64 * Byte offset to uint8_t reorderTable[]. 65 * Empty table if <256 bytes (padding only). 66 * Otherwise 256 bytes or more (with padding). 67 */ 68 static final int IX_REORDER_TABLE_OFFSET = 6; 69 /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ 70 static final int IX_TRIE_OFFSET = 7; 71 72 static final int IX_RESERVED8_OFFSET = 8; 73 /** Byte offset to long ces[]. */ 74 static final int IX_CES_OFFSET = 9; 75 static final int IX_RESERVED10_OFFSET = 10; 76 /** Byte offset to int ce32s[]. */ 77 static final int IX_CE32S_OFFSET = 11; 78 79 /** Byte offset to uint32_t rootElements[]. */ 80 static final int IX_ROOT_ELEMENTS_OFFSET = 12; 81 /** Byte offset to UChar *contexts[]. */ 82 static final int IX_CONTEXTS_OFFSET = 13; 83 /** Byte offset to char [] with serialized unsafeBackwardSet. */ 84 static final int IX_UNSAFE_BWD_OFFSET = 14; 85 /** Byte offset to char fastLatinTable[]. */ 86 static final int IX_FAST_LATIN_TABLE_OFFSET = 15; 87 88 /** Byte offset to char scripts[]. */ 89 static final int IX_SCRIPTS_OFFSET = 16; 90 /** 91 * Byte offset to boolean compressibleBytes[]. 92 * Empty table if <256 bytes (padding only). 93 * Otherwise 256 bytes or more (with padding). 94 */ 95 static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17; 96 static final int IX_RESERVED18_OFFSET = 18; 97 static final int IX_TOTAL_SIZE = 19; 98 99 static void read(CollationTailoring base, ByteBuffer inBytes, 100 CollationTailoring tailoring) throws IOException { 101 tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE); 102 if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) { 103 throw new ICUException("Tailoring UCA version differs from base data UCA version"); 104 } 105 106 int inLength = inBytes.remaining(); 107 if(inLength < 8) { 108 throw new ICUException("not enough bytes"); 109 } 110 int indexesLength = inBytes.getInt(); // inIndexes[IX_INDEXES_LENGTH] 111 if(indexesLength < 2 || inLength < indexesLength * 4) { 112 throw new ICUException("not enough indexes"); 113 } 114 int[] inIndexes = new int[IX_TOTAL_SIZE + 1]; 115 inIndexes[0] = indexesLength; 116 for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) { 117 inIndexes[i] = inBytes.getInt(); 118 } 119 for(int i = indexesLength; i < inIndexes.length; ++i) { 120 inIndexes[i] = -1; 121 } 122 if(indexesLength > inIndexes.length) { 123 ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4); 124 } 125 126 // Assume that the tailoring data is in initial state, 127 // with null pointers and 0 lengths. 128 129 // Set pointers to non-empty data parts. 130 // Do this in order of their byte offsets. (Should help porting to Java.) 131 132 int index; // one of the indexes[] slots 133 int offset; // byte offset for the index part 134 int length; // number of bytes in the index part 135 136 if(indexesLength > IX_TOTAL_SIZE) { 137 length = inIndexes[IX_TOTAL_SIZE]; 138 } else if(indexesLength > IX_REORDER_CODES_OFFSET) { 139 length = inIndexes[indexesLength - 1]; 140 } else { 141 length = 0; // only indexes, and inLength was already checked for them 142 } 143 if(inLength < length) { 144 throw new ICUException("not enough bytes"); 145 } 146 147 CollationData baseData = base == null ? null : base.data; 148 int[] reorderCodes; 149 int reorderCodesLength; 150 index = IX_REORDER_CODES_OFFSET; 151 offset = inIndexes[index]; 152 length = inIndexes[index + 1] - offset; 153 if(length >= 4) { 154 if(baseData == null) { 155 // We assume for collation settings that 156 // the base data does not have a reordering. 157 throw new ICUException("Collation base data must not reorder scripts"); 158 } 159 reorderCodesLength = length / 4; 160 reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3); 161 162 // The reorderRanges (if any) are the trailing reorderCodes entries. 163 // Split the array at the boundary. 164 // Script or reorder codes do not exceed 16-bit values. 165 // Range limits are stored in the upper 16 bits, and are never 0. 166 int reorderRangesLength = 0; 167 while(reorderRangesLength < reorderCodesLength && 168 (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) { 169 ++reorderRangesLength; 170 } 171 assert(reorderRangesLength < reorderCodesLength); 172 reorderCodesLength -= reorderRangesLength; 173 } else { 174 reorderCodes = new int[0]; 175 reorderCodesLength = 0; 176 ICUBinary.skipBytes(inBytes, length); 177 } 178 179 // There should be a reorder table only if there are reorder codes. 180 // However, when there are reorder codes the reorder table may be omitted to reduce 181 // the data size. 182 byte[] reorderTable = null; 183 index = IX_REORDER_TABLE_OFFSET; 184 offset = inIndexes[index]; 185 length = inIndexes[index + 1] - offset; 186 if(length >= 256) { 187 if(reorderCodesLength == 0) { 188 throw new ICUException("Reordering table without reordering codes"); 189 } 190 reorderTable = new byte[256]; 191 inBytes.get(reorderTable); 192 length -= 256; 193 } else { 194 // If we have reorder codes, then build the reorderTable at the end, 195 // when the CollationData is otherwise complete. 196 } 197 ICUBinary.skipBytes(inBytes, length); 198 199 if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) { 200 throw new ICUException("Tailoring numeric primary weight differs from base data"); 201 } 202 CollationData data = null; // Remains null if there are no mappings. 203 204 index = IX_TRIE_OFFSET; 205 offset = inIndexes[index]; 206 length = inIndexes[index + 1] - offset; 207 if(length >= 8) { 208 tailoring.ensureOwnedData(); 209 data = tailoring.ownedData; 210 data.base = baseData; 211 data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L; 212 data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes); 213 int trieLength = data.trie.getSerializedLength(); 214 if(trieLength > length) { 215 throw new ICUException("Not enough bytes for the mappings trie"); // No mappings. 216 } 217 length -= trieLength; 218 } else if(baseData != null) { 219 // Use the base data. Only the settings are tailored. 220 tailoring.data = baseData; 221 } else { 222 throw new ICUException("Missing collation data mappings"); // No mappings. 223 } 224 ICUBinary.skipBytes(inBytes, length); 225 226 index = IX_RESERVED8_OFFSET; 227 offset = inIndexes[index]; 228 length = inIndexes[index + 1] - offset; 229 ICUBinary.skipBytes(inBytes, length); 230 231 index = IX_CES_OFFSET; 232 offset = inIndexes[index]; 233 length = inIndexes[index + 1] - offset; 234 if(length >= 8) { 235 if(data == null) { 236 throw new ICUException("Tailored ces without tailored trie"); 237 } 238 data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7); 239 } else { 240 ICUBinary.skipBytes(inBytes, length); 241 } 242 243 index = IX_RESERVED10_OFFSET; 244 offset = inIndexes[index]; 245 length = inIndexes[index + 1] - offset; 246 ICUBinary.skipBytes(inBytes, length); 247 248 index = IX_CE32S_OFFSET; 249 offset = inIndexes[index]; 250 length = inIndexes[index + 1] - offset; 251 if(length >= 4) { 252 if(data == null) { 253 throw new ICUException("Tailored ce32s without tailored trie"); 254 } 255 data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3); 256 } else { 257 ICUBinary.skipBytes(inBytes, length); 258 } 259 260 int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START]; 261 if(jamoCE32sStart >= 0) { 262 if(data == null || data.ce32s == null) { 263 throw new ICUException("JamoCE32sStart index into non-existent ce32s[]"); 264 } 265 data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 266 System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH); 267 } else if(data == null) { 268 // Nothing to do. 269 } else if(baseData != null) { 270 data.jamoCE32s = baseData.jamoCE32s; 271 } else { 272 throw new ICUException("Missing Jamo CE32s for Hangul processing"); 273 } 274 275 index = IX_ROOT_ELEMENTS_OFFSET; 276 offset = inIndexes[index]; 277 length = inIndexes[index + 1] - offset; 278 if(length >= 4) { 279 int rootElementsLength = length / 4; 280 if(data == null) { 281 throw new ICUException("Root elements but no mappings"); 282 } 283 if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) { 284 throw new ICUException("Root elements array too short"); 285 } 286 data.rootElements = new long[rootElementsLength]; 287 for(int i = 0; i < rootElementsLength; ++i) { 288 data.rootElements[i] = inBytes.getInt() & 0xffffffffL; // unsigned int -> long 289 } 290 long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE]; 291 if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) { 292 throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value"); 293 } 294 long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES]; 295 if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) { 296 // [fixed last secondary common byte] is too low, 297 // and secondary weights would collide with compressed common secondaries. 298 throw new ICUException("[fixed last secondary common byte] is too low"); 299 } 300 length &= 3; 301 } 302 ICUBinary.skipBytes(inBytes, length); 303 304 index = IX_CONTEXTS_OFFSET; 305 offset = inIndexes[index]; 306 length = inIndexes[index + 1] - offset; 307 if(length >= 2) { 308 if(data == null) { 309 throw new ICUException("Tailored contexts without tailored trie"); 310 } 311 data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1); 312 } else { 313 ICUBinary.skipBytes(inBytes, length); 314 } 315 316 index = IX_UNSAFE_BWD_OFFSET; 317 offset = inIndexes[index]; 318 length = inIndexes[index + 1] - offset; 319 if(length >= 2) { 320 if(data == null) { 321 throw new ICUException("Unsafe-backward-set but no mappings"); 322 } 323 if(baseData == null) { 324 // Create the unsafe-backward set for the root collator. 325 // Include all non-zero combining marks and trail surrogates. 326 // We do this at load time, rather than at build time, 327 // to simplify Unicode version bootstrapping: 328 // The root data builder only needs the new FractionalUCA.txt data, 329 // but it need not be built with a version of ICU already updated to 330 // the corresponding new Unicode Character Database. 331 // 332 // The following is an optimized version of 333 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). 334 // It is faster and requires fewer code dependencies. 335 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates 336 data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet); 337 } else { 338 // Clone the root collator's set contents. 339 tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed(); 340 } 341 // Add the ranges from the data file to the unsafe-backward set. 342 USerializedSet sset = new USerializedSet(); 343 char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1); 344 length = 0; 345 sset.getSet(unsafeData, 0); 346 int count = sset.countRanges(); 347 int[] range = new int[2]; 348 for(int i = 0; i < count; ++i) { 349 sset.getRange(i, range); 350 tailoring.unsafeBackwardSet.add(range[0], range[1]); 351 } 352 // Mark each lead surrogate as "unsafe" 353 // if any of its 1024 associated supplementary code points is "unsafe". 354 int c = 0x10000; 355 for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 356 if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) { 357 tailoring.unsafeBackwardSet.add(lead); 358 } 359 } 360 tailoring.unsafeBackwardSet.freeze(); 361 data.unsafeBackwardSet = tailoring.unsafeBackwardSet; 362 } else if(data == null) { 363 // Nothing to do. 364 } else if(baseData != null) { 365 // No tailoring-specific data: Alias the root collator's set. 366 data.unsafeBackwardSet = baseData.unsafeBackwardSet; 367 } else { 368 throw new ICUException("Missing unsafe-backward-set"); 369 } 370 ICUBinary.skipBytes(inBytes, length); 371 372 // If the fast Latin format version is different, 373 // or the version is set to 0 for "no fast Latin table", 374 // then just always use the normal string comparison path. 375 index = IX_FAST_LATIN_TABLE_OFFSET; 376 offset = inIndexes[index]; 377 length = inIndexes[index + 1] - offset; 378 if(data != null) { 379 data.fastLatinTable = null; 380 data.fastLatinTableHeader = null; 381 if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) { 382 if(length >= 2) { 383 char header0 = inBytes.getChar(); 384 int headerLength = header0 & 0xff; 385 data.fastLatinTableHeader = new char[headerLength]; 386 data.fastLatinTableHeader[0] = header0; 387 for(int i = 1; i < headerLength; ++i) { 388 data.fastLatinTableHeader[i] = inBytes.getChar(); 389 } 390 int tableLength = length / 2 - headerLength; 391 data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1); 392 length = 0; 393 if((header0 >> 8) != CollationFastLatin.VERSION) { 394 throw new ICUException("Fast-Latin table version differs from version in data header"); 395 } 396 } else if(baseData != null) { 397 data.fastLatinTable = baseData.fastLatinTable; 398 data.fastLatinTableHeader = baseData.fastLatinTableHeader; 399 } 400 } 401 } 402 ICUBinary.skipBytes(inBytes, length); 403 404 index = IX_SCRIPTS_OFFSET; 405 offset = inIndexes[index]; 406 length = inIndexes[index + 1] - offset; 407 if(length >= 2) { 408 if(data == null) { 409 throw new ICUException("Script order data but no mappings"); 410 } 411 int scriptsLength = length / 2; 412 CharBuffer inChars = inBytes.asCharBuffer(); 413 data.numScripts = inChars.get(); 414 // There must be enough entries for both arrays, including more than two range starts. 415 int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16); 416 if(scriptStartsLength <= 2) { 417 throw new ICUException("Script order data too short"); 418 } 419 inChars.get(data.scriptsIndex = new char[data.numScripts + 16]); 420 inChars.get(data.scriptStarts = new char[scriptStartsLength]); 421 if(!(data.scriptStarts[0] == 0 && 422 data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) && 423 data.scriptStarts[scriptStartsLength - 1] == 424 (Collation.TRAIL_WEIGHT_BYTE << 8))) { 425 throw new ICUException("Script order data not valid"); 426 } 427 } else if(data == null) { 428 // Nothing to do. 429 } else if(baseData != null) { 430 data.numScripts = baseData.numScripts; 431 data.scriptsIndex = baseData.scriptsIndex; 432 data.scriptStarts = baseData.scriptStarts; 433 } 434 ICUBinary.skipBytes(inBytes, length); 435 436 index = IX_COMPRESSIBLE_BYTES_OFFSET; 437 offset = inIndexes[index]; 438 length = inIndexes[index + 1] - offset; 439 if(length >= 256) { 440 if(data == null) { 441 throw new ICUException("Data for compressible primary lead bytes but no mappings"); 442 } 443 data.compressibleBytes = new boolean[256]; 444 for(int i = 0; i < 256; ++i) { 445 data.compressibleBytes[i] = inBytes.get() != 0; 446 } 447 length -= 256; 448 } else if(data == null) { 449 // Nothing to do. 450 } else if(baseData != null) { 451 data.compressibleBytes = baseData.compressibleBytes; 452 } else { 453 throw new ICUException("Missing data for compressible primary lead bytes"); 454 } 455 ICUBinary.skipBytes(inBytes, length); 456 457 index = IX_RESERVED18_OFFSET; 458 offset = inIndexes[index]; 459 length = inIndexes[index + 1] - offset; 460 ICUBinary.skipBytes(inBytes, length); 461 462 CollationSettings ts = tailoring.settings.readOnly(); 463 int options = inIndexes[IX_OPTIONS] & 0xffff; 464 char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT]; 465 int fastLatinOptions = CollationFastLatin.getOptions( 466 tailoring.data, ts, fastLatinPrimaries); 467 if(options == ts.options && ts.variableTop != 0 && 468 Arrays.equals(reorderCodes, ts.reorderCodes) && 469 fastLatinOptions == ts.fastLatinOptions && 470 (fastLatinOptions < 0 || 471 Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) { 472 return; 473 } 474 475 CollationSettings settings = tailoring.settings.copyOnWrite(); 476 settings.options = options; 477 // Set variableTop from options and scripts data. 478 settings.variableTop = tailoring.data.getLastPrimaryForGroup( 479 Collator.ReorderCodes.FIRST + settings.getMaxVariable()); 480 if(settings.variableTop == 0) { 481 throw new ICUException("The maxVariable could not be mapped to a variableTop"); 482 } 483 484 if(reorderCodesLength != 0) { 485 settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable); 486 } 487 488 settings.fastLatinOptions = CollationFastLatin.getOptions( 489 tailoring.data, settings, 490 settings.fastLatinPrimaries); 491 } 492 493 private static final class IsAcceptable implements ICUBinary.Authenticate { 494 @Override 495 public boolean isDataVersionAcceptable(byte version[]) { 496 return version[0] == 5; 497 } 498 } 499 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 500 private static final int DATA_FORMAT = 0x55436f6c; // "UCol" 501 502 private CollationDataReader() {} // no constructor 503} 504 505/* 506 * Format of collation data (ucadata.icu, binary data in coll/ *.res files): 507 * See ICU4C source/common/collationdatareader.h. 508 */ 509