// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationDataReader.java, ported from collationdatareader.h/.cpp
*
* C++ version created on: 2013feb07
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.util.Arrays;

import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.Trie2_32;
import com.ibm.icu.impl.USerializedSet;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUException;

/**
 * Collation binary data reader.
 */
final class CollationDataReader /* all static */ {
    // The following constants are also copied into source/common/ucol_swp.cpp.
    // Keep them in sync!
    /**
     * Number of int indexes.
     *
     * Can be 2 if there are only options.
     * Can be 7 or 8 if there are only options and a script reordering.
     * This loader sets any index>=indexes[IX_INDEXES_LENGTH] to -1,
     * which yields an empty part (or "none" for IX_JAMO_CE32S_START).
     */
    static final int IX_INDEXES_LENGTH = 0;
    /**
     * Bits 31..24: numericPrimary, for numeric collation
     *      23..16: fast Latin format version (0 = no fast Latin table)
     *      15.. 0: options bit set
     */
    static final int IX_OPTIONS = 1;
    static final int IX_RESERVED2 = 2;
    static final int IX_RESERVED3 = 3;

    /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
    static final int IX_JAMO_CE32S_START = 4;

    // Byte offsets from the start of the data, after the generic header.
    // The indexes[] are at byte offset 0, other data follows.
    // Each data item is aligned properly.
    // The data items should be in descending order of unit size,
    // to minimize the need for padding.
    // Each item's byte length is given by the difference between its offset and
    // the next index/offset value.
    /** Byte offset to int reorderCodes[]. */
    static final int IX_REORDER_CODES_OFFSET = 5;
    /**
     * Byte offset to uint8_t reorderTable[].
     * Empty table if <256 bytes (padding only).
     * Otherwise 256 bytes or more (with padding).
     */
    static final int IX_REORDER_TABLE_OFFSET = 6;
    /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
    static final int IX_TRIE_OFFSET = 7;

    static final int IX_RESERVED8_OFFSET = 8;
    /** Byte offset to long ces[]. */
    static final int IX_CES_OFFSET = 9;
    static final int IX_RESERVED10_OFFSET = 10;
    /** Byte offset to int ce32s[]. */
    static final int IX_CE32S_OFFSET = 11;

    /** Byte offset to uint32_t rootElements[]. */
    static final int IX_ROOT_ELEMENTS_OFFSET = 12;
    /** Byte offset to UChar *contexts[]. */
    static final int IX_CONTEXTS_OFFSET = 13;
    /** Byte offset to char [] with serialized unsafeBackwardSet. */
    static final int IX_UNSAFE_BWD_OFFSET = 14;
    /** Byte offset to char fastLatinTable[]. */
    static final int IX_FAST_LATIN_TABLE_OFFSET = 15;

    /** Byte offset to char scripts[]. */
    static final int IX_SCRIPTS_OFFSET = 16;
    /**
     * Byte offset to boolean compressibleBytes[].
     * Empty table if <256 bytes (padding only).
     * Otherwise 256 bytes or more (with padding).
     */
    static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
    static final int IX_RESERVED18_OFFSET = 18;
    static final int IX_TOTAL_SIZE = 19;

    /**
     * Reads serialized collation data from {@code inBytes} into {@code tailoring}.
     *
     * <p>The data header is read first, then the indexes[] array, and then each data part
     * strictly in the order of its byte offset, because the buffer is consumed sequentially.
     * Parts that are absent in the input are aliased from the base data where the code
     * below allows that. Finally the collation settings are updated if they differ from
     * what the data file specifies.
     *
     * @param base the tailoring whose data serves as the base (root) data,
     *             or null when the data being read has no base
     * @param inBytes the input bytes, positioned at the start of the data header
     * @param tailoring the tailoring object that receives the version, data and settings
     * @throws IOException if thrown by the underlying ICUBinary read operations
     * @throws ICUException if the data is truncated, inconsistent, or inconsistent
     *             with the base data
     */
    static void read(CollationTailoring base, ByteBuffer inBytes,
                     CollationTailoring tailoring) throws IOException {
        tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
        if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
            throw new ICUException("Tailoring UCA version differs from base data UCA version");
        }

        int inLength = inBytes.remaining();
        if(inLength < 8) {
            throw new ICUException("not enough bytes");
        }
        int indexesLength = inBytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
        // Compare against inLength / 4 rather than multiplying indexesLength by 4:
        // the multiplication would silently overflow for a large (corrupt) value
        // and let the check pass.
        if(indexesLength < 2 || indexesLength > inLength / 4) {
            throw new ICUException("not enough indexes");
        }
        int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
        inIndexes[0] = indexesLength;
        for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
            inIndexes[i] = inBytes.getInt();
        }
        // Indexes that are not present in the input are marked with -1.
        for(int i = indexesLength; i < inIndexes.length; ++i) {
            inIndexes[i] = -1;
        }
        if(indexesLength > inIndexes.length) {
            // Skip indexes that this version of the reader does not know about.
            // Cannot overflow: indexesLength * 4 <= inLength was verified above.
            ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
        }

        // Assume that the tailoring data is in initial state,
        // with null pointers and 0 lengths.

        // Set pointers to non-empty data parts.
        // Do this in order of their byte offsets. (Should help porting to Java.)

        int index;  // one of the indexes[] slots
        int offset;  // byte offset for the index part
        int length;  // number of bytes in the index part

        if(indexesLength > IX_TOTAL_SIZE) {
            length = inIndexes[IX_TOTAL_SIZE];
        } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
            length = inIndexes[indexesLength - 1];
        } else {
            length = 0;  // only indexes, and inLength was already checked for them
        }
        if(inLength < length) {
            throw new ICUException("not enough bytes");
        }

        CollationData baseData = base == null ? null : base.data;
        int[] reorderCodes;
        int reorderCodesLength;
        index = IX_REORDER_CODES_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 4) {
            if(baseData == null) {
                // We assume for collation settings that
                // the base data does not have a reordering.
                throw new ICUException("Collation base data must not reorder scripts");
            }
            reorderCodesLength = length / 4;
            reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);

            // The reorderRanges (if any) are the trailing reorderCodes entries.
            // Split the array at the boundary.
            // Script or reorder codes do not exceed 16-bit values.
            // Range limits are stored in the upper 16 bits, and are never 0.
            int reorderRangesLength = 0;
            while(reorderRangesLength < reorderCodesLength &&
                    (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
                ++reorderRangesLength;
            }
            assert(reorderRangesLength < reorderCodesLength);
            reorderCodesLength -= reorderRangesLength;
        } else {
            reorderCodes = new int[0];
            reorderCodesLength = 0;
            ICUBinary.skipBytes(inBytes, length);
        }

        // There should be a reorder table only if there are reorder codes.
        // However, when there are reorder codes the reorder table may be omitted to reduce
        // the data size.
        byte[] reorderTable = null;
        index = IX_REORDER_TABLE_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 256) {
            if(reorderCodesLength == 0) {
                throw new ICUException("Reordering table without reordering codes");
            }
            reorderTable = new byte[256];
            inBytes.get(reorderTable);
            length -= 256;
        } else {
            // If we have reorder codes, then build the reorderTable at the end,
            // when the CollationData is otherwise complete.
        }
        ICUBinary.skipBytes(inBytes, length);

        if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
            throw new ICUException("Tailoring numeric primary weight differs from base data");
        }
        CollationData data = null;  // Remains null if there are no mappings.

        index = IX_TRIE_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 8) {
            tailoring.ensureOwnedData();
            data = tailoring.ownedData;
            data.base = baseData;
            data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
            data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
            int trieLength = data.trie.getSerializedLength();
            if(trieLength > length) {
                throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
            }
            length -= trieLength;
        } else if(baseData != null) {
            // Use the base data. Only the settings are tailored.
            tailoring.data = baseData;
        } else {
            throw new ICUException("Missing collation data mappings");  // No mappings.
        }
        ICUBinary.skipBytes(inBytes, length);

        index = IX_RESERVED8_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        ICUBinary.skipBytes(inBytes, length);

        index = IX_CES_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 8) {
            if(data == null) {
                throw new ICUException("Tailored ces without tailored trie");
            }
            data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
        } else {
            ICUBinary.skipBytes(inBytes, length);
        }

        index = IX_RESERVED10_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        ICUBinary.skipBytes(inBytes, length);

        index = IX_CE32S_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 4) {
            if(data == null) {
                throw new ICUException("Tailored ce32s without tailored trie");
            }
            data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
        } else {
            ICUBinary.skipBytes(inBytes, length);
        }

        int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
        if(jamoCE32sStart >= 0) {
            if(data == null || data.ce32s == null) {
                throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
            }
            // Report a bad start index as a data format problem
            // rather than letting System.arraycopy() fail.
            if(jamoCE32sStart > data.ce32s.length - CollationData.JAMO_CE32S_LENGTH) {
                throw new ICUException("JamoCE32sStart index out of range of ce32s[]");
            }
            data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
            System.arraycopy(data.ce32s, jamoCE32sStart,
                    data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            data.jamoCE32s = baseData.jamoCE32s;
        } else {
            throw new ICUException("Missing Jamo CE32s for Hangul processing");
        }

        index = IX_ROOT_ELEMENTS_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 4) {
            int rootElementsLength = length / 4;
            if(data == null) {
                throw new ICUException("Root elements but no mappings");
            }
            if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
                throw new ICUException("Root elements array too short");
            }
            data.rootElements = new long[rootElementsLength];
            for(int i = 0; i < rootElementsLength; ++i) {
                data.rootElements[i] = inBytes.getInt() & 0xffffffffL;  // unsigned int -> long
            }
            long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
            if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
                throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
            }
            long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
            if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
                // [fixed last secondary common byte] is too low,
                // and secondary weights would collide with compressed common secondaries.
                throw new ICUException("[fixed last secondary common byte] is too low");
            }
            length &= 3;
        }
        ICUBinary.skipBytes(inBytes, length);

        index = IX_CONTEXTS_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 2) {
            if(data == null) {
                throw new ICUException("Tailored contexts without tailored trie");
            }
            data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
        } else {
            ICUBinary.skipBytes(inBytes, length);
        }

        index = IX_UNSAFE_BWD_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 2) {
            if(data == null) {
                throw new ICUException("Unsafe-backward-set but no mappings");
            }
            if(baseData == null) {
                // Create the unsafe-backward set for the root collator.
                // Include all non-zero combining marks and trail surrogates.
                // We do this at load time, rather than at build time,
                // to simplify Unicode version bootstrapping:
                // The root data builder only needs the new FractionalUCA.txt data,
                // but it need not be built with a version of ICU already updated to
                // the corresponding new Unicode Character Database.
                //
                // The following is an optimized version of
                // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
                // It is faster and requires fewer code dependencies.
                tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
                data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
            } else {
                // Clone the root collator's set contents.
                tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
            }
            // Add the ranges from the data file to the unsafe-backward set.
            USerializedSet sset = new USerializedSet();
            char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
            length = 0;  // The whole part has been consumed; nothing left to skip below.
            sset.getSet(unsafeData, 0);
            int count = sset.countRanges();
            int[] range = new int[2];
            for(int i = 0; i < count; ++i) {
                sset.getRange(i, range);
                tailoring.unsafeBackwardSet.add(range[0], range[1]);
            }
            // Mark each lead surrogate as "unsafe"
            // if any of its 1024 associated supplementary code points is "unsafe".
            int c = 0x10000;
            for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
                if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
                    tailoring.unsafeBackwardSet.add(lead);
                }
            }
            tailoring.unsafeBackwardSet.freeze();
            data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            // No tailoring-specific data: Alias the root collator's set.
            data.unsafeBackwardSet = baseData.unsafeBackwardSet;
        } else {
            throw new ICUException("Missing unsafe-backward-set");
        }
        ICUBinary.skipBytes(inBytes, length);

        // If the fast Latin format version is different,
        // or the version is set to 0 for "no fast Latin table",
        // then just always use the normal string comparison path.
        index = IX_FAST_LATIN_TABLE_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(data != null) {
            data.fastLatinTable = null;
            data.fastLatinTableHeader = null;
            if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
                if(length >= 2) {
                    char header0 = inBytes.getChar();
                    int headerLength = header0 & 0xff;
                    // The header includes header0 itself and must fit into this part.
                    // Otherwise the array store below would fail or the table length
                    // would become negative.
                    if(headerLength == 0 || headerLength > length / 2) {
                        throw new ICUException("Fast-Latin table header length is invalid");
                    }
                    data.fastLatinTableHeader = new char[headerLength];
                    data.fastLatinTableHeader[0] = header0;
                    for(int i = 1; i < headerLength; ++i) {
                        data.fastLatinTableHeader[i] = inBytes.getChar();
                    }
                    int tableLength = length / 2 - headerLength;
                    data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
                    length = 0;  // The whole part has been consumed.
                    if((header0 >> 8) != CollationFastLatin.VERSION) {
                        throw new ICUException("Fast-Latin table version differs from version in data header");
                    }
                } else if(baseData != null) {
                    data.fastLatinTable = baseData.fastLatinTable;
                    data.fastLatinTableHeader = baseData.fastLatinTableHeader;
                }
            }
        }
        ICUBinary.skipBytes(inBytes, length);

        index = IX_SCRIPTS_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 2) {
            if(data == null) {
                throw new ICUException("Script order data but no mappings");
            }
            int scriptsLength = length / 2;
            // Reading through the CharBuffer view does not advance inBytes;
            // the whole part is skipped via skipBytes() below.
            CharBuffer inChars = inBytes.asCharBuffer();
            data.numScripts = inChars.get();
            // There must be enough entries for both arrays, including more than two range starts.
            int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
            if(scriptStartsLength <= 2) {
                throw new ICUException("Script order data too short");
            }
            inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
            inChars.get(data.scriptStarts = new char[scriptStartsLength]);
            if(!(data.scriptStarts[0] == 0 &&
                    data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
                    data.scriptStarts[scriptStartsLength - 1] ==
                            (Collation.TRAIL_WEIGHT_BYTE << 8))) {
                throw new ICUException("Script order data not valid");
            }
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            data.numScripts = baseData.numScripts;
            data.scriptsIndex = baseData.scriptsIndex;
            data.scriptStarts = baseData.scriptStarts;
        }
        ICUBinary.skipBytes(inBytes, length);

        index = IX_COMPRESSIBLE_BYTES_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        if(length >= 256) {
            if(data == null) {
                throw new ICUException("Data for compressible primary lead bytes but no mappings");
            }
            data.compressibleBytes = new boolean[256];
            for(int i = 0; i < 256; ++i) {
                data.compressibleBytes[i] = inBytes.get() != 0;
            }
            length -= 256;
        } else if(data == null) {
            // Nothing to do.
        } else if(baseData != null) {
            data.compressibleBytes = baseData.compressibleBytes;
        } else {
            throw new ICUException("Missing data for compressible primary lead bytes");
        }
        ICUBinary.skipBytes(inBytes, length);

        index = IX_RESERVED18_OFFSET;
        offset = inIndexes[index];
        length = inIndexes[index + 1] - offset;
        ICUBinary.skipBytes(inBytes, length);

        // Leave the settings untouched if they already agree with the data file.
        CollationSettings ts = tailoring.settings.readOnly();
        int options = inIndexes[IX_OPTIONS] & 0xffff;
        char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
        int fastLatinOptions = CollationFastLatin.getOptions(
                tailoring.data, ts, fastLatinPrimaries);
        if(options == ts.options && ts.variableTop != 0 &&
                Arrays.equals(reorderCodes, ts.reorderCodes) &&
                fastLatinOptions == ts.fastLatinOptions &&
                (fastLatinOptions < 0 ||
                        Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
            return;
        }

        CollationSettings settings = tailoring.settings.copyOnWrite();
        settings.options = options;
        // Set variableTop from options and scripts data.
        settings.variableTop = tailoring.data.getLastPrimaryForGroup(
                Collator.ReorderCodes.FIRST + settings.getMaxVariable());
        if(settings.variableTop == 0) {
            throw new ICUException("The maxVariable could not be mapped to a variableTop");
        }

        if(reorderCodesLength != 0) {
            settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
        }

        settings.fastLatinOptions = CollationFastLatin.getOptions(
                tailoring.data, settings,
                settings.fastLatinPrimaries);
    }

    /** Accepts collation data whose format major version (version[0]) is 5. */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        @Override
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 5;
        }
    }
    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    private static final int DATA_FORMAT = 0x55436f6c;  // "UCol"

    private CollationDataReader() {}  // no constructor
}

/*
 * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
 * See ICU4C source/common/collationdatareader.h.
 */