12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 5f8a0c400bbd62a2ea4ee9b77641f79cb443d2187Neil Fuller * Copyright (C) 2006-2015, International Business Machines Corporation and 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved. 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.charset; 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.IOException; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.nio.ByteBuffer; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.ICUBinary; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* Format of cnvalias.icu ----------------------------------------------------- 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt. 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This binary form contains several tables. All indexes are to uint16_t 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * units, and not to the bytes (uint8_t units). Addressing everything on 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 16-bit boundaries allows us to store more information with small index 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * numbers, which are also 16-bit in size. The majority of the table (except 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the string table) are 16-bit numbers. 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * First there is the size of the Table of Contents (TOC). The TOC 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * entries contain the size of each section. In order to find the offset 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * you just need to sum up the previous offsets. 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The TOC length and entries are an array of uint32_t values. 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The first section after the TOC starts immediately after the TOC. 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1) This section contains a list of converters. This list contains indexes 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into the string table for the converter name. The index of this list is 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * also used by other sections, which are mentioned later on. 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This list is not sorted. 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 2) This section contains a list of tags. This list contains indexes 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into the string table for the tag name. The index of this list is 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * also used by other sections, which are mentioned later on. 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This list is in priority order of standards. 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 3) This section contains a list of sorted unique aliases. This 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * list contains indexes into the string table for the alias name. The 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * index of this list is also used by other sections, like the 4th section. 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The index for the 3rd and 4th section is used to get the 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * alias -> converter name mapping. Section 3 and 4 form a two column table. 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 4) This section contains a list of mapped converter names. Consider this 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as a table that maps the 3rd section to the 1st section. This list contains 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * indexes into the 1st section. The index of this list is the same index in 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the 3rd section. There is also some extra information in the high bits of 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * each converter index in this table. Currently it's only used to say that 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the predigested form of the 5th section so that an alias lookup can be fast. 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5) This section contains a 2D array with indexes to the 6th section. This 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * section is the full form of all alias mappings. The column index is the 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * index into the converter list (column header). The row index is the index 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * to tag list (row header). This 2D array is the top part a 3D array. The 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * third dimension is in the 6th section. 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 6) This is blob of variable length arrays. Each array starts with a size, 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and is followed by indexes to alias names in the string table. This is 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the third dimension to the section 5. No other section should be referencing 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * this section. 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 7) Reserved at this time (There is no information). This _usually_ has a 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * size of 0. Future versions may add more information here. 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 8) This is the string table. All strings are indexed on an even address. 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * There are two reasons for this. First many chip architectures locate strings 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * faster on even address boundaries. Second, since all indexes are 16-bit 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * numbers, this string table can be 128KB in size instead of 64KB when we 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * only have strings starting on an even address. 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Here is the concept of section 5 and 6. It's a 3D cube. Each tag 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * has a unique alias among all converters. That same alias can 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * be mentioned in other standards on different converters, 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * but only one alias per tag can be unique. 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Converter Names (Usually in TR22 form) 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * -------------------------------------------. 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * T / /| 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a / / | 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * g / / | 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * s / / | 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * / / | 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ------------------------------------------/ | 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A | | | 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * l | | | 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * i | | / 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a | | / 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * s | | / 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * e | | / 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * s | |/ 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ------------------------------------------- 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Here is what it really looks like. It's like swiss cheese. 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * There are holes. Some converters aren't recognized by 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a standard, or they are really old converters that the 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * standard doesn't recognize anymore. 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Converter Names (Usually in TR22 form) 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * -------------------------------------------. 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * T /##########################################/| 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a / # # /# 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * g / # ## ## ### # ### ### ### #/ 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * s / # ##### #### ## ## #/# 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * / ### # # ## # # # ### # # #/## 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ------------------------------------------/# # 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A |### # # ## # # # ### # # #|# # 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * l |# # # # # ## # #|# # 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * i |# # # # # # #|# 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a |# #|# 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * s | #|# 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * e 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * s 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertfinal class UConverterAliasDataReader implements ICUBinary.Authenticate { 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader"); 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p>Protected constructor.</p> 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param bytes ICU uprop.dat file buffer 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @exception IOException throw if data file fails authentication 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected UConverterAliasDataReader(ByteBuffer bytes) 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throws IOException{ 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //if(debug) System.out.println("Bytes in buffer " + bytes.remaining()); 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert byteBuffer = bytes; 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /*unicodeVersion = */ICUBinary.readHeader(byteBuffer, DATA_FORMAT_ID, this); 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //if(debug) System.out.println("Bytes left in byteBuffer " + byteBuffer.remaining()); 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // protected methods ------------------------------------------------- 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected int[] readToc(int n)throws IOException 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert { 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //Read the toc 150f8a0c400bbd62a2ea4ee9b77641f79cb443d2187Neil Fuller return ICUBinary.getInts(byteBuffer, n, 0); 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1532d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isDataVersionAcceptable(byte version[]) 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert { 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return version.length >= DATA_FORMAT_VERSION.length 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert && version[0] == DATA_FORMAT_VERSION[0] 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert && version[1] == DATA_FORMAT_VERSION[1] 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert && version[2] == DATA_FORMAT_VERSION[2]; 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1612d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /*byte[] getUnicodeVersion(){ 1632d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert return ICUBinary.getVersionByteArrayFromCompactInt(unicodeVersion); 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }*/ 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // private data members ------------------------------------------------- 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ICU data file buffer 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private ByteBuffer byteBuffer; 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// private int unicodeVersion; 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * File format version that this class understands. 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * No guarantees are made if a older version is used 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * see store.c of gennorm for more information and values 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c) 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int DATA_FORMAT_ID = 0x4376416c; // dataFormat="CvAl" 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1}; 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 184