12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others.
22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License
37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/*
47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *******************************************************************************
5f8a0c400bbd62a2ea4ee9b77641f79cb443d2187Neil Fuller * Copyright (C) 2006-2015, International Business Machines Corporation and
67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved.
77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *******************************************************************************
87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.charset;
117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.IOException;
137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.nio.ByteBuffer;
147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.ICUBinary;
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* Format of cnvalias.icu -----------------------------------------------------
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This binary form contains several tables. All indexes are to uint16_t
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * units, and not to the bytes (uint8_t units). Addressing everything on
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 16-bit boundaries allows us to store more information with small index
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * numbers, which are also 16-bit in size. The majority of the table (except
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the string table) are 16-bit numbers.
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * First there is the size of the Table of Contents (TOC). The TOC
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * entries contain the size of each section. In order to find the offset
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * you just need to sum up the previous offsets.
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The TOC length and entries are an array of uint32_t values.
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The first section after the TOC starts immediately after the TOC.
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 1) This section contains a list of converters. This list contains indexes
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into the string table for the converter name. The index of this list is
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * also used by other sections, which are mentioned later on.
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This list is not sorted.
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 2) This section contains a list of tags. This list contains indexes
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * into the string table for the tag name. The index of this list is
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * also used by other sections, which are mentioned later on.
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This list is in priority order of standards.
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 3) This section contains a list of sorted unique aliases. This
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * list contains indexes into the string table for the alias name. The
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * index of this list is also used by other sections, like the 4th section.
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The index for the 3rd and 4th section is used to get the
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * alias -> converter name mapping. Section 3 and 4 form a two column table.
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 4) This section contains a list of mapped converter names. Consider this
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as a table that maps the 3rd section to the 1st section. This list contains
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * indexes into the 1st section. The index of this list is the same index in
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the 3rd section. There is also some extra information in the high bits of
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * each converter index in this table. Currently it's only used to say that
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the predigested form of the 5th section so that an alias lookup can be fast.
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
 * 5) This section contains a 2D array with indexes to the 6th section. This
 * section is the full form of all alias mappings. The column index is the
 * index into the converter list (column header). The row index is the index
 * into the tag list (row header). This 2D array is the top part of a 3D array.
 * The third dimension is in the 6th section.
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
 * 6) This is a blob of variable-length arrays. Each array starts with a size,
 * and is followed by indexes to alias names in the string table. This is
 * the third dimension of section 5. No other section should reference
 * this section.
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 7) Reserved at this time (There is no information). This _usually_ has a
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * size of 0. Future versions may add more information here.
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 8) This is the string table. All strings are indexed on an even address.
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * There are two reasons for this. First many chip architectures locate strings
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * faster on even address boundaries. Second, since all indexes are 16-bit
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * numbers, this string table can be 128KB in size instead of 64KB when we
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * only have strings starting on an even address.
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * has a unique alias among all converters. That same alias can
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * be mentioned in other standards on different converters,
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * but only one alias per tag can be unique.
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *              Converter Names (Usually in TR22 form)
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *           -------------------------------------------.
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     T    /                                          /|
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     a   /                                          / |
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     g  /                                          /  |
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     s /                                          /   |
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *      /                                          /    |
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *      ------------------------------------------/     |
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    A |                                         |     |
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    l |                                         |     |
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    i |                                         |    /
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    a |                                         |   /
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    s |                                         |  /
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    e |                                         | /
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    s |                                         |/
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *      -------------------------------------------
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Here is what it really looks like. It's like swiss cheese.
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * There are holes. Some converters aren't recognized by
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a standard, or they are really old converters that the
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * standard doesn't recognize anymore.
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *              Converter Names (Usually in TR22 form)
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *           -------------------------------------------.
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     T    /##########################################/|
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     a   /     #            #                       /#
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     g  /  #      ##     ##     ### # ### ### ### #/
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *     s / #             #####  ####        ##  ## #/#
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *      / ### # # ##  #  #   #          ### # #   #/##
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *      ------------------------------------------/# #
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    A |### # # ##  #  #   #          ### # #   #|# #
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    l |# # #    #     #               ## #     #|# #
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    i |# # #    #     #                #       #|#
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    a |#                                       #|#
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    s |                                        #|#
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    e
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *    s
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertfinal class UConverterAliasDataReader implements ICUBinary.Authenticate {
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//    private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert   /**
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * <p>Protected constructor.</p>
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @param bytes ICU uprop.dat file buffer
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @exception IOException throw if data file fails authentication
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected UConverterAliasDataReader(ByteBuffer bytes)
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                        throws IOException{
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //if(debug) System.out.println("Bytes in buffer " + bytes.remaining());
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        byteBuffer = bytes;
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        /*unicodeVersion = */ICUBinary.readHeader(byteBuffer, DATA_FORMAT_ID, this);
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //if(debug) System.out.println("Bytes left in byteBuffer " + byteBuffer.remaining());
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // protected methods -------------------------------------------------
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected int[] readToc(int n)throws IOException
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    {
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //Read the toc
150f8a0c400bbd62a2ea4ee9b77641f79cb443d2187Neil Fuller        return ICUBinary.getInts(byteBuffer, n, 0);
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1532d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    @Override
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public boolean isDataVersionAcceptable(byte version[])
1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    {
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return version.length >= DATA_FORMAT_VERSION.length
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            && version[0] == DATA_FORMAT_VERSION[0]
1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            && version[1] == DATA_FORMAT_VERSION[1]
1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            && version[2] == DATA_FORMAT_VERSION[2];
1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1612d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /*byte[] getUnicodeVersion(){
1632d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        return ICUBinary.getVersionByteArrayFromCompactInt(unicodeVersion);
1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }*/
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // private data members -------------------------------------------------
1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * ICU data file buffer
1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private ByteBuffer byteBuffer;
1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//    private int unicodeVersion;
1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * File format version that this class understands.
1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * No guarantees are made if a older version is used
1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * see store.c of gennorm for more information and values
1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final int DATA_FORMAT_ID = 0x4376416c; // dataFormat="CvAl"
1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
184