1/*
2*******************************************************************************
3* Copyright (C) 2013-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationdatareader.h
7*
8* created on: 2013feb07
9* created by: Markus W. Scherer
10*/
11
12#ifndef __COLLATIONDATAREADER_H__
13#define __COLLATIONDATAREADER_H__
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_COLLATION
18
19#include "unicode/udata.h"
20
21struct UDataMemory;
22
23U_NAMESPACE_BEGIN
24
25struct CollationTailoring;
26
27/**
28 * Collation binary data reader.
29 */
30struct U_I18N_API CollationDataReader /* all static */ {
31    // The following constants are also copied into source/common/ucol_swp.cpp.
32    // Keep them in sync!
33    enum {
34        /**
35         * Number of int32_t indexes.
36         *
37         * Can be 2 if there are only options.
38         * Can be 7 or 8 if there are only options and a script reordering.
39         * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
40         */
41        IX_INDEXES_LENGTH,  // 0
42        /**
43         * Bits 31..24: numericPrimary, for numeric collation
44         *      23..16: fast Latin format version (0 = no fast Latin table)
45         *      15.. 0: options bit set
46         */
47        IX_OPTIONS,
48        IX_RESERVED2,
49        IX_RESERVED3,
50
51        /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
52        IX_JAMO_CE32S_START,  // 4
53
54        // Byte offsets from the start of the data, after the generic header.
55        // The indexes[] are at byte offset 0, other data follows.
56        // Each data item is aligned properly.
57        // The data items should be in descending order of unit size,
58        // to minimize the need for padding.
59        // Each item's byte length is given by the difference between its offset and
60        // the next index/offset value.
61        /** Byte offset to int32_t reorderCodes[]. */
62        IX_REORDER_CODES_OFFSET,
63        /**
64         * Byte offset to uint8_t reorderTable[].
65         * Empty table if <256 bytes (padding only).
66         * Otherwise 256 bytes or more (with padding).
67         */
68        IX_REORDER_TABLE_OFFSET,
69        /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
70        IX_TRIE_OFFSET,
71
72        IX_RESERVED8_OFFSET,  // 8
73        /** Byte offset to int64_t ces[]. */
74        IX_CES_OFFSET,
75        IX_RESERVED10_OFFSET,
76        /** Byte offset to uint32_t ce32s[]. */
77        IX_CE32S_OFFSET,
78
79        /** Byte offset to uint32_t rootElements[]. */
80        IX_ROOT_ELEMENTS_OFFSET,  // 12
81        /** Byte offset to UChar *contexts[]. */
82        IX_CONTEXTS_OFFSET,
83        /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
84        IX_UNSAFE_BWD_OFFSET,
85        /** Byte offset to uint16_t fastLatinTable[]. */
86        IX_FAST_LATIN_TABLE_OFFSET,
87
88        /** Byte offset to uint16_t scripts[]. */
89        IX_SCRIPTS_OFFSET,  // 16
90        /**
91         * Byte offset to UBool compressibleBytes[].
92         * Empty table if <256 bytes (padding only).
93         * Otherwise 256 bytes or more (with padding).
94         */
95        IX_COMPRESSIBLE_BYTES_OFFSET,
96        IX_RESERVED18_OFFSET,
97        IX_TOTAL_SIZE
98    };
99
100    static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
101                     CollationTailoring &tailoring, UErrorCode &errorCode);
102
103    static UBool U_CALLCONV
104    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
105
106private:
107    CollationDataReader();  // no constructor
108};
109
110/*
111 * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
112 * Format version 4.0.
113 *
114 * The root collation data is stored in the ucadata.icu file.
115 * Tailorings are stored inside .res resource bundle files, with a complete file header.
116 *
117 * Collation data begins with a standard ICU data file header
118 * (DataHeader, see ucmndata.h and unicode/udata.h).
119 * The UDataInfo.dataVersion field contains the UCA and other version numbers,
120 * see the comments for CollationTailoring.version.
121 *
122 * After the header, the file contains the following parts.
123 * Constants are defined as enum values of the CollationDataReader class.
124 * See also the Collation class.
125 *
126 * int32_t indexes[indexesLength];
127 *      The indexes array has variable length.
128 *      Some tailorings only need the length and the options,
129 *      others only add reorderCodes and the reorderTable,
130 *      some need to store mappings.
131 *      Only as many indexes are stored as needed to read all of the data.
132 *
133 *      Index 0: indexesLength
134 *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
135 *      Index 2..3: Unused/reserved/0.
136 *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
137 *               are stored in a short, contiguous part of the ce32s array.
138 *
139 *      Indexes 5..19 are byte offsets in ascending order.
140 *      Each byte offset marks the start of the next part in the data file,
141 *      and the end of the previous one.
 *      When two consecutive byte offsets are the same (or too close together
 *      to hold the part's data, leaving only padding),
 *      then the corresponding part is empty.
144 *      Byte offsets are offsets from after the header,
145 *      that is, from the beginning of the indexes[].
146 *      Each part starts at an offset with proper alignment for its data.
147 *      If necessary, the previous part may include padding bytes to achieve this alignment.
148 *      The last byte offset that is stored in the indexes indicates the total size of the data
149 *      (starting with the indexes).
150 *
151 * int32_t reorderCodes[]; -- empty in root
152 *      The list of script and reordering codes.
153 *
154 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
155 *      Primary-weight lead byte permutation table.
156 *      Normally present when the reorderCodes are, but can be built at load time.
157 *
158 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
159 *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
160 *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
161 *      in which case it is a special CE32 and contains a 4-bit tag and further data.
162 *      See the Collation class for details.
163 *
164 *      The trie has a value for each lead surrogate code unit with some bits encoding
165 *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
 *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
167 *
168 * int64_t ces[];
169 *      64-bit CEs and expansions that cannot be stored in a more compact form.
170 *
171 * uint32_t ce32s[];
172 *      CE32s for expansions in compact form, and for characters whose trie values
173 *      contain special data.
174 *
175 * uint32_t rootElements[]; -- empty in all tailorings
176 *      Compact storage for all of the CEs that occur in the root collation.
177 *      See the CollationRootElements class.
178 *
179 * UChar *contexts[];
180 *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
181 *
182 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
183 *      Serialized form of characters that are unsafe when iterating backwards,
184 *      and at the end of an identical string prefix.
185 *      Back up to a safe character.
186 *      Lead surrogates are "unsafe" when any of their corresponding supplementary
187 *      code points are unsafe.
188 *      Does not include [:^lccc=0:][:^tccc=0:].
189 *      For each tailoring, the root unsafeBackwardSet is subtracted.
190 *      (As a result, in many tailorings no set needs to be stored.)
191 *
192 * uint16_t fastLatinTable[];
193 *      Optional optimization for Latin text.
194 *      See the CollationFastLatin class.
195 *
196 * uint16_t scripts[]; -- empty in all tailorings
197 *      Table of the reordering groups with their first and last lead bytes,
198 *      and their script and reordering codes.
199 *      See CollationData::scripts.
200 *
201 * UBool compressibleBytes[]; -- empty in all tailorings
202 *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
203 */
204
205U_NAMESPACE_END
206
207#endif  // !UCONFIG_NO_COLLATION
208#endif  // __COLLATIONDATAREADER_H__
209