1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*******************************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
4ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   Copyright (C) 1999-2009, International Business Machines
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*******************************************************************************
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  store.c
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 1999dec11
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus W. Scherer
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Store Unicode character properties efficiently for
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   random access.
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h>
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "utrie.h"
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/udata.h"
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unewdata.h"
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "writesrc.h"
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uprops.h"
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "genprops.h"
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DO_DEBUG_OUT 0
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* Unicode character properties file format ------------------------------------
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe file format prepared and written here contains several data
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustructures that store indexes or data.
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBefore the data contents described below, there are the headers required by
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruthe udata API for loading ICU data. Especially, a UDataInfo structure
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprecedes the actual data. It contains platform properties values and the
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querufile format version.
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
The following is a description of format version 6.
4585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruData contents:
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
The contents are a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
Any Unicode code point from 0 to 0x10ffff can be looked up to get
the properties, if any, for that code point. This means that the inputs
to the lookup are 21-bit unsigned integers, with not all of the
21-bit range used.
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruIt is assumed that client code keeps a uint32_t pointer
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruto the beginning of the data:
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const uint32_t *p32;
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruFormally, the file contains the following structures:
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const int32_t indexes[16] with values i0..i15:
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  i0 indicates the length of the main trie.
66ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho  i0..i3 all have the same value in format versions 4.0 and higher;
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru         the related props32[] and exceptions[] and uchars[] were used in format version 3
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i1 exceptionsIndex;  -- 32-bit unit index to the table of 32-bit exception words
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i7..i9 reservedIndexes; -- reserved values; 0 for now
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i12..i15 reservedIndexes; -- reserved values; 0 for now
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
86ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho  P, E, and U are not used (empty) in format versions 4 and above
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    P  const uint32_t props32[i1-i0];
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    E  const uint32_t exceptions[i2-i1];
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U  const UChar uchars[2*(i3-i2)];
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    AT serialized trie for additional properties (byte size: 4*(i4-i3))
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruTrie lookup and properties:
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruIn order to condense the data for the 21-bit code space, several properties of
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruthe Unicode code assignment are exploited:
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The code space is sparse.
- There are several runs of tens of thousands of consecutive code points with the same properties.
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Characters and scripts are allocated in groups of 16 code points.
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Inside blocks for scripts the properties are often repetitive.
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The 21-bit space is not fully used for Unicode.
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe lookup of properties for a given code point is done with a trie lookup,
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruusing the UTrie implementation.
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe trie lookup result is a 16-bit properties word.
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruWith a given Unicode code point
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c;
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruand 0<=c<0x110000, the lookup is done like this:
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t props;
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UTRIE_GET16(trie, c, props);
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruEach 16-bit properties word contains:
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0.. 4  general category
121ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho     5  reserved
122ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho 6..15  numeric type and value (ntv)
123ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
124ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoEncoding of numeric type and value in the 10-bit ntv field:
125ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    ntv             type            value
126ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    0               U_NT_NONE       0
127ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    1..10           U_NT_DECIMAL    0..9
128ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    11..20          U_NT_DIGIT      0..9
129ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    21..0x2ff       U_NT_NUMERIC    see below
130ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    0x300..0x3ff    reserved
131ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
132ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    For U_NT_NUMERIC:
133ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    ntv             value
134ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    21..0xaf        integer     0..154
135ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    0xb0..0x1df     fraction    ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
136ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    0x1e0..0x2ff    large int   ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
137ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    (only one significant decimal digit)
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Additional properties (new in format version 2.1) ---
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe second trie for additional properties (AT) is also a UTrie with 16-bit data.
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe data words consist of 32-bit unit indexes (not row indexes!) into the
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querutable of unique properties vectors (PV).
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruEach vector contains a set of properties.
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe width of a vector (number of uint32_t per row) may change
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruwith the formatVersion, it is stored in i5.
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCurrent properties: see icu/source/common/uprops.h
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Changes in format version 3.1 ---
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSee i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Changes in format version 3.2 ---
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The tries use linear Latin-1 ranges.
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The additional properties bits store full properties XYZ instead
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  of partial Other_XYZ, so that changes in the derivation formulas
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  need not be tracked in runtime library code.
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Joining Type and Line Break are also stored completely, so that uprops.c
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  needs no runtime formulas for enumerated properties either.
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Store the case-sensitive flag in the main properties word.
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- i10 also contains U_LB_COUNT and U_EA_COUNT.
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- i11 contains maxValues2 for vector word 2.
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Changes in format version 4 ---
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe format changes between version 3 and 4 because the properties related to
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querucase mappings and bidi/shaping are pulled out into separate files
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querufor modularization.
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruIn order to reduce the need for code changes, some of the previous data
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustructures are omitted, rather than rearranging everything.
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru(The change to format version 4 is for ICU 3.4. The last CVS revision of
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querugenprops/store.c for format version 3.2 is 1.48.)
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe main trie's data is significantly simplified:
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The trie's 16-bit data word is used directly instead of as an index
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  into props32[].
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The trie uses the default trie folding functions instead of custom ones.
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Numeric values are stored directly in the trie data word, with special
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  encodings.
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- No more exception data (the data that needed it was pulled out, or, in the
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  case of numeric values, encoded differently).
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- No more string data (pulled out - was for case mappings).
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruAlso, some of the previously used properties vector bits are reserved again.
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe indexes[] values for the omitted structures are still filled in
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru(indicating zero-length arrays) so that the swapper code remains unchanged.
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
19285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho--- Changes in format version 5 ---
19385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
194ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoFormat version 5 became necessary because the bit field for script codes
195ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehooverflowed. The changes are incompatible because
196ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoold code would have seen nonsensically low values for new, higher script codes.
197ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
198ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoRearranged bit fields in the second trie (AT) and widened three (Script, Block,
199ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoWord_Break) by one bit each.
200ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
20185bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoModified bit fields in icu/source/common/uprops.h
20285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
203ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho--- Changes in format version 6 ---
204ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
205ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoFormat version 6 became necessary because Unicode 5.2 adds fractions with
206ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehodenominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
207ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehotypes and values rather than add another variant to the previous format.
208ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru----------------------------------------------------------------------------- */
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* UDataInfo cf. udata.h */
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UDataInfo dataInfo={
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    sizeof(UDataInfo),
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0,
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_IS_BIG_ENDIAN,
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_CHARSET_FAMILY,
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_SIZEOF_UCHAR,
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0,
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    { 0x55, 0x50, 0x72, 0x6f },                 /* dataFormat="UPro" */
222ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    { 6, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
22385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    { 5, 1, 0, 0 }                              /* dataVersion */
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UNewTrie *pTrie=NULL;
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* -------------------------------------------------------------------------- */
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerusetUnicodeVersion(const char *v) {
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UVersionInfo version;
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    u_versionFromString(version, v);
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(dataInfo.dataVersion, version, 4);
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruinitStore() {
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pTrie==NULL) {
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: unable to create a UNewTrie\n");
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(U_MEMORY_ALLOCATION_ERROR);
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    initAdditionalProperties();
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruexitStore() {
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utrie_close(pTrie);
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    exitAdditionalProperties();
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* store a character's properties ------------------------------------------- */
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern uint32_t
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerumakeProps(Props *p) {
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t den;
259ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    int32_t type, value, exp, ntv;
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* encode numeric type & value */
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    type=p->numericType;
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    value=p->numericValue;
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    den=p->denominator;
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    exp=p->exponent;
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
267ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    ntv=-1; /* the numeric type and value cannot be encoded if ntv remains -1 */
268ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    switch(type) {
269ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    case U_NT_NONE:
270ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        if(value==0 && den==0 && exp==0) {
271ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            ntv=UPROPS_NTV_NONE;
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
273ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        break;
274ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    case U_NT_DECIMAL:
275ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        if(0<=value && value<=9 && den==0 && exp==0) {
276ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            ntv=UPROPS_NTV_DECIMAL_START+value;
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
278ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        break;
279ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    case U_NT_DIGIT:
280ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        if(0<=value && value<=9 && den==0 && exp==0) {
281ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            ntv=UPROPS_NTV_DIGIT_START+value;
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
283ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        break;
284ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    case U_NT_NUMERIC:
285ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        if(den==0) {
286ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
287ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* small integer parsed like a large one */
288ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                ntv=UPROPS_NTV_NUMERIC_START+value*100;
289ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            } else if(exp==0 && value>=0) {
290ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                if(value<=UPROPS_NTV_MAX_SMALL_INT) {
291ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    /* small integer */
292ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    ntv=UPROPS_NTV_NUMERIC_START+value;
293ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                } else {
294ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    /* large integer parsed like a small one */
295ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    /* split the value into mantissa and exponent, base 10 */
296ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    int32_t mant=value;
297ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    while((mant%10)==0) {
298ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                        mant/=10;
299ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                        ++exp;
300ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    }
301ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    if(mant<=9) {
302ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                        ntv=((mant+14)<<5)+(exp-2);
303ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    }
304ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                }
305ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            } else if(2<=exp && exp<=33 && 1<=value && value<=9) {
306ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* large, single-significant-digit integer */
307ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                ntv=((value+14)<<5)+(exp-2);
308ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            }
309ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        } else if(exp==0) {
310ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            if(-1<=value && value<=17 && 1<=den && den<=16) {
311ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* fraction */
312ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                ntv=((value+12)<<4)+(den-1);
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
315ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    default:
316ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        break;
317ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    }
318ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    if(ntv<0) {
319ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        fprintf(stderr, "genprops error: unable to encode numeric type %d & value %ld/%lu E%d\n",
320ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                (int)type, (long)value, (unsigned long)den, exp);
321ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        exit(U_ILLEGAL_ARGUMENT_ERROR);
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* encode the properties */
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        (uint32_t)p->generalCategory |
327ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        (ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruaddProps(uint32_t c, uint32_t x) {
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!utrie_set32(pTrie, (UChar32)c, x)) {
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: too many entries for the properties trie\n");
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(U_BUFFER_OVERFLOW_ERROR);
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern uint32_t
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugetProps(uint32_t c) {
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return utrie_get32(pTrie, (UChar32)c, NULL);
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* areas of same properties ------------------------------------------------- */
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerurepeatProps(uint32_t first, uint32_t last, uint32_t x) {
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: too many entries for the properties trie\n");
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(U_BUFFER_OVERFLOW_ERROR);
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* generate output data ----------------------------------------------------- */
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugenerateData(const char *dataDir, UBool csource) {
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static int32_t indexes[UPROPS_INDEX_COUNT]={
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0,
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0,
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0,
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    };
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static uint8_t trieBlock[40000];
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static uint8_t additionalProps[120000];
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UNewDataMemory *pData;
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode errorCode=U_ZERO_ERROR;
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t size = 0;
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t trieSize, additionalPropsSize, offset;
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    long dataLength;
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(errorCode)) {
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(errorCode);
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    offset=sizeof(indexes)/4;               /* uint32_t offset to the properties trie */
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* round up trie size to 4-alignment */
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    trieSize=(trieSize+3)&~3;
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    offset+=trieSize>>2;
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(beVerbose) {
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("trie size in bytes:                    %5u\n", (int)trieSize);
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(csource) {
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* write .c file for hardcoded data */
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UTrie trie={ NULL };
39585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        UTrie2 *trie2;
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        FILE *f;
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                stderr,
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                u_errorName(errorCode));
40485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            exit(errorCode);
40585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        }
40685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
40785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        /* use UTrie2 */
40885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
40985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        if(U_FAILURE(errorCode)) {
41085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            fprintf(
41185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                stderr,
41285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                "genprops error: utrie2_fromUTrie() failed - %s\n",
41385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                u_errorName(errorCode));
41485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            exit(errorCode);
41585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        }
41685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        {
41785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            /* delete lead surrogate code unit values */
41885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            UChar lead;
41985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            trie2=utrie2_cloneAsThawed(trie2, &errorCode);
42085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            for(lead=0xd800; lead<0xdc00; ++lead) {
42185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
42285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            }
42385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
42485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            if(U_FAILURE(errorCode)) {
42585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                fprintf(
42685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                    stderr,
42785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                    "genprops error: deleting lead surrogate code unit values failed - %s\n",
42885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                    u_errorName(errorCode));
42985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                exit(errorCode);
43085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            }
431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        f=usrc_create(dataDir, "uchar_props_data.c");
434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(f!=NULL) {
435ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            /* unused
436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            usrc_writeArray(f,
437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const UVersionInfo formatVersion={",
438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                dataInfo.formatVersion, 8, 4,
439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
440ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho             */
441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            usrc_writeArray(f,
442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const UVersionInfo dataVersion={",
443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                dataInfo.dataVersion, 8, 4,
444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
44585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            usrc_writeUTrie2Arrays(f,
446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const uint16_t propsTrie_index[%ld]={\n", NULL,
44785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                trie2,
448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "\n};\n\n");
44985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            usrc_writeUTrie2Struct(f,
45085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                "static const UTrie2 propsTrie={\n",
45185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                trie2, "propsTrie_index", NULL,
452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            size=4*offset+additionalPropsSize;      /* total size of data */
456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            usrc_writeArray(f,
458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                indexes, 32, UPROPS_INDEX_COUNT,
460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fclose(f);
462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
46385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        utrie2_close(trie2);
464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* write the data */
466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(errorCode);
471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        size=4*offset+additionalPropsSize;      /* total size of data */
475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        udata_writeBlock(pData, indexes, sizeof(indexes));
477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        udata_writeBlock(pData, trieBlock, trieSize);
478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        udata_writeBlock(pData, additionalProps, additionalPropsSize);
479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* finish up */
481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        dataLength=udata_finish(pData, &errorCode);
482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(errorCode);
485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(dataLength!=(long)size) {
488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                dataLength, (unsigned long)size);
490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(U_INTERNAL_PROGRAM_ERROR);
491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(beVerbose) {
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("data size:                            %6lu\n", (unsigned long)size);
496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hey, Emacs, please set the following:
501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Local Variables:
503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * indent-tabs-mode: nil
504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * End:
505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
507