store.c revision 85bf2e2fbc60a9f938064abc8127d61da7d19882
1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*******************************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   Copyright (C) 1999-2008, International Business Machines
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*******************************************************************************
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  store.c
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 1999dec11
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus W. Scherer
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Store Unicode character properties efficiently for
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   random access.
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h>
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "utrie.h"
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/udata.h"
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unewdata.h"
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "writesrc.h"
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uprops.h"
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "genprops.h"
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DO_DEBUG_OUT 0
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* Unicode character properties file format ------------------------------------
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe file format prepared and written here contains several data
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustructures that store indexes or data.
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBefore the data contents described below, there are the headers required by
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruthe udata API for loading ICU data. Especially, a UDataInfo structure
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprecedes the actual data. It contains platform properties values and the
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querufile format version.
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
The following is a description of format version 5.
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe format changes between version 3 and 4 because the properties related to
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querucase mappings and bidi/shaping are pulled out into separate files
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querufor modularization.
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruIn order to reduce the need for code changes, some of the previous data
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustructures are omitted, rather than rearranging everything.
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruFor details see "Changes in format version 4" below.
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
5485bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoFormat version 5 became necessary because the bit field for script codes
5585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hooverflowed. Several bit fields got rearranged, and three (Script, Block,
5685bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoWord_Break) got widened by one bit each.
5785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruData contents:
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
The contents are a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
Any Unicode code point from 0 to 0x10ffff can be looked up to get
the properties, if any, for that code point. This means that the inputs
to the lookup are 21-bit unsigned integers, with not all of the
21-bit range used.
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruIt is assumed that client code keeps a uint32_t pointer
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruto the beginning of the data:
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const uint32_t *p32;
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruFormally, the file contains the following structures:
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const int32_t indexes[16] with values i0..i15:
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  i0 indicates the length of the main trie.
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  i0..i3 all have the same value in format version 4.0;
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru         the related props32[] and exceptions[] and uchars[] were used in format version 3
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i1 exceptionsIndex;  -- 32-bit unit index to the table of 32-bit exception words
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i7..i9 reservedIndexes; -- reserved values; 0 for now
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    i12..i15 reservedIndexes; -- reserved values; 0 for now
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  P, E, and U are not used (empty) in format version 4
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    P  const uint32_t props32[i1-i0];
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    E  const uint32_t exceptions[i2-i1];
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U  const UChar uchars[2*(i3-i2)];
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    AT serialized trie for additional properties (byte size: 4*(i4-i3))
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruTrie lookup and properties:
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruIn order to condense the data for the 21-bit code space, several properties of
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruthe Unicode code assignment are exploited:
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The code space is sparse.
- There are several runs of tens of thousands of consecutive code points
  with the same properties.
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Characters and scripts are allocated in groups of 16 code points.
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Inside blocks for scripts the properties are often repetitive.
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The 21-bit space is not fully used for Unicode.
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe lookup of properties for a given code point is done with a trie lookup,
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruusing the UTrie implementation.
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe trie lookup result is a 16-bit properties word.
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruWith a given Unicode code point
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c;
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruand 0<=c<0x110000, the lookup is done like this:
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t props;
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UTRIE_GET16(trie, c, props);
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruEach 16-bit properties word contains:
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0.. 4  general category
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 5.. 7  numeric type
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        non-digit numbers are stored with multiple types and pseudo-types
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        in order to facilitate compact encoding:
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0 no numeric value (0)
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        1 decimal digit value (0..9)
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        2 digit value (0..9)
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        4 (internal type UPROPS_NT_FRACTION) fraction
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        5 (internal type UPROPS_NT_LARGE) large number >0xff
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        6..7 reserved
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        when returning the numeric type from a public API,
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        internal types must be turned into U_NT_NUMERIC
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8..15  numeric value
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        encoding of fractions and large numbers see below
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruFractions:
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t num, den;
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    num=n>>3;       // num=0..31
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    den=(n&7)+2;    // den=2..9
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(num==0) {
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        num=-1;     // num=-1 or 1..31
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    double result=(double)num/(double)den;
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruLarge numbers:
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t m, e;
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    m=n>>4;         // m=0..15
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    e=(n&0xf);
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(m==0) {
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        m=1;        // for large powers of 10
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        e+=18;      // e=18..33
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        e+=2;       // e=2..17
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } // m==10..15 are reserved
    double result=(double)m*10^e;   // pseudo-code: 10^e means 10 to the power of e, not XOR
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Additional properties (new in format version 2.1) ---
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe second trie for additional properties (AT) is also a UTrie with 16-bit data.
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe data words consist of 32-bit unit indexes (not row indexes!) into the
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querutable of unique properties vectors (PV).
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruEach vector contains a set of properties.
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe width of a vector (number of uint32_t per row) may change
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruwith the formatVersion, it is stored in i5.
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCurrent properties: see icu/source/common/uprops.h
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Changes in format version 3.1 ---
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSee i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Changes in format version 3.2 ---
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The tries use linear Latin-1 ranges.
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The additional properties bits store full properties XYZ instead
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  of partial Other_XYZ, so that changes in the derivation formulas
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  need not be tracked in runtime library code.
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Joining Type and Line Break are also stored completely, so that uprops.c
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  needs no runtime formulas for enumerated properties either.
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Store the case-sensitive flag in the main properties word.
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- i10 also contains U_LB_COUNT and U_EA_COUNT.
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- i11 contains maxValues2 for vector word 2.
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru--- Changes in format version 4 ---
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe format changes between version 3 and 4 because the properties related to
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querucase mappings and bidi/shaping are pulled out into separate files
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querufor modularization.
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruIn order to reduce the need for code changes, some of the previous data
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustructures are omitted, rather than rearranging everything.
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru(The change to format version 4 is for ICU 3.4. The last CVS revision of
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querugenprops/store.c for format version 3.2 is 1.48.)
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe main trie's data is significantly simplified:
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The trie's 16-bit data word is used directly instead of as an index
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  into props32[].
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- The trie uses the default trie folding functions instead of custom ones.
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- Numeric values are stored directly in the trie data word, with special
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  encodings.
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- No more exception data (the data that needed it was pulled out, or, in the
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  case of numeric values, encoded differently).
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru- No more string data (pulled out - was for case mappings).
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruAlso, some of the previously used properties vector bits are reserved again.
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe indexes[] values for the omitted structures are still filled in
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru(indicating zero-length arrays) so that the swapper code remains unchanged.
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
22685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho--- Changes in format version 5 ---
22785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
22885bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoRearranged bit fields in the second trie (AT) because the script code field
22985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hooverflowed. Old code would have seen nonsensically low values for new, higher
23085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hoscript codes.
23185bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoModified bit fields in icu/source/common/uprops.h
23285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru----------------------------------------------------------------------------- */
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* UDataInfo cf. udata.h */
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UDataInfo dataInfo={
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    sizeof(UDataInfo),
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0,
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_IS_BIG_ENDIAN,
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_CHARSET_FAMILY,
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U_SIZEOF_UCHAR,
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0,
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    { 0x55, 0x50, 0x72, 0x6f },                 /* dataFormat="UPro" */
24685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    { 5, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
24785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    { 5, 1, 0, 0 }                              /* dataVersion */
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UNewTrie *pTrie=NULL;
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* -------------------------------------------------------------------------- */
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerusetUnicodeVersion(const char *v) {
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UVersionInfo version;
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    u_versionFromString(version, v);
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(dataInfo.dataVersion, version, 4);
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruinitStore() {
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pTrie==NULL) {
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: unable to create a UNewTrie\n");
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(U_MEMORY_ALLOCATION_ERROR);
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    initAdditionalProperties();
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruexitStore() {
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utrie_close(pTrie);
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    exitAdditionalProperties();
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic uint32_t printNumericTypeValueError(Props *p) {
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fprintf(stderr, "genprops error: unable to encode numeric type & value %d  %ld/%lu E%d\n",
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent);
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    exit(U_ILLEGAL_ARGUMENT_ERROR);
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* store a character's properties ------------------------------------------- */
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern uint32_t
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerumakeProps(Props *p) {
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t den;
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t type, value, exp;
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* encode numeric type & value */
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    type=p->numericType;
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    value=p->numericValue;
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    den=p->denominator;
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    exp=p->exponent;
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(den!=0) {
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* fraction */
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if( type!=U_NT_NUMERIC ||
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM ||
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den ||
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exp!=0
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ) {
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return printNumericTypeValueError(p);
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        type=UPROPS_NT_FRACTION;
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(value==-1) {
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            value=0;
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        den-=UPROPS_FRACTION_DEN_OFFSET;
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den;
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if(exp!=0) {
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* very large value */
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if( type!=U_NT_NUMERIC ||
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            value<1 || 9<value ||
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ) {
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return printNumericTypeValueError(p);
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        type=UPROPS_NT_LARGE;
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(exp<=UPROPS_LARGE_MAX_EXP) {
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            /* 1..9 * 10^(2..17) */
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exp-=UPROPS_LARGE_EXP_OFFSET;
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            /* 1 * 10^(18..33) */
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(value!=1) {
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return printNumericTypeValueError(p);
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            value=0;
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA;
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if(value>UPROPS_MAX_SMALL_NUMBER) {
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* large value */
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(type!=U_NT_NUMERIC) {
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return printNumericTypeValueError(p);
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        type=UPROPS_NT_LARGE;
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* split the value into mantissa and exponent, base 10 */
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while((value%10)==0) {
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            value/=10;
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ++exp;
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(value>9) {
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return printNumericTypeValueError(p);
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exp-=UPROPS_LARGE_EXP_OFFSET;
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if(value<0) {
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* unable to encode negative values, other than fractions -1/x */
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return printNumericTypeValueError(p);
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* } else normal value=0..0xff { */
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* encode the properties */
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        (uint32_t)p->generalCategory |
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) |
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT);
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruaddProps(uint32_t c, uint32_t x) {
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!utrie_set32(pTrie, (UChar32)c, x)) {
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: too many entries for the properties trie\n");
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(U_BUFFER_OVERFLOW_ERROR);
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern uint32_t
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugetProps(uint32_t c) {
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return utrie_get32(pTrie, (UChar32)c, NULL);
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
/* areas of same properties ------------------------------------------------- */
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerurepeatProps(uint32_t first, uint32_t last, uint32_t x) {
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: too many entries for the properties trie\n");
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(U_BUFFER_OVERFLOW_ERROR);
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
/* generate output data ----------------------------------------------------- */
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugenerateData(const char *dataDir, UBool csource) {
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static int32_t indexes[UPROPS_INDEX_COUNT]={
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0,
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0,
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0,
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        0, 0, 0, 0
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    };
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static uint8_t trieBlock[40000];
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static uint8_t additionalProps[120000];
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UNewDataMemory *pData;
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode errorCode=U_ZERO_ERROR;
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t size = 0;
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t trieSize, additionalPropsSize, offset;
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    long dataLength;
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(errorCode)) {
412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(errorCode);
414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    offset=sizeof(indexes)/4;               /* uint32_t offset to the properties trie */
417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* round up trie size to 4-alignment */
419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    trieSize=(trieSize+3)&~3;
420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    offset+=trieSize>>2;
421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(beVerbose) {
427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("trie size in bytes:                    %5u\n", (int)trieSize);
428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(csource) {
431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* write .c file for hardcoded data */
432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UTrie trie={ NULL };
43385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        UTrie2 *trie2;
434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        FILE *f;
435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(
439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                stderr,
440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                u_errorName(errorCode));
44285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            exit(errorCode);
44385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        }
44485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
44585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        /* use UTrie2 */
44685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        dataInfo.formatVersion[0]=6;
44785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        dataInfo.formatVersion[2]=0;
44885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        dataInfo.formatVersion[3]=0;
44985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
45085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        if(U_FAILURE(errorCode)) {
45185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            fprintf(
45285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                stderr,
45385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                "genprops error: utrie2_fromUTrie() failed - %s\n",
45485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                u_errorName(errorCode));
45585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            exit(errorCode);
45685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        }
45785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        {
45885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            /* delete lead surrogate code unit values */
45985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            UChar lead;
46085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            trie2=utrie2_cloneAsThawed(trie2, &errorCode);
46185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            for(lead=0xd800; lead<0xdc00; ++lead) {
46285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
46385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            }
46485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
46585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            if(U_FAILURE(errorCode)) {
46685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                fprintf(
46785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                    stderr,
46885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                    "genprops error: deleting lead surrogate code unit values failed - %s\n",
46985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                    u_errorName(errorCode));
47085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                exit(errorCode);
47185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            }
472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        f=usrc_create(dataDir, "uchar_props_data.c");
475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(f!=NULL) {
476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            usrc_writeArray(f,
477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const UVersionInfo formatVersion={",
478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                dataInfo.formatVersion, 8, 4,
479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            usrc_writeArray(f,
481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const UVersionInfo dataVersion={",
482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                dataInfo.dataVersion, 8, 4,
483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
48485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            usrc_writeUTrie2Arrays(f,
485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const uint16_t propsTrie_index[%ld]={\n", NULL,
48685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                trie2,
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "\n};\n\n");
48885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            usrc_writeUTrie2Struct(f,
48985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                "static const UTrie2 propsTrie={\n",
49085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                trie2, "propsTrie_index", NULL,
491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            size=4*offset+additionalPropsSize;      /* total size of data */
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            usrc_writeArray(f,
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                indexes, 32, UPROPS_INDEX_COUNT,
499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                "};\n\n");
500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fclose(f);
501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
50285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        utrie2_close(trie2);
503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* write the data */
505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(errorCode);
510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        size=4*offset+additionalPropsSize;      /* total size of data */
514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        udata_writeBlock(pData, indexes, sizeof(indexes));
516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        udata_writeBlock(pData, trieBlock, trieSize);
517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        udata_writeBlock(pData, additionalProps, additionalPropsSize);
518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        /* finish up */
520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        dataLength=udata_finish(pData, &errorCode);
521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(errorCode);
524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(dataLength!=(long)size) {
527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                dataLength, (unsigned long)size);
529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(U_INTERNAL_PROGRAM_ERROR);
530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(beVerbose) {
534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("data size:                            %6lu\n", (unsigned long)size);
535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */
545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
546