store.c revision 85bf2e2fbc60a9f938064abc8127d61da7d19882
1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************* 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho* Copyright (C) 1999-2008, International Business Machines 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************* 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: store.c 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 1999dec11 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus W. Scherer 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Store Unicode character properties efficiently for 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* random access. 
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h> 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "utrie.h" 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/udata.h" 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unewdata.h" 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "writesrc.h" 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uprops.h" 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "genprops.h" 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DO_DEBUG_OUT 0 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* Unicode character properties file format ------------------------------------ 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThe file format prepared and written here contains several data 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustructures that store indexes or data. 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBefore the data contents described below, there are the headers required by 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruthe udata API for loading ICU data. 
In particular, a UDataInfo structure
precedes the actual data. It contains platform property values and the
file format version.

The following is a description of format version 5.

The format changes between version 3 and 4 because the properties related to
case mappings and bidi/shaping are pulled out into separate files
for modularization.
In order to reduce the need for code changes, some of the previous data
structures are omitted, rather than rearranging everything.

For details see "Changes in format version 4" below.

Format version 5 became necessary because the bit field for script codes
overflowed. Several bit fields got rearranged, and three (Script, Block,
Word_Break) got widened by one bit each.

Data contents:

The content is a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.

Any Unicode code point from 0 to 0x10ffff can be looked up to get
the properties, if any, for that code point. This means that the inputs
to the lookup are 21-bit unsigned integers, with not all of the
21-bit range used.

It is assumed that client code keeps a uint32_t pointer
to the beginning of the data:

    const uint32_t *p32;

Formally, the file contains the following structures:

    const int32_t indexes[16] with values i0..i15:

  i0 indicates the length of the main trie.
  i0..i3 all have the same value in format version 4.0;
         the related props32[] and exceptions[] and uchars[] were used in format version 3

    i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
    i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
    i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings

    i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
    i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
    i5 additionalVectorsColumns; -- number of 32-bit words per properties vector

    i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
    i7..i9 reservedIndexes; -- reserved values; 0 for now

    i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
    i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
    i12..i15 reservedIndexes; -- reserved values; 0 for now

    PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))

  P, E, and U are not used (empty) in format version 4

    P  const uint32_t props32[i1-i0];
    E  const uint32_t exceptions[i2-i1];
    U  const UChar uchars[2*(i3-i2)];

    AT serialized trie for additional properties (byte size: 4*(i4-i3))
    PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];

Trie lookup and properties:

In order to condense the data for the 21-bit code space, several properties of
the Unicode code assignment are exploited:
- The code space is sparse.
- There are several ranges of 10k or more consecutive codes with the same properties.
- Characters and scripts are allocated in groups of 16 code points.
- Inside blocks for scripts the properties are often repetitive.
- The 21-bit space is not fully used for Unicode.

The lookup of properties for a given code point is done with a trie lookup,
using the UTrie implementation.
The trie lookup result is a 16-bit properties word.

With a given Unicode code point

    UChar32 c;

and 0<=c<0x110000, the lookup is done like this:

    uint16_t props;
    UTRIE_GET16(trie, c, props);

Each 16-bit properties word contains:

 0.. 4  general category
 5.. 7  numeric type
        non-digit numbers are stored with multiple types and pseudo-types
        in order to facilitate compact encoding:
        0 no numeric value (0)
        1 decimal digit value (0..9)
        2 digit value (0..9)
        3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
        4 (internal type UPROPS_NT_FRACTION) fraction
        5 (internal type UPROPS_NT_LARGE) large number >0xff
        6..7 reserved

        when returning the numeric type from a public API,
        internal types must be turned into U_NT_NUMERIC

 8..15  numeric value
        encoding of fractions and large numbers see below

Fractions:
    // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
    int32_t num, den;
    num=n>>3;       // num=0..31
    den=(n&7)+2;    // den=2..9
    if(num==0) {
        num=-1;     // num=-1 or 1..31
    }
    double result=(double)num/(double)den;

Large numbers:
    // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
    int32_t m, e;
    m=n>>4;         // m=0..15
    e=(n&0xf);
    if(m==0) {
        m=1;        // for large powers of 10
        e+=18;      // e=18..33
    } else {
        e+=2;       // e=2..17
    } // m==10..15 are reserved
    double result=(double)m*10^e;   // pseudo-code: 10^e means 10 to the power of e, not XOR

--- Additional properties (new in format version 2.1) ---

The second trie for additional properties (AT) is also a UTrie with 16-bit data.
The data words consist of 32-bit unit indexes (not row indexes!) into the
table of unique properties vectors (PV).
Each vector contains a set of properties.
The width of a vector (number of uint32_t per row) may change
with the formatVersion; it is stored in i5.

Current properties: see icu/source/common/uprops.h

--- Changes in format version 3.1 ---

See i10 maxValues above; it contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.

--- Changes in format version 3.2 ---

- The tries use linear Latin-1 ranges.
- The additional properties bits store full properties XYZ instead
  of partial Other_XYZ, so that changes in the derivation formulas
  need not be tracked in runtime library code.
- Joining Type and Line Break are also stored completely, so that uprops.c
  needs no runtime formulas for enumerated properties either.
- Store the case-sensitive flag in the main properties word.
- i10 also contains U_LB_COUNT and U_EA_COUNT.
- i11 contains maxValues2 for vector word 2.

--- Changes in format version 4 ---

The format changes between version 3 and 4 because the properties related to
case mappings and bidi/shaping are pulled out into separate files
for modularization.
In order to reduce the need for code changes, some of the previous data
structures are omitted, rather than rearranging everything.

(The change to format version 4 is for ICU 3.4. The last CVS revision of
genprops/store.c for format version 3.2 is 1.48.)

The main trie's data is significantly simplified:
- The trie's 16-bit data word is used directly instead of as an index
  into props32[].
- The trie uses the default trie folding functions instead of custom ones.
- Numeric values are stored directly in the trie data word, with special
  encodings.
- No more exception data (the data that needed it was pulled out, or, in the
  case of numeric values, encoded differently).
- No more string data (pulled out - was for case mappings).

Also, some of the previously used properties vector bits are reserved again.

The indexes[] values for the omitted structures are still filled in
(indicating zero-length arrays) so that the swapper code remains unchanged.

--- Changes in format version 5 ---

Rearranged bit fields in the second trie (AT) because the script code field
overflowed. Old code would have seen nonsensically low values for new, higher
script codes.
23185bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoModified bit fields in icu/source/common/uprops.h 23285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru----------------------------------------------------------------------------- */ 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* UDataInfo cf. udata.h */ 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UDataInfo dataInfo={ 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru sizeof(UDataInfo), 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_IS_BIG_ENDIAN, 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_CHARSET_FAMILY, 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_SIZEOF_UCHAR, 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */ 24685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho { 5, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 24785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho { 5, 1, 0, 0 } /* dataVersion */ 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UNewTrie *pTrie=NULL; 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* -------------------------------------------------------------------------- */ 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void 
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerusetUnicodeVersion(const char *v) { 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UVersionInfo version; 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru u_versionFromString(version, v); 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy(dataInfo.dataVersion, version, 4); 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruinitStore() { 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE); 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pTrie==NULL) { 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "error: unable to create a UNewTrie\n"); 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(U_MEMORY_ALLOCATION_ERROR); 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru initAdditionalProperties(); 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruexitStore() { 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utrie_close(pTrie); 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exitAdditionalProperties(); 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic uint32_t printNumericTypeValueError(Props *p) { 
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n", 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent); 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(U_ILLEGAL_ARGUMENT_ERROR); 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* store a character's properties ------------------------------------------- */ 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern uint32_t 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerumakeProps(Props *p) { 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t den; 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t type, value, exp; 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* encode numeric type & value */ 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru type=p->numericType; 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value=p->numericValue; 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru den=p->denominator; 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exp=p->exponent; 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(den!=0) { 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* fraction */ 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( type!=U_NT_NUMERIC || 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value<-1 || value==0 || 
value>UPROPS_FRACTION_MAX_NUM || 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den || 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exp!=0 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return printNumericTypeValueError(p); 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru type=UPROPS_NT_FRACTION; 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(value==-1) { 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value=0; 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru den-=UPROPS_FRACTION_DEN_OFFSET; 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den; 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(exp!=0) { 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* very large value */ 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( type!=U_NT_NUMERIC || 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value<1 || 9<value || 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return printNumericTypeValueError(p); 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru type=UPROPS_NT_LARGE; 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(exp<=UPROPS_LARGE_MAX_EXP) { 
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 1..9 * 10^(2..17) */ 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exp-=UPROPS_LARGE_EXP_OFFSET; 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 1 * 10^(18..33) */ 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(value!=1) { 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return printNumericTypeValueError(p); 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value=0; 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA; 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp; 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(value>UPROPS_MAX_SMALL_NUMBER) { 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* large value */ 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(type!=U_NT_NUMERIC) { 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return printNumericTypeValueError(p); 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru type=UPROPS_NT_LARGE; 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* split the value into mantissa and exponent, base 10 */ 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while((value%10)==0) { 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value/=10; 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++exp; 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(value>9) { 
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return printNumericTypeValueError(p); 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exp-=UPROPS_LARGE_EXP_OFFSET; 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp; 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(value<0) { 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* unable to encode negative values, other than fractions -1/x */ 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return printNumericTypeValueError(p); 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* } else normal value=0..0xff { */ 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* encode the properties */ 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (uint32_t)p->generalCategory | 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) | 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT); 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruaddProps(uint32_t c, uint32_t x) { 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!utrie_set32(pTrie, (UChar32)c, x)) { 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "error: too many entries for the 
properties trie\n"); 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(U_BUFFER_OVERFLOW_ERROR); 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern uint32_t 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugetProps(uint32_t c) { 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return utrie_get32(pTrie, (UChar32)c, NULL); 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* areas of same properties ------------------------------------------------- */ 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerurepeatProps(uint32_t first, uint32_t last, uint32_t x) { 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) { 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "error: too many entries for the properties trie\n"); 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(U_BUFFER_OVERFLOW_ERROR); 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* generate output data ----------------------------------------------------- */ 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruextern void 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugenerateData(const 
char *dataDir, UBool csource) { 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static int32_t indexes[UPROPS_INDEX_COUNT]={ 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static uint8_t trieBlock[40000]; 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static uint8_t additionalProps[120000]; 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UNewDataMemory *pData; 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode errorCode=U_ZERO_ERROR; 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t size = 0; 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t trieSize, additionalPropsSize, offset; 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru long dataLength; 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); 411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(errorCode); 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offset=sizeof(indexes)/4; /* uint32_t offset to the 
properties trie */ 417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* round up trie size to 4-alignment */ 419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru trieSize=(trieSize+3)&~3; 420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offset+=trieSize>>2; 421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */ 422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */ 423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */ 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset; 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(beVerbose) { 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("trie size in bytes: %5u\n", (int)trieSize); 428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(csource) { 431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* write .c file for hardcoded data */ 432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UTrie trie={ NULL }; 43385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho UTrie2 *trie2; 434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru FILE *f; 435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); 437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 
438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf( 439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru stderr, 440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n", 441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru u_errorName(errorCode)); 44285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho exit(errorCode); 44385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 44485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho 44585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho /* use UTrie2 */ 44685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho dataInfo.formatVersion[0]=6; 44785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho dataInfo.formatVersion[2]=0; 44885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho dataInfo.formatVersion[3]=0; 44985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho trie2=utrie2_fromUTrie(&trie, 0, &errorCode); 45085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho if(U_FAILURE(errorCode)) { 45185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho fprintf( 45285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho stderr, 45385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho "genprops error: utrie2_fromUTrie() failed - %s\n", 45485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho u_errorName(errorCode)); 45585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho exit(errorCode); 45685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 45785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho { 45885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho /* delete lead surrogate code unit values */ 45985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho UChar lead; 46085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho trie2=utrie2_cloneAsThawed(trie2, &errorCode); 46185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho for(lead=0xd800; lead<0xdc00; ++lead) { 46285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, 
trie2->initialValue, &errorCode); 46385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 46485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode); 46585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho if(U_FAILURE(errorCode)) { 46685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho fprintf( 46785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho stderr, 46885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho "genprops error: deleting lead surrogate code unit values failed - %s\n", 46985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho u_errorName(errorCode)); 47085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho exit(errorCode); 47185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru f=usrc_create(dataDir, "uchar_props_data.c"); 475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(f!=NULL) { 476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru usrc_writeArray(f, 477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "static const UVersionInfo formatVersion={", 478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru dataInfo.formatVersion, 8, 4, 479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "};\n\n"); 480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru usrc_writeArray(f, 481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "static const UVersionInfo dataVersion={", 482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru dataInfo.dataVersion, 8, 4, 483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "};\n\n"); 48485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho usrc_writeUTrie2Arrays(f, 485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "static const uint16_t propsTrie_index[%ld]={\n", NULL, 48685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho trie2, 
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "\n};\n\n"); 48885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho usrc_writeUTrie2Struct(f, 48985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho "static const UTrie2 propsTrie={\n", 49085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho trie2, "propsTrie_index", NULL, 491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "};\n\n"); 492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes); 494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru size=4*offset+additionalPropsSize; /* total size of data */ 495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru usrc_writeArray(f, 497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "static const int32_t indexes[UPROPS_INDEX_COUNT]={", 498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru indexes, 32, UPROPS_INDEX_COUNT, 499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "};\n\n"); 500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(f); 501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 50285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho utrie2_close(trie2); 503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* write the data */ 505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, 506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru haveCopyright ? 
U_COPYRIGHT_STRING : NULL, &errorCode); 507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode)); 509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(errorCode); 510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes); 513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru size=4*offset+additionalPropsSize; /* total size of data */ 514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru udata_writeBlock(pData, indexes, sizeof(indexes)); 516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru udata_writeBlock(pData, trieBlock, trieSize); 517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru udata_writeBlock(pData, additionalProps, additionalPropsSize); 518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* finish up */ 520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru dataLength=udata_finish(pData, &errorCode); 521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "genprops: error %d writing the output file\n", errorCode); 523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(errorCode); 524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(dataLength!=(long)size) { 527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste 
Queru fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", 528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru dataLength, (unsigned long)size); 529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(U_INTERNAL_PROGRAM_ERROR); 530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(beVerbose) { 534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("data size: %6lu\n", (unsigned long)size); 535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hey, Emacs, please set the following: 540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Local Variables: 542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * indent-tabs-mode: nil 543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * End: 544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 546