1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  store.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004aug28
14*   created by: Markus W. Scherer
15*
16*   Store Unicode case mapping properties efficiently for
17*   random access.
18*/
19
20#include <stdio.h>
21#include <stdlib.h>
22#include "unicode/utypes.h"
23#include "unicode/uchar.h"
24#include "unicode/ustring.h"
25#include "cmemory.h"
26#include "cstring.h"
27#include "filestrm.h"
28#include "utrie.h"
29#include "utrie2.h"
30#include "uarrsort.h"
31#include "unicode/udata.h"
32#include "unewdata.h"
33#include "propsvec.h"
34#include "writesrc.h"
35#include "gencase.h"
36
37#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
38
39/* Unicode case mapping properties file format ---------------------------------
40
41The file format prepared and written here contains several data
42structures that store indexes or data.
43
44Before the data contents described below, there are the headers required by
45the udata API for loading ICU data. Especially, a UDataInfo structure
46precedes the actual data. It contains platform properties values and the
47file format version.
48
49The following is a description of format version 1.2 .
50
51Format version 1.1 adds data for case closure.
52
53Format version 1.2 adds an exception bit for case-ignorable. Needed because
54the Cased and Case_Ignorable properties are not disjoint.
55
56The file contains the following structures:
57
58    const int32_t indexes[i0] with values i0, i1, ...:
59    (see UCASE_IX_... constants for names of indexes)
60
61    i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
62    i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
63    i2 trieSize; -- size in bytes of the case mapping properties trie
64    i3 exceptionsLength; -- length in uint16_t of the exceptions array
65    i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
66
67    i5..i14 reservedIndexes; -- reserved values; 0 for now
68
69    i15 maxFullLength; -- maximum length of a full case mapping/folding string
70
71
72    Serialized trie, see utrie.h;
73
74    const uint16_t exceptions[exceptionsLength];
75
76    const UChar unfold[unfoldLength];
77
78
79Trie data word:
80Bits
81if(exception) {
82    15..4   unsigned exception index
83} else {
84    if(not uncased) {
85        15..6   signed delta to simple case mapping code point
86                (add delta to input code point)
87    } else {
88            6   the code point is case-ignorable
89                (U+0307 is also case-ignorable but has an exception)
90    }
91     5..4   0 normal character with cc=0
92            1 soft-dotted character
93            2 cc=230
94            3 other cc
95}
96    3   exception
97    2   case sensitive
98 1..0   0 uncased
99        1 lowercase
100        2 uppercase
101        3 titlecase
102
103
104Exceptions:
105A sub-array of the exceptions array is indexed by the exception index in a
106trie word.
107The sub-array consists of the following fields:
108    uint16_t excWord;
109    uint16_t optional values [];
110    UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
111
112excWord: (see UCASE_EXC_...)
113Bits
114    15  conditional case folding
115    14  conditional special casing
11613..12  same as non-exception trie data bits 5..4
117        moved here because the exception index needs more bits than the delta
118        0 normal character with cc=0
119        1 soft-dotted character
120        2 cc=230
121        3 other cc
12211      case-ignorable (used when the character is cased or has another exception)
123        (new in formatVersion 1.2/ICU 4.4)
12410.. 9  reserved
125     8  if set, then for each optional-value slot there are 2 uint16_t values
126        (high and low parts of 32-bit values)
127        instead of single ones
128 7.. 0  bits for which optional value is present
129
130Optional-value slots:
1310   lowercase mapping (code point)
1321   case folding (code point)
1332   uppercase mapping (code point)
1343   titlecase mapping (code point)
1354   reserved
1365   reserved
1376   closure mappings (new in format version 1.1)
1387   there is at least one full (string) case mapping
139    the length of each is encoded in a nibble of this optional value,
140    and the strings follow this optional value in the same order:
141    lower/fold/upper/title
142
143The optional closure mappings value is used as follows:
144Bits 0..3 contain the length of a string of code points for case closure.
145The string immediately follows the full case mappings, or the closure value
146slot if there are no full case mappings.
147Bits 4..15 are reserved and could be used in the future to indicate the
148number of strings for case closure.
149Complete case closure for a code point is given by the union of all simple
150and full case mappings and foldings, plus the case closure code points
151(and potentially, in the future, case closure strings).
152
153For space saving, some values are not stored. Lookups are as follows:
154- If special casing is conditional, then no full lower/upper/title mapping
155  strings are stored.
156- If case folding is conditional, then no simple or full case foldings are
157  stored.
158- Fall back in this order:
159    full (string) mapping -- if full mappings are used
160    simple (code point) mapping of the same type
161    simple fold->simple lower
162    simple title->simple upper
163    finally, the original code point (no mapping)
164
165This fallback order is strict:
166In particular, the fallback from full case folding is to simple case folding,
167not to full lowercase mapping.
168
169Reverse case folding data ("unfold") array: (new in format version 1.1)
170
171This array stores some miscellaneous values followed by a table. The data maps
172back from multi-character strings to their original code points, for use
173in case closure.
174
175The table contains two columns of strings.
176The string in the first column is the case folding of each of the code points
177in the second column. The strings are terminated with NUL or by the end of the
178column, whichever comes first.
179
180The miscellaneous data takes up one pseudo-row and includes:
181- number of rows
182- number of UChars per row
183- number of UChars in the left (folding string) column
184
185The table is sorted by its first column. Values in the first column are unique.
186
187----------------------------------------------------------------------------- */
188
189/* UDataInfo cf. udata.h */
190static UDataInfo dataInfo={
191    sizeof(UDataInfo),
192    0,
193
194    U_IS_BIG_ENDIAN,
195    U_CHARSET_FAMILY,
196    U_SIZEOF_UCHAR,
197    0,
198
199    /* dataFormat="cAsE" */
200    { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
201    { 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
202    { 4, 0, 1, 0 }                              /* dataVersion */
203};
204
205enum {
206    /* maximum number of exceptions expected */
207    MAX_EXC_COUNT=1000
208};
209
210/* exceptions values */
211static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100];
212static uint16_t exceptionsTop=0;
213static Props excProps[MAX_EXC_COUNT];
214static uint16_t exceptionsCount=0;
215
216/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
217static int32_t maxFullLength=U16_MAX_LENGTH;
218
219/* reverse case folding ("unfold") data */
220static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={
221    0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0
222};
223static uint16_t unfoldRows=0;
224static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH;
225
226/* Unicode versions --------------------------------------------------------- */
227
228static const UVersionInfo
229unicodeVersions[]={
230    { 1, 0, 0, 0 },
231    { 1, 1, 0, 0 },
232    { 2, 0, 0, 0 },
233    { 3, 0, 0, 0 },
234    { 3, 1, 0, 0 },
235    { 3, 2, 0, 0 },
236    { 4, 0, 0, 0 },
237    { 4, 0, 1, 0 },
238    { 4, 1, 0, 0 },
239    { 5, 1, 0, 0 },
240    { 5, 2, 0, 0 },
241    { 6, 0, 0, 0 }
242};
243
244int32_t ucdVersion=UNI_4_1;
245
246static int32_t
247findUnicodeVersion(const UVersionInfo version) {
248    int32_t i;
249
250    for(i=0; /* while(version>unicodeVersions[i]) {} */
251        i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
252        ++i) {}
253    if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
254        --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
255    }
256    return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
257}
258
259extern void
260setUnicodeVersion(const char *v) {
261    UVersionInfo version;
262    u_versionFromString(version, v);
263    uprv_memcpy(dataInfo.dataVersion, version, 4);
264    ucdVersion=findUnicodeVersion(version);
265}
266
267/* -------------------------------------------------------------------------- */
268
269static void
270addUnfolding(UChar32 c, const UChar *s, int32_t length) {
271    int32_t i;
272
273    if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
274        fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
275                (long)length, UGENCASE_UNFOLD_STRING_WIDTH);
276        exit(U_INTERNAL_PROGRAM_ERROR);
277    }
278    if(unfoldTop >= (LENGTHOF(unfold) - UGENCASE_UNFOLD_STRING_WIDTH)) {
279        fprintf(stderr, "gencase error: too many multi-character case foldings\n");
280        exit(U_BUFFER_OVERFLOW_ERROR);
281    }
282    u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH);
283    u_memcpy(unfold+unfoldTop, s, length);
284
285    i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH;
286    U16_APPEND_UNSAFE(unfold, i, c);
287
288    ++unfoldRows;
289    unfoldTop+=UGENCASE_UNFOLD_WIDTH;
290}
291
292/* store a character's properties ------------------------------------------- */
293
294extern void
295setProps(Props *p) {
296    UErrorCode errorCode;
297    uint32_t value, oldValue;
298    int32_t delta;
299
300    /* get the non-UnicodeData.txt properties */
301    value=oldValue=upvec_getValue(pv, p->code, 0);
302
303    /* default: map to self */
304    delta=0;
305
306    if(p->gc==U_TITLECASE_LETTER) {
307        /* the Titlecase property is read late, from UnicodeData.txt */
308        value|=UCASE_TITLE;
309    }
310
311    if(p->upperCase!=0) {
312        /* uppercase mapping as delta if the character is lowercase */
313        if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
314            delta=p->upperCase-p->code;
315        } else {
316            value|=UCASE_EXCEPTION;
317        }
318    }
319    if(p->lowerCase!=0) {
320        /* lowercase mapping as delta if the character is uppercase or titlecase */
321        if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
322            delta=p->lowerCase-p->code;
323        } else {
324            value|=UCASE_EXCEPTION;
325        }
326    }
327    if(p->upperCase!=p->titleCase) {
328        value|=UCASE_EXCEPTION;
329    }
330    if(p->closure[0]!=0) {
331        value|=UCASE_EXCEPTION;
332    }
333    if(p->specialCasing!=NULL) {
334        value|=UCASE_EXCEPTION;
335    }
336    if(p->caseFolding!=NULL) {
337        value|=UCASE_EXCEPTION;
338    }
339
340    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
341        value|=UCASE_EXCEPTION;
342    }
343
344    if(p->cc!=0) {
345        if(value&UCASE_DOT_MASK) {
346            fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
347            exit(U_INTERNAL_PROGRAM_ERROR);
348        }
349        if(p->cc==230) {
350            value|=UCASE_ABOVE;
351        } else {
352            value|=UCASE_OTHER_ACCENT;
353        }
354    }
355
356    /*
357     * Encode case-ignorable as delta==1 on uncased characters,
358     * and with an exception bit on cased characters and characters with another exception.
359     */
360    if(ucdVersion>=UNI_4_1) {
361        /*
362         * Unicode 4.1 & 5.0: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk
363         * Unicode 5.1: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk
364         *   The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet.
365         * Unicode 5.2: The definition (Unicode Standard Definition D121) is unchanged,
366         *   but now Case_Ignorable is a public property
367         *   with its values listed in DerivedCoreProperties.txt.
368         *   gencase.c parses those values as well, just in case the definition changes
369         *   in the future. gencase.c sets the UGENCASE_IS_MID_LETTER_SHIFT bit
370         *   for each Case_Ignorable entry. (It never resets that bit.)
371         */
372        if(
373            (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
374            (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0
375        ) {
376            p->isCaseIgnorable=TRUE;
377        }
378    } else {
379        /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
380        if(
381            (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
382            p->code==0x27 || p->code==0xad || p->code==0x2019
383        ) {
384            p->isCaseIgnorable=TRUE;
385        }
386    }
387    if(p->isCaseIgnorable) {
388        if((value&UCASE_TYPE_MASK)==UCASE_NONE) {
389            /*
390             * We use one of the delta/exception bits for
391             * the case-ignorable flag for uncased characters.
392             * There is no delta for uncased characters (see checks above).
393             */
394            delta=1;
395        } else {
396            /*
397             * If the character is cased or has another exception,
398             * then we store the case-ignorable flag as an exception bit.
399             */
400            value|=UCASE_EXCEPTION;
401        }
402    }
403
404    /* handle exceptions */
405    if(value&UCASE_EXCEPTION) {
406        /* simply store exceptions for later processing and encoding */
407        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
408        uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
409        if(++exceptionsCount==MAX_EXC_COUNT) {
410            fprintf(stderr, "gencase: too many exceptions\n");
411            exit(U_INDEX_OUTOFBOUNDS_ERROR);
412        }
413    } else {
414        /* store the simple case mapping delta */
415        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
416    }
417
418    errorCode=U_ZERO_ERROR;
419    if(value!=oldValue) {
420        upvec_setValue(pv, p->code, p->code, 0, value, 0xffffffff, &errorCode);
421        if(U_FAILURE(errorCode)) {
422            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
423                            u_errorName(errorCode));
424            exit(errorCode);
425        }
426    }
427
428    /* add the multi-character case folding to the "unfold" data */
429    if(p->caseFolding!=NULL) {
430        int32_t length=p->caseFolding->full[0];
431        if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
432            addUnfolding(p->code, p->caseFolding->full+1, length);
433        }
434    }
435}
436
437extern void
438addCaseSensitive(UChar32 first, UChar32 last) {
439    UErrorCode errorCode=U_ZERO_ERROR;
440    upvec_setValue(pv, first, last, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode);
441    if(U_FAILURE(errorCode)) {
442        fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
443                        u_errorName(errorCode));
444        exit(errorCode);
445    }
446}
447
448/* finalize reverse case folding ("unfold") data ---------------------------- */
449
450static int32_t U_CALLCONV
451compareUnfold(const void *context, const void *left, const void *right) {
452    return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH);
453}
454
455static void
456makeUnfoldData() {
457    static const UChar
458        iDot[2]=        { 0x69, 0x307 };
459
460    UChar *p, *q;
461    int32_t i, j, k;
462    UErrorCode errorCode;
463
464    /*
465     * add a case folding that we missed because it's conditional:
466     * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
467     */
468    addUnfolding(0x130, iDot, 2);
469
470    /* sort the data */
471    errorCode=U_ZERO_ERROR;
472    uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
473                   compareUnfold, NULL, FALSE, &errorCode);
474
475    /* make unique-string rows by merging adjacent ones' code point columns */
476
477    /* make p point to row i-1 */
478    p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH;
479
480    for(i=1; i<unfoldRows;) {
481        if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
482            /* concatenate code point columns */
483            q=p+UGENCASE_UNFOLD_STRING_WIDTH;
484            for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
485            for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
486                q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
487            }
488            if(j>UGENCASE_UNFOLD_CP_WIDTH) {
489                fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
490                        (long)j, UGENCASE_UNFOLD_CP_WIDTH);
491                exit(U_BUFFER_OVERFLOW_ERROR);
492            }
493
494            /* move following rows up one */
495            --unfoldRows;
496            unfoldTop-=UGENCASE_UNFOLD_WIDTH;
497            u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
498        } else {
499            p+=UGENCASE_UNFOLD_WIDTH;
500            ++i;
501        }
502    }
503
504    unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
505
506    if(beVerbose) {
507        puts("unfold data:");
508
509        p=(UChar *)unfold;
510        for(i=0; i<unfoldRows; ++i) {
511            p+=UGENCASE_UNFOLD_WIDTH;
512            printf("[%2d] %04x %04x %04x <- %04x %04x\n",
513                   (int)i, p[0], p[1], p[2], p[3], p[4]);
514        }
515    }
516}
517
518/* case closure ------------------------------------------------------------- */
519
520static void
521addClosureMapping(UChar32 src, UChar32 dest) {
522    uint32_t value;
523
524    if(beVerbose) {
525        printf("add closure mapping U+%04lx->U+%04lx\n",
526                (unsigned long)src, (unsigned long)dest);
527    }
528
529    value=upvec_getValue(pv, src, 0);
530    if(value&UCASE_EXCEPTION) {
531        Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
532        int32_t i;
533
534        /* append dest to src's closure array */
535        for(i=0;; ++i) {
536            if(i==LENGTHOF(p->closure)) {
537                fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n",
538                                (unsigned long)src, (unsigned long)dest);
539                exit(U_BUFFER_OVERFLOW_ERROR);
540            } else if(p->closure[i]==dest) {
541                break; /* do not store duplicates */
542            } else if(p->closure[i]==0) {
543                p->closure[i]=dest;
544                break;
545            }
546        }
547    } else {
548        Props p2={ 0 };
549        UChar32 next;
550        UErrorCode errorCode;
551
552        /*
553         * decode value into p2 (enough for makeException() to work properly),
554         * add the closure mapping,
555         * and set the new exception for src
556         */
557        p2.code=src;
558        p2.closure[0]=dest;
559
560        if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
561            /* one simple case mapping, don't care which one */
562            next=src+((int16_t)value>>UCASE_DELTA_SHIFT);
563            if(next!=src) {
564                if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
565                    p2.upperCase=p2.titleCase=next;
566                } else {
567                    p2.lowerCase=next;
568                }
569            }
570        } else if(value&UCASE_DELTA_MASK) {
571            fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
572                            (unsigned long)src);
573            exit(U_INTERNAL_PROGRAM_ERROR);
574        }
575
576        value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */
577        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
578        value|=UCASE_EXCEPTION;
579        uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2));
580        if(++exceptionsCount==MAX_EXC_COUNT) {
581            fprintf(stderr, "gencase: too many exceptions\n");
582            exit(U_INDEX_OUTOFBOUNDS_ERROR);
583        }
584
585        errorCode=U_ZERO_ERROR;
586        upvec_setValue(pv, src, src, 0, value, 0xffffffff, &errorCode);
587        if(U_FAILURE(errorCode)) {
588            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
589                            u_errorName(errorCode));
590            exit(errorCode);
591        }
592    }
593}
594
595/*
596 * Find missing case mapping relationships and add mappings for case closure.
597 * This function starts from an "original" code point and recursively
598 * finds its case mappings and the case mappings of where it maps to.
599 *
600 * The recursion depth is capped at 3 nested calls of this function.
601 * In each call, the current code point is c, and the function enumerates
602 * all of c's simple (single-code point) case mappings.
603 * prev is the code point that case-mapped to c.
604 * prev2 is the code point that case-mapped to prev.
605 *
606 * The initial function call has prev2<0, prev<0, and c==orig
607 * (marking no code points).
608 * It enumerates c's case mappings and recurses without further action.
609 *
610 * The second-level function call has prev2<0, prev==orig, and c is
611 * the destination code point of one of prev's case mappings.
612 * The function checks if any of c's case mappings go back to orig
613 * and adds a closure mapping if not.
614 * In other words, it turns a case mapping relationship of
615 *   orig->c
616 * into
617 *   orig<->c
618 *
619 * The third-level function call has prev2==orig, prev>=0, and c is
620 * the destination code point of one of prev's case mappings.
621 * (And prev is the destination of one of prev2's case mappings.)
622 * The function checks if any of c's case mappings go back to orig
623 * and adds a closure mapping if not.
624 * In other words, it turns case mapping relationships of
625 *   orig->prev->c or orig->prev<->c
626 * into
627 *   orig->prev->c->orig or orig->prev<->c->orig
628 * etc.
629 * (Graphically, this closes a triangle.)
630 *
631 * With repeated application on all code points until no more closure mappings
632 * are added, all case equivalence groups get complete mappings.
633 * That is, in each group of code points with case relationships
634 * each code point will in the end have some mapping to each other
635 * code point in the group.
636 *
637 * @return TRUE if a closure mapping was added
638 */
639static UBool
640addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) {
641    UChar32 next;
642    UBool someMappingsAdded=FALSE;
643
644    if(c!=orig) {
645        /* get the properties for c */
646        value=upvec_getValue(pv, c, 0);
647    }
648    /* else if c==orig then c's value was passed in */
649
650    if(value&UCASE_EXCEPTION) {
651        UChar32 set[32];
652        int32_t i, count=0;
653
654        Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
655
656        /*
657         * marker for whether any of c's mappings goes to orig
658         * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
659         */
660        UBool mapsToOrig=(UBool)(c==orig);
661
662        /* collect c's case mapping destinations in set[] */
663        if((next=p->upperCase)!=0 && next!=c) {
664            set[count++]=next;
665        }
666        if((next=p->lowerCase)!=0 && next!=c) {
667            set[count++]=next;
668        }
669        if(p->upperCase!=(next=p->titleCase) && next!=c) {
670            set[count++]=next;
671        }
672        if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) {
673            set[count++]=next;
674        }
675
676        /* append c's current closure mappings to set[] */
677        for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) {
678            set[count++]=next;
679        }
680
681        /* process all code points to which c case-maps */
682        for(i=0; i<count; ++i) {
683            next=set[i]; /* next!=c */
684
685            if(next==orig) {
686                mapsToOrig=TRUE; /* remember that we map to orig */
687            } else if(prev2<0 && next!=prev) {
688                /*
689                 * recurse unless
690                 * we have reached maximum depth (prev2>=0) or
691                 * this is a mapping to one of the previous code points (orig, prev, c)
692                 */
693                someMappingsAdded|=addClosure(orig, prev, c, next, 0);
694            }
695        }
696
697        if(!mapsToOrig) {
698            addClosureMapping(c, orig);
699            return TRUE;
700        }
701    } else {
702        if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
703            /* one simple case mapping, don't care which one */
704            next=c+((int16_t)value>>UCASE_DELTA_SHIFT);
705            if(next!=c) {
706                /*
707                 * recurse unless
708                 * we have reached maximum depth (prev2>=0) or
709                 * this is a mapping to one of the previous code points (orig, prev, c)
710                 */
711                if(prev2<0 && next!=orig && next!=prev) {
712                    someMappingsAdded|=addClosure(orig, prev, c, next, 0);
713                }
714
715                if(c!=orig && next!=orig) {
716                    /* c does not map to orig, add a closure mapping c->orig */
717                    addClosureMapping(c, orig);
718                    return TRUE;
719                }
720            }
721        }
722    }
723
724    return someMappingsAdded;
725}
726
727extern void
728makeCaseClosure() {
729    UChar *p;
730    uint32_t *row;
731    uint32_t value;
732    UChar32 start, end, c, c2;
733    int32_t i, j;
734    UBool someMappingsAdded;
735
736    /*
737     * finalize the "unfold" data because we need to use it to add closure mappings
738     * for situations like FB05->"st"<-FB06
739     * where we would otherwise miss the FB05<->FB06 relationship
740     */
741    makeUnfoldData();
742
743    /* use the "unfold" data to add mappings */
744
745    /* p always points to the code points; this loop ignores the strings completely */
746    p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
747
748    for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
749        j=0;
750        U16_NEXT_UNSAFE(p, j, c);
751        while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
752            U16_NEXT_UNSAFE(p, j, c2);
753            addClosure(c, U_SENTINEL, c, c2, 0);
754        }
755    }
756
757    if(beVerbose) {
758        puts("---- ---- ---- ---- (done with closures from unfolding)");
759    }
760
761    /* add further closure mappings from analyzing simple mappings */
762    do {
763        someMappingsAdded=FALSE;
764
765        i=0;
766        while((row=upvec_getRow(pv, i, &start, &end))!=NULL && start<UPVEC_FIRST_SPECIAL_CP) {
767            value=*row;
768            if(value!=0) {
769                while(start<=end) {
770                    if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) {
771                        someMappingsAdded=TRUE;
772
773                        /*
774                         * stop this loop because pv was changed and row is not valid any more
775                         * skip all rows below the current start
776                         */
777                        while((row=upvec_getRow(pv, i, NULL, &end))!=NULL && start>end) {
778                            ++i;
779                        }
780                        row=NULL; /* signal to continue with outer loop, without further ++i */
781                        break;
782                    }
783                    ++start;
784                }
785                if(row==NULL) {
786                    continue; /* see row=NULL above */
787                }
788            }
789            ++i;
790        }
791
792        if(beVerbose && someMappingsAdded) {
793            puts("---- ---- ---- ----");
794        }
795    } while(someMappingsAdded);
796}
797
798/* exceptions --------------------------------------------------------------- */
799
800/* get the string length from zero-terminated code points in a limited-length array */
801static int32_t
802getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) {
803    int32_t i, length;
804
805    for(i=length=0; i<maxLength && s[i]!=0; ++i) {
806        length+=U16_LENGTH(s[i]);
807    }
808    return length;
809}
810
811static UBool
812fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
813    int32_t i, length;
814    UChar32 full;
815
816    length=*s++;
817    if(length==0 || length>U16_MAX_LENGTH) {
818        return FALSE;
819    }
820    i=0;
821    U16_NEXT(s, i, length, full);
822
823    if(simple==0) {
824        simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
825    }
826    return (UBool)(i==length && full==simple);
827}
828
829static uint16_t
830makeException(uint32_t value, Props *p) {
831    uint32_t slots[8];
832    uint32_t slotBits;
833    uint16_t excWord, i, count, length, fullLengths;
834    UBool doubleSlots;
835
836    /* exceptionsTop might be returned for storing in the trie word */
837    if(exceptionsTop>=UCASE_MAX_EXCEPTIONS) {
838        fprintf(stderr, "gencase error: too many exceptions words\n");
839        exit(U_BUFFER_OVERFLOW_ERROR);
840    }
841
842    /* copy and shift the soft-dotted bits */
843    excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;
844
845    if(p->isCaseIgnorable) {
846        excWord|=UCASE_EXC_CASE_IGNORABLE;
847    }
848
849    /* update maxFullLength */
850    if(p->specialCasing!=NULL) {
851        length=p->specialCasing->lowerCase[0];
852        if(length>maxFullLength) {
853            maxFullLength=length;
854        }
855        length=p->specialCasing->upperCase[0];
856        if(length>maxFullLength) {
857            maxFullLength=length;
858        }
859        length=p->specialCasing->titleCase[0];
860        if(length>maxFullLength) {
861            maxFullLength=length;
862        }
863    }
864    if(p->caseFolding!=NULL) {
865        length=p->caseFolding->full[0];
866        if(length>maxFullLength) {
867            maxFullLength=length;
868        }
869    }
870
871    /* set the bits for conditional mappings */
872    if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
873        excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
874        p->specialCasing=NULL;
875    }
876    if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
877        excWord|=UCASE_EXC_CONDITIONAL_FOLD;
878        p->caseFolding=NULL;
879    }
880
881    /*
882     * Note:
883     * UCD stores no simple mappings when they are the same as the code point itself.
884     * SpecialCasing and CaseFolding do store simple mappings even if they are
885     * the same as the code point itself.
886     * Comparisons between simple regular mappings and simple special/folding
887     * mappings need to compensate for the difference by comparing with the
888     * original code point if a simple UCD mapping is missing (0).
889     */
890
891    /* remove redundant data */
892    if(p->specialCasing!=NULL) {
893        /* do not store full mappings if they are the same as the simple ones */
894        if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) {
895            p->specialCasing->lowerCase[0]=0;
896        }
897        if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) {
898            p->specialCasing->upperCase[0]=0;
899        }
900        if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) {
901            p->specialCasing->titleCase[0]=0;
902        }
903    }
904    if( p->caseFolding!=NULL &&
905        fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code)
906    ) {
907        p->caseFolding->full[0]=0;
908    }
909
910    /* write the optional slots */
911    slotBits=0;
912    count=0;
913
914    if(p->lowerCase!=0) {
915        slots[count]=(uint32_t)p->lowerCase;
916        slotBits|=slots[count];
917        ++count;
918        excWord|=U_MASK(UCASE_EXC_LOWER);
919    }
920    if( p->caseFolding!=NULL &&
921        p->caseFolding->simple!=0 &&
922        (p->lowerCase!=0 ?
923            p->caseFolding->simple!=p->lowerCase :
924            p->caseFolding->simple!=p->code)
925    ) {
926        slots[count]=(uint32_t)p->caseFolding->simple;
927        slotBits|=slots[count];
928        ++count;
929        excWord|=U_MASK(UCASE_EXC_FOLD);
930    }
931    if(p->upperCase!=0) {
932        slots[count]=(uint32_t)p->upperCase;
933        slotBits|=slots[count];
934        ++count;
935        excWord|=U_MASK(UCASE_EXC_UPPER);
936    }
937    if(p->upperCase!=p->titleCase) {
938        if(p->titleCase!=0) {
939            slots[count]=(uint32_t)p->titleCase;
940        } else {
941            slots[count]=(uint32_t)p->code;
942        }
943        slotBits|=slots[count];
944        ++count;
945        excWord|=U_MASK(UCASE_EXC_TITLE);
946    }
947
948    /* length of case closure */
949    if(p->closure[0]!=0) {
950        length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure));
951        slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
952        slotBits|=slots[count];
953        ++count;
954        excWord|=U_MASK(UCASE_EXC_CLOSURE);
955    }
956
957    /* lengths of full case mapping strings, stored in the last slot */
958    fullLengths=0;
959    if(p->specialCasing!=NULL) {
960        fullLengths=p->specialCasing->lowerCase[0];
961        fullLengths|=p->specialCasing->upperCase[0]<<8;
962        fullLengths|=p->specialCasing->titleCase[0]<<12;
963    }
964    if(p->caseFolding!=NULL) {
965        fullLengths|=p->caseFolding->full[0]<<4;
966    }
967    if(fullLengths!=0) {
968        slots[count]=fullLengths;
969        slotBits|=slots[count];
970        ++count;
971        excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
972    }
973
974    if(count==0) {
975        /* No optional slots: Try to share excWord entries. */
976        uint16_t excIndex;
977        for(excIndex=0; excIndex<exceptionsTop; ++excIndex) {
978            if(excWord==exceptions[excIndex]) {
979                return excIndex;
980            }
981        }
982        /* not found */
983        ++exceptionsTop;
984        exceptions[excIndex]=excWord;
985        return excIndex;
986    } else {
987        /* write slots */
988        uint16_t excIndex=exceptionsTop;
989        uint16_t excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */
990
991        doubleSlots=(UBool)(slotBits>0xffff);
992        if(!doubleSlots) {
993            for(i=0; i<count; ++i) {
994                exceptions[excTop++]=(uint16_t)slots[i];
995            }
996        } else {
997            excWord|=UCASE_EXC_DOUBLE_SLOTS;
998            for(i=0; i<count; ++i) {
999                exceptions[excTop++]=(uint16_t)(slots[i]>>16);
1000                exceptions[excTop++]=(uint16_t)slots[i];
1001            }
1002        }
1003
1004        /* write the full case mapping strings */
1005        if(p->specialCasing!=NULL) {
1006            length=(uint16_t)p->specialCasing->lowerCase[0];
1007            u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length);
1008            excTop+=length;
1009        }
1010        if(p->caseFolding!=NULL) {
1011            length=(uint16_t)p->caseFolding->full[0];
1012            u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length);
1013            excTop+=length;
1014        }
1015        if(p->specialCasing!=NULL) {
1016            length=(uint16_t)p->specialCasing->upperCase[0];
1017            u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length);
1018            excTop+=length;
1019
1020            length=(uint16_t)p->specialCasing->titleCase[0];
1021            u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length);
1022            excTop+=length;
1023        }
1024
1025        /* write the closure data */
1026        if(p->closure[0]!=0) {
1027            UChar32 c;
1028
1029            for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) {
1030                U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c);
1031            }
1032        }
1033
1034        exceptionsTop=excTop;
1035
1036        /* write the main exceptions word */
1037        exceptions[excIndex]=excWord;
1038
1039        return excIndex;
1040    }
1041}
1042
1043extern void
1044makeExceptions() {
1045    uint32_t *row;
1046    uint32_t value;
1047    int32_t i;
1048    uint16_t excIndex;
1049
1050    i=0;
1051    while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) {
1052        value=*row;
1053        if(value&UCASE_EXCEPTION) {
1054            excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT));
1055            *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT);
1056        }
1057        ++i;
1058    }
1059}
1060
1061/* generate output data ----------------------------------------------------- */
1062
1063extern void
1064generateData(const char *dataDir, UBool csource) {
1065    static int32_t indexes[UCASE_IX_TOP]={
1066        UCASE_IX_TOP
1067    };
1068    static uint8_t trieBlock[40000];
1069
1070    const uint32_t *row;
1071    UChar32 start, end;
1072    int32_t i;
1073
1074    UNewDataMemory *pData;
1075    UNewTrie *pTrie;
1076    UErrorCode errorCode=U_ZERO_ERROR;
1077    int32_t trieSize;
1078    long dataLength;
1079
1080    pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
1081    if(pTrie==NULL) {
1082        fprintf(stderr, "gencase error: unable to create a UNewTrie\n");
1083        exit(U_MEMORY_ALLOCATION_ERROR);
1084    }
1085
1086    for(i=0; (row=upvec_getRow(pv, i, &start, &end))!=NULL; ++i) {
1087        if(start<UPVEC_FIRST_SPECIAL_CP && !utrie_setRange32(pTrie, start, end+1, *row, TRUE)) {
1088            fprintf(stderr, "gencase error: unable to set trie value (overflow)\n");
1089            exit(U_BUFFER_OVERFLOW_ERROR);
1090        }
1091    }
1092
1093    trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
1094    if(U_FAILURE(errorCode)) {
1095        fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
1096        exit(errorCode);
1097    }
1098
1099    indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
1100    indexes[UCASE_IX_TRIE_SIZE]=trieSize;
1101    indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop;
1102    indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop;
1103
1104    indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
1105
1106    if(beVerbose) {
1107        printf("trie size in bytes:                    %5d\n", (int)trieSize);
1108        printf("number of code points with exceptions: %5d\n", exceptionsCount);
1109        printf("size in bytes of exceptions:           %5d\n", 2*exceptionsTop);
1110        printf("size in bytes of reverse foldings:     %5d\n", 2*unfoldTop);
1111        printf("data size:                             %5d\n", (int)indexes[UCASE_IX_LENGTH]);
1112    }
1113
1114    if(csource) {
1115        /* write .c file for hardcoded data */
1116        UTrie trie={ NULL };
1117        UTrie2 *trie2;
1118        FILE *f;
1119
1120        utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
1121        if(U_FAILURE(errorCode)) {
1122            fprintf(
1123                stderr,
1124                "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n",
1125                u_errorName(errorCode));
1126            exit(errorCode);
1127        }
1128
1129        /* use UTrie2 */
1130        dataInfo.formatVersion[0]=2;
1131        dataInfo.formatVersion[2]=0;
1132        dataInfo.formatVersion[3]=0;
1133        trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
1134        if(U_FAILURE(errorCode)) {
1135            fprintf(
1136                stderr,
1137                "gencase error: utrie2_fromUTrie() failed - %s\n",
1138                u_errorName(errorCode));
1139            exit(errorCode);
1140        }
1141        {
1142            /* delete lead surrogate code unit values */
1143            UChar lead;
1144            trie2=utrie2_cloneAsThawed(trie2, &errorCode);
1145            for(lead=0xd800; lead<0xdc00; ++lead) {
1146                utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
1147            }
1148            utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
1149            if(U_FAILURE(errorCode)) {
1150                fprintf(
1151                    stderr,
1152                    "gencase error: deleting lead surrogate code unit values failed - %s\n",
1153                    u_errorName(errorCode));
1154                exit(errorCode);
1155            }
1156        }
1157
1158        f=usrc_create(dataDir, "ucase_props_data.c");
1159        if(f!=NULL) {
1160            usrc_writeArray(f,
1161                "static const UVersionInfo ucase_props_dataVersion={",
1162                dataInfo.dataVersion, 8, 4,
1163                "};\n\n");
1164            usrc_writeArray(f,
1165                "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
1166                indexes, 32, UCASE_IX_TOP,
1167                "};\n\n");
1168            usrc_writeUTrie2Arrays(f,
1169                "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL,
1170                trie2,
1171                "\n};\n\n");
1172            usrc_writeArray(f,
1173                "static const uint16_t ucase_props_exceptions[%ld]={\n",
1174                exceptions, 16, exceptionsTop,
1175                "\n};\n\n");
1176            usrc_writeArray(f,
1177                "static const uint16_t ucase_props_unfold[%ld]={\n",
1178                unfold, 16, unfoldTop,
1179                "\n};\n\n");
1180            fputs(
1181                "static const UCaseProps ucase_props_singleton={\n"
1182                "  NULL,\n"
1183                "  ucase_props_indexes,\n"
1184                "  ucase_props_exceptions,\n"
1185                "  ucase_props_unfold,\n",
1186                f);
1187            usrc_writeUTrie2Struct(f,
1188                "  {\n",
1189                trie2, "ucase_props_trieIndex", NULL,
1190                "  },\n");
1191            usrc_writeArray(f, "  { ", dataInfo.formatVersion, 8, 4, " }\n");
1192            fputs("};\n", f);
1193            fclose(f);
1194        }
1195        utrie2_close(trie2);
1196    } else {
1197        /* write the data */
1198        pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo,
1199                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1200        if(U_FAILURE(errorCode)) {
1201            fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode));
1202            exit(errorCode);
1203        }
1204
1205        udata_writeBlock(pData, indexes, sizeof(indexes));
1206        udata_writeBlock(pData, trieBlock, trieSize);
1207        udata_writeBlock(pData, exceptions, 2*exceptionsTop);
1208        udata_writeBlock(pData, unfold, 2*unfoldTop);
1209
1210        /* finish up */
1211        dataLength=udata_finish(pData, &errorCode);
1212        if(U_FAILURE(errorCode)) {
1213            fprintf(stderr, "gencase: error %d writing the output file\n", errorCode);
1214            exit(errorCode);
1215        }
1216
1217        if(dataLength!=indexes[UCASE_IX_LENGTH]) {
1218            fprintf(stderr, "gencase: data length %ld != calculated size %d\n",
1219                dataLength, (int)indexes[UCASE_IX_LENGTH]);
1220            exit(U_INTERNAL_PROGRAM_ERROR);
1221        }
1222    }
1223
1224    utrie_close(pTrie);
1225}
1226
1227/*
1228 * Hey, Emacs, please set the following:
1229 *
1230 * Local Variables:
1231 * indent-tabs-mode: nil
1232 * End:
1233 *
1234 */
1235