1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2008, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  store.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004aug28
14*   created by: Markus W. Scherer
15*
16*   Store Unicode case mapping properties efficiently for
17*   random access.
18*/
19
20#include <stdio.h>
21#include <stdlib.h>
22#include "unicode/utypes.h"
23#include "unicode/uchar.h"
24#include "unicode/ustring.h"
25#include "cmemory.h"
26#include "cstring.h"
27#include "filestrm.h"
28#include "utrie.h"
29#include "utrie2.h"
30#include "uarrsort.h"
31#include "unicode/udata.h"
32#include "unewdata.h"
33#include "propsvec.h"
34#include "writesrc.h"
35#include "gencase.h"
36
37#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
38
39/* Unicode case mapping properties file format ---------------------------------
40
41The file format prepared and written here contains several data
42structures that store indexes or data.
43
44Before the data contents described below, there are the headers required by
45the udata API for loading ICU data. Especially, a UDataInfo structure
46precedes the actual data. It contains platform properties values and the
47file format version.
48
49The following is a description of format version 1.1 .
50
51Format version 1.1 adds data for case closure.
52
53The file contains the following structures:
54
55    const int32_t indexes[i0] with values i0, i1, ...:
56    (see UCASE_IX_... constants for names of indexes)
57
58    i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
59    i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
60    i2 trieSize; -- size in bytes of the case mapping properties trie
61    i3 exceptionsLength; -- length in uint16_t of the exceptions array
62    i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
63
64    i5..i14 reservedIndexes; -- reserved values; 0 for now
65
66    i15 maxFullLength; -- maximum length of a full case mapping/folding string
67
68
69    Serialized trie, see utrie.h;
70
71    const uint16_t exceptions[exceptionsLength];
72
73    const UChar unfold[unfoldLength];
74
75
76Trie data word:
77Bits
78if(exception) {
79    15..4   unsigned exception index
80} else {
81    if(not uncased) {
82        15..6   signed delta to simple case mapping code point
83                (add delta to input code point)
84    } else {
85            6   the code point is case-ignorable
86                (U+0307 is also case-ignorable but has an exception)
87    }
88     5..4   0 normal character with cc=0
89            1 soft-dotted character
90            2 cc=230
91            3 other cc
92}
93    3   exception
94    2   case sensitive
95 1..0   0 uncased
96        1 lowercase
97        2 uppercase
98        3 titlecase
99
100
101Exceptions:
102A sub-array of the exceptions array is indexed by the exception index in a
103trie word.
104The sub-array consists of the following fields:
105    uint16_t excWord;
106    uint16_t optional values [];
107    UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
108
109excWord: (see UCASE_EXC_...)
110Bits
111    15  conditional case folding
112    14  conditional special casing
11313..12  same as non-exception trie data bits 5..4
114        moved here because the exception index needs more bits than the delta
115        0 normal character with cc=0
116        1 soft-dotted character
117        2 cc=230
118        3 other cc
11911.. 9  reserved
120     8  if set, then for each optional-value slot there are 2 uint16_t values
121        (high and low parts of 32-bit values)
122        instead of single ones
123 7.. 0  bits for which optional value is present
124
125Optional-value slots:
1260   lowercase mapping (code point)
1271   case folding (code point)
1282   uppercase mapping (code point)
1293   titlecase mapping (code point)
1304   reserved
1315   reserved
1326   closure mappings (new in format version 1.1)
1337   there is at least one full (string) case mapping
134    the length of each is encoded in a nibble of this optional value,
135    and the strings follow this optional value in the same order:
136    lower/fold/upper/title
137
138The optional closure mappings value is used as follows:
139Bits 0..3 contain the length of a string of code points for case closure.
140The string immediately follows the full case mappings, or the closure value
141slot if there are no full case mappings.
142Bits 4..15 are reserved and could be used in the future to indicate the
143number of strings for case closure.
144Complete case closure for a code point is given by the union of all simple
145and full case mappings and foldings, plus the case closure code points
146(and potentially, in the future, case closure strings).
147
148For space saving, some values are not stored. Lookups are as follows:
149- If special casing is conditional, then no full lower/upper/title mapping
150  strings are stored.
151- If case folding is conditional, then no simple or full case foldings are
152  stored.
153- Fall back in this order:
154    full (string) mapping -- if full mappings are used
155    simple (code point) mapping of the same type
156    simple fold->simple lower
157    simple title->simple upper
158    finally, the original code point (no mapping)
159
160This fallback order is strict:
161In particular, the fallback from full case folding is to simple case folding,
162not to full lowercase mapping.
163
164Reverse case folding data ("unfold") array: (new in format version 1.1)
165
166This array stores some miscellaneous values followed by a table. The data maps
167back from multi-character strings to their original code points, for use
168in case closure.
169
170The table contains two columns of strings.
171The string in the first column is the case folding of each of the code points
172in the second column. The strings are terminated with NUL or by the end of the
173column, whichever comes first.
174
175The miscellaneous data takes up one pseudo-row and includes:
176- number of rows
177- number of UChars per row
178- number of UChars in the left (folding string) column
179
180The table is sorted by its first column. Values in the first column are unique.
181
182----------------------------------------------------------------------------- */
183
184/* UDataInfo cf. udata.h */
185static UDataInfo dataInfo={
186    sizeof(UDataInfo),
187    0,
188
189    U_IS_BIG_ENDIAN,
190    U_CHARSET_FAMILY,
191    U_SIZEOF_UCHAR,
192    0,
193
194    /* dataFormat="cAsE" */
195    { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
196    { 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
197    { 4, 0, 1, 0 }                              /* dataVersion */
198};
199
200enum {
201    /* maximum number of exceptions expected */
202    MAX_EXC_COUNT=1000
203};
204
205/* exceptions values */
206static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100];
207static uint16_t exceptionsTop=0;
208static Props excProps[MAX_EXC_COUNT];
209static uint16_t exceptionsCount=0;
210
211/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
212static int32_t maxFullLength=U16_MAX_LENGTH;
213
214/* reverse case folding ("unfold") data */
215static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={
216    0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0
217};
218static uint16_t unfoldRows=0;
219static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH;
220
221/* Unicode versions --------------------------------------------------------- */
222
223static const UVersionInfo
224unicodeVersions[]={
225    { 1, 0, 0, 0 },
226    { 1, 1, 0, 0 },
227    { 2, 0, 0, 0 },
228    { 3, 0, 0, 0 },
229    { 3, 1, 0, 0 },
230    { 3, 2, 0, 0 },
231    { 4, 0, 0, 0 },
232    { 4, 0, 1, 0 },
233    { 4, 1, 0, 0 }
234};
235
236int32_t ucdVersion=UNI_4_1;
237
238static int32_t
239findUnicodeVersion(const UVersionInfo version) {
240    int32_t i;
241
242    for(i=0; /* while(version>unicodeVersions[i]) {} */
243        i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
244        ++i) {}
245    if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
246        --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
247    }
248    return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
249}
250
251extern void
252setUnicodeVersion(const char *v) {
253    UVersionInfo version;
254    u_versionFromString(version, v);
255    uprv_memcpy(dataInfo.dataVersion, version, 4);
256    ucdVersion=findUnicodeVersion(version);
257}
258
259/* -------------------------------------------------------------------------- */
260
261static void
262addUnfolding(UChar32 c, const UChar *s, int32_t length) {
263    int32_t i;
264
265    if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
266        fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
267                (long)length, UGENCASE_UNFOLD_STRING_WIDTH);
268        exit(U_INTERNAL_PROGRAM_ERROR);
269    }
270    if(unfoldTop >= (LENGTHOF(unfold) - UGENCASE_UNFOLD_STRING_WIDTH)) {
271        fprintf(stderr, "gencase error: too many multi-character case foldings\n");
272        exit(U_BUFFER_OVERFLOW_ERROR);
273    }
274    u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH);
275    u_memcpy(unfold+unfoldTop, s, length);
276
277    i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH;
278    U16_APPEND_UNSAFE(unfold, i, c);
279
280    ++unfoldRows;
281    unfoldTop+=UGENCASE_UNFOLD_WIDTH;
282}
283
284/* store a character's properties ------------------------------------------- */
285
286extern void
287setProps(Props *p) {
288    UErrorCode errorCode;
289    uint32_t value, oldValue;
290    int32_t delta;
291    UBool isCaseIgnorable;
292
293    /* get the non-UnicodeData.txt properties */
294    value=oldValue=upvec_getValue(pv, p->code, 0);
295
296    /* default: map to self */
297    delta=0;
298
299    if(p->gc==U_TITLECASE_LETTER) {
300        /* the Titlecase property is read late, from UnicodeData.txt */
301        value|=UCASE_TITLE;
302    }
303
304    if(p->upperCase!=0) {
305        /* uppercase mapping as delta if the character is lowercase */
306        if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
307            delta=p->upperCase-p->code;
308        } else {
309            value|=UCASE_EXCEPTION;
310        }
311    }
312    if(p->lowerCase!=0) {
313        /* lowercase mapping as delta if the character is uppercase or titlecase */
314        if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
315            delta=p->lowerCase-p->code;
316        } else {
317            value|=UCASE_EXCEPTION;
318        }
319    }
320    if(p->upperCase!=p->titleCase) {
321        value|=UCASE_EXCEPTION;
322    }
323    if(p->closure[0]!=0) {
324        value|=UCASE_EXCEPTION;
325    }
326    if(p->specialCasing!=NULL) {
327        value|=UCASE_EXCEPTION;
328    }
329    if(p->caseFolding!=NULL) {
330        value|=UCASE_EXCEPTION;
331    }
332
333    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
334        value|=UCASE_EXCEPTION;
335    }
336
337    if(p->cc!=0) {
338        if(value&UCASE_DOT_MASK) {
339            fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
340            exit(U_INTERNAL_PROGRAM_ERROR);
341        }
342        if(p->cc==230) {
343            value|=UCASE_ABOVE;
344        } else {
345            value|=UCASE_OTHER_ACCENT;
346        }
347    }
348
349    /* encode case-ignorable as delta==1 on uncased characters */
350    isCaseIgnorable=FALSE;
351    if((value&UCASE_TYPE_MASK)==UCASE_NONE) {
352        if(ucdVersion>=UNI_4_1) {
353            /*
354             * Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk
355             * Unicode 5.1 and up: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk
356             *   The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet.
357             */
358            if(
359                (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
360                (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0
361            ) {
362                isCaseIgnorable=TRUE;
363            }
364        } else {
365            /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
366            if(
367                (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
368                p->code==0x27 || p->code==0xad || p->code==0x2019
369            ) {
370                isCaseIgnorable=TRUE;
371            }
372        }
373    }
374
375    if(isCaseIgnorable && p->code!=0x307) {
376        /*
377         * We use one of the delta/exception bits, which works because we only
378         * store the case-ignorable flag for uncased characters.
379         * There is no delta for uncased characters (see checks above).
380         * If there is an exception for an uncased, case-ignorable character
381         * (although there should not be any case mappings if it's uncased)
382         * then we have a problem.
383         * There is one character which is case-ignorable but has an exception:
384         * U+0307 is uncased, Mn, has conditional special casing and
385         * is therefore handled in code instead.
386         */
387        if(value&UCASE_EXCEPTION) {
388            fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
389                            (unsigned long)p->code);
390            exit(U_INTERNAL_PROGRAM_ERROR);
391        }
392
393        delta=1;
394    }
395
396    /* handle exceptions */
397    if(value&UCASE_EXCEPTION) {
398        /* simply store exceptions for later processing and encoding */
399        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
400        uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
401        if(++exceptionsCount==MAX_EXC_COUNT) {
402            fprintf(stderr, "gencase: too many exceptions\n");
403            exit(U_INDEX_OUTOFBOUNDS_ERROR);
404        }
405    } else {
406        /* store the simple case mapping delta */
407        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
408    }
409
410    errorCode=U_ZERO_ERROR;
411    if(value!=oldValue) {
412        upvec_setValue(pv, p->code, p->code, 0, value, 0xffffffff, &errorCode);
413        if(U_FAILURE(errorCode)) {
414            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
415                            u_errorName(errorCode));
416            exit(errorCode);
417        }
418    }
419
420    /* add the multi-character case folding to the "unfold" data */
421    if(p->caseFolding!=NULL) {
422        int32_t length=p->caseFolding->full[0];
423        if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
424            addUnfolding(p->code, p->caseFolding->full+1, length);
425        }
426    }
427}
428
429extern void
430addCaseSensitive(UChar32 first, UChar32 last) {
431    UErrorCode errorCode=U_ZERO_ERROR;
432    upvec_setValue(pv, first, last, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode);
433    if(U_FAILURE(errorCode)) {
434        fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
435                        u_errorName(errorCode));
436        exit(errorCode);
437    }
438}
439
440/* finalize reverse case folding ("unfold") data ---------------------------- */
441
442static int32_t U_CALLCONV
443compareUnfold(const void *context, const void *left, const void *right) {
444    return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH);
445}
446
447static void
448makeUnfoldData() {
449    static const UChar
450        iDot[2]=        { 0x69, 0x307 };
451
452    UChar *p, *q;
453    int32_t i, j, k;
454    UErrorCode errorCode;
455
456    /*
457     * add a case folding that we missed because it's conditional:
458     * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
459     */
460    addUnfolding(0x130, iDot, 2);
461
462    /* sort the data */
463    errorCode=U_ZERO_ERROR;
464    uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
465                   compareUnfold, NULL, FALSE, &errorCode);
466
467    /* make unique-string rows by merging adjacent ones' code point columns */
468
469    /* make p point to row i-1 */
470    p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH;
471
472    for(i=1; i<unfoldRows;) {
473        if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
474            /* concatenate code point columns */
475            q=p+UGENCASE_UNFOLD_STRING_WIDTH;
476            for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
477            for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
478                q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
479            }
480            if(j>UGENCASE_UNFOLD_CP_WIDTH) {
481                fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
482                        (long)j, UGENCASE_UNFOLD_CP_WIDTH);
483                exit(U_BUFFER_OVERFLOW_ERROR);
484            }
485
486            /* move following rows up one */
487            --unfoldRows;
488            unfoldTop-=UGENCASE_UNFOLD_WIDTH;
489            u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
490        } else {
491            p+=UGENCASE_UNFOLD_WIDTH;
492            ++i;
493        }
494    }
495
496    unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
497
498    if(beVerbose) {
499        puts("unfold data:");
500
501        p=(UChar *)unfold;
502        for(i=0; i<unfoldRows; ++i) {
503            p+=UGENCASE_UNFOLD_WIDTH;
504            printf("[%2d] %04x %04x %04x <- %04x %04x\n",
505                   (int)i, p[0], p[1], p[2], p[3], p[4]);
506        }
507    }
508}
509
510/* case closure ------------------------------------------------------------- */
511
512static void
513addClosureMapping(UChar32 src, UChar32 dest) {
514    uint32_t value;
515
516    if(beVerbose) {
517        printf("add closure mapping U+%04lx->U+%04lx\n",
518                (unsigned long)src, (unsigned long)dest);
519    }
520
521    value=upvec_getValue(pv, src, 0);
522    if(value&UCASE_EXCEPTION) {
523        Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
524        int32_t i;
525
526        /* append dest to src's closure array */
527        for(i=0;; ++i) {
528            if(i==LENGTHOF(p->closure)) {
529                fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n",
530                                (unsigned long)src, (unsigned long)dest);
531                exit(U_BUFFER_OVERFLOW_ERROR);
532            } else if(p->closure[i]==dest) {
533                break; /* do not store duplicates */
534            } else if(p->closure[i]==0) {
535                p->closure[i]=dest;
536                break;
537            }
538        }
539    } else {
540        Props p2={ 0 };
541        UChar32 next;
542        UErrorCode errorCode;
543
544        /*
545         * decode value into p2 (enough for makeException() to work properly),
546         * add the closure mapping,
547         * and set the new exception for src
548         */
549        p2.code=src;
550        p2.closure[0]=dest;
551
552        if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
553            /* one simple case mapping, don't care which one */
554            next=src+((int16_t)value>>UCASE_DELTA_SHIFT);
555            if(next!=src) {
556                if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
557                    p2.upperCase=p2.titleCase=next;
558                } else {
559                    p2.lowerCase=next;
560                }
561            }
562        } else if(value&UCASE_DELTA_MASK) {
563            fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
564                            (unsigned long)src);
565            exit(U_INTERNAL_PROGRAM_ERROR);
566        }
567
568        value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */
569        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
570        value|=UCASE_EXCEPTION;
571        uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2));
572        if(++exceptionsCount==MAX_EXC_COUNT) {
573            fprintf(stderr, "gencase: too many exceptions\n");
574            exit(U_INDEX_OUTOFBOUNDS_ERROR);
575        }
576
577        errorCode=U_ZERO_ERROR;
578        upvec_setValue(pv, src, src, 0, value, 0xffffffff, &errorCode);
579        if(U_FAILURE(errorCode)) {
580            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
581                            u_errorName(errorCode));
582            exit(errorCode);
583        }
584    }
585}
586
587/*
588 * Find missing case mapping relationships and add mappings for case closure.
589 * This function starts from an "original" code point and recursively
590 * finds its case mappings and the case mappings of where it maps to.
591 *
592 * The recursion depth is capped at 3 nested calls of this function.
593 * In each call, the current code point is c, and the function enumerates
594 * all of c's simple (single-code point) case mappings.
595 * prev is the code point that case-mapped to c.
596 * prev2 is the code point that case-mapped to prev.
597 *
598 * The initial function call has prev2<0, prev<0, and c==orig
599 * (marking no code points).
600 * It enumerates c's case mappings and recurses without further action.
601 *
602 * The second-level function call has prev2<0, prev==orig, and c is
603 * the destination code point of one of prev's case mappings.
604 * The function checks if any of c's case mappings go back to orig
605 * and adds a closure mapping if not.
606 * In other words, it turns a case mapping relationship of
607 *   orig->c
608 * into
609 *   orig<->c
610 *
611 * The third-level function call has prev2==orig, prev>=0, and c is
612 * the destination code point of one of prev's case mappings.
613 * (And prev is the destination of one of prev2's case mappings.)
614 * The function checks if any of c's case mappings go back to orig
615 * and adds a closure mapping if not.
616 * In other words, it turns case mapping relationships of
617 *   orig->prev->c or orig->prev<->c
618 * into
619 *   orig->prev->c->orig or orig->prev<->c->orig
620 * etc.
621 * (Graphically, this closes a triangle.)
622 *
623 * With repeated application on all code points until no more closure mappings
624 * are added, all case equivalence groups get complete mappings.
625 * That is, in each group of code points with case relationships
626 * each code point will in the end have some mapping to each other
627 * code point in the group.
628 *
629 * @return TRUE if a closure mapping was added
630 */
631static UBool
632addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) {
633    UChar32 next;
634    UBool someMappingsAdded=FALSE;
635
636    if(c!=orig) {
637        /* get the properties for c */
638        value=upvec_getValue(pv, c, 0);
639    }
640    /* else if c==orig then c's value was passed in */
641
642    if(value&UCASE_EXCEPTION) {
643        UChar32 set[32];
644        int32_t i, count=0;
645
646        Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
647
648        /*
649         * marker for whether any of c's mappings goes to orig
650         * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
651         */
652        UBool mapsToOrig=(UBool)(c==orig);
653
654        /* collect c's case mapping destinations in set[] */
655        if((next=p->upperCase)!=0 && next!=c) {
656            set[count++]=next;
657        }
658        if((next=p->lowerCase)!=0 && next!=c) {
659            set[count++]=next;
660        }
661        if(p->upperCase!=(next=p->titleCase) && next!=c) {
662            set[count++]=next;
663        }
664        if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) {
665            set[count++]=next;
666        }
667
668        /* append c's current closure mappings to set[] */
669        for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) {
670            set[count++]=next;
671        }
672
673        /* process all code points to which c case-maps */
674        for(i=0; i<count; ++i) {
675            next=set[i]; /* next!=c */
676
677            if(next==orig) {
678                mapsToOrig=TRUE; /* remember that we map to orig */
679            } else if(prev2<0 && next!=prev) {
680                /*
681                 * recurse unless
682                 * we have reached maximum depth (prev2>=0) or
683                 * this is a mapping to one of the previous code points (orig, prev, c)
684                 */
685                someMappingsAdded|=addClosure(orig, prev, c, next, 0);
686            }
687        }
688
689        if(!mapsToOrig) {
690            addClosureMapping(c, orig);
691            return TRUE;
692        }
693    } else {
694        if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
695            /* one simple case mapping, don't care which one */
696            next=c+((int16_t)value>>UCASE_DELTA_SHIFT);
697            if(next!=c) {
698                /*
699                 * recurse unless
700                 * we have reached maximum depth (prev2>=0) or
701                 * this is a mapping to one of the previous code points (orig, prev, c)
702                 */
703                if(prev2<0 && next!=orig && next!=prev) {
704                    someMappingsAdded|=addClosure(orig, prev, c, next, 0);
705                }
706
707                if(c!=orig && next!=orig) {
708                    /* c does not map to orig, add a closure mapping c->orig */
709                    addClosureMapping(c, orig);
710                    return TRUE;
711                }
712            }
713        }
714    }
715
716    return someMappingsAdded;
717}
718
719extern void
720makeCaseClosure() {
721    UChar *p;
722    uint32_t *row;
723    uint32_t value;
724    UChar32 start, end, c, c2;
725    int32_t i, j;
726    UBool someMappingsAdded;
727
728    /*
729     * finalize the "unfold" data because we need to use it to add closure mappings
730     * for situations like FB05->"st"<-FB06
731     * where we would otherwise miss the FB05<->FB06 relationship
732     */
733    makeUnfoldData();
734
735    /* use the "unfold" data to add mappings */
736
737    /* p always points to the code points; this loop ignores the strings completely */
738    p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
739
740    for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
741        j=0;
742        U16_NEXT_UNSAFE(p, j, c);
743        while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
744            U16_NEXT_UNSAFE(p, j, c2);
745            addClosure(c, U_SENTINEL, c, c2, 0);
746        }
747    }
748
749    if(beVerbose) {
750        puts("---- ---- ---- ---- (done with closures from unfolding)");
751    }
752
753    /* add further closure mappings from analyzing simple mappings */
754    do {
755        someMappingsAdded=FALSE;
756
757        i=0;
758        while((row=upvec_getRow(pv, i, &start, &end))!=NULL && start<UPVEC_FIRST_SPECIAL_CP) {
759            value=*row;
760            if(value!=0) {
761                while(start<=end) {
762                    if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) {
763                        someMappingsAdded=TRUE;
764
765                        /*
766                         * stop this loop because pv was changed and row is not valid any more
767                         * skip all rows below the current start
768                         */
769                        while((row=upvec_getRow(pv, i, NULL, &end))!=NULL && start>end) {
770                            ++i;
771                        }
772                        row=NULL; /* signal to continue with outer loop, without further ++i */
773                        break;
774                    }
775                    ++start;
776                }
777                if(row==NULL) {
778                    continue; /* see row=NULL above */
779                }
780            }
781            ++i;
782        }
783
784        if(beVerbose && someMappingsAdded) {
785            puts("---- ---- ---- ----");
786        }
787    } while(someMappingsAdded);
788}
789
790/* exceptions --------------------------------------------------------------- */
791
792/* get the string length from zero-terminated code points in a limited-length array */
793static int32_t
794getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) {
795    int32_t i, length;
796
797    for(i=length=0; i<maxLength && s[i]!=0; ++i) {
798        length+=U16_LENGTH(s[i]);
799    }
800    return length;
801}
802
803static UBool
804fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
805    int32_t i, length;
806    UChar32 full;
807
808    length=*s++;
809    if(length==0 || length>U16_MAX_LENGTH) {
810        return FALSE;
811    }
812    i=0;
813    U16_NEXT(s, i, length, full);
814
815    if(simple==0) {
816        simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
817    }
818    return (UBool)(i==length && full==simple);
819}
820
821static uint16_t
822makeException(uint32_t value, Props *p) {
823    uint32_t slots[8];
824    uint32_t slotBits;
825    uint16_t excWord, excIndex, excTop, i, count, length, fullLengths;
826    UBool doubleSlots;
827
828    /* excIndex will be returned for storing in the trie word */
829    excIndex=exceptionsTop;
830    if(excIndex>=UCASE_MAX_EXCEPTIONS) {
831        fprintf(stderr, "gencase error: too many exceptions words\n");
832        exit(U_BUFFER_OVERFLOW_ERROR);
833    }
834
835    excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */
836
837    /* copy and shift the soft-dotted bits */
838    excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;
839
840    /* update maxFullLength */
841    if(p->specialCasing!=NULL) {
842        length=p->specialCasing->lowerCase[0];
843        if(length>maxFullLength) {
844            maxFullLength=length;
845        }
846        length=p->specialCasing->upperCase[0];
847        if(length>maxFullLength) {
848            maxFullLength=length;
849        }
850        length=p->specialCasing->titleCase[0];
851        if(length>maxFullLength) {
852            maxFullLength=length;
853        }
854    }
855    if(p->caseFolding!=NULL) {
856        length=p->caseFolding->full[0];
857        if(length>maxFullLength) {
858            maxFullLength=length;
859        }
860    }
861
862    /* set the bits for conditional mappings */
863    if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
864        excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
865        p->specialCasing=NULL;
866    }
867    if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
868        excWord|=UCASE_EXC_CONDITIONAL_FOLD;
869        p->caseFolding=NULL;
870    }
871
872    /*
873     * Note:
874     * UCD stores no simple mappings when they are the same as the code point itself.
875     * SpecialCasing and CaseFolding do store simple mappings even if they are
876     * the same as the code point itself.
877     * Comparisons between simple regular mappings and simple special/folding
878     * mappings need to compensate for the difference by comparing with the
879     * original code point if a simple UCD mapping is missing (0).
880     */
881
882    /* remove redundant data */
883    if(p->specialCasing!=NULL) {
884        /* do not store full mappings if they are the same as the simple ones */
885        if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) {
886            p->specialCasing->lowerCase[0]=0;
887        }
888        if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) {
889            p->specialCasing->upperCase[0]=0;
890        }
891        if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) {
892            p->specialCasing->titleCase[0]=0;
893        }
894    }
895    if( p->caseFolding!=NULL &&
896        fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code)
897    ) {
898        p->caseFolding->full[0]=0;
899    }
900
901    /* write the optional slots */
902    slotBits=0;
903    count=0;
904
905    if(p->lowerCase!=0) {
906        slots[count]=(uint32_t)p->lowerCase;
907        slotBits|=slots[count];
908        ++count;
909        excWord|=U_MASK(UCASE_EXC_LOWER);
910    }
911    if( p->caseFolding!=NULL &&
912        p->caseFolding->simple!=0 &&
913        (p->lowerCase!=0 ?
914            p->caseFolding->simple!=p->lowerCase :
915            p->caseFolding->simple!=p->code)
916    ) {
917        slots[count]=(uint32_t)p->caseFolding->simple;
918        slotBits|=slots[count];
919        ++count;
920        excWord|=U_MASK(UCASE_EXC_FOLD);
921    }
922    if(p->upperCase!=0) {
923        slots[count]=(uint32_t)p->upperCase;
924        slotBits|=slots[count];
925        ++count;
926        excWord|=U_MASK(UCASE_EXC_UPPER);
927    }
928    if(p->upperCase!=p->titleCase) {
929        if(p->titleCase!=0) {
930            slots[count]=(uint32_t)p->titleCase;
931        } else {
932            slots[count]=(uint32_t)p->code;
933        }
934        slotBits|=slots[count];
935        ++count;
936        excWord|=U_MASK(UCASE_EXC_TITLE);
937    }
938
939    /* length of case closure */
940    if(p->closure[0]!=0) {
941        length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure));
942        slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
943        slotBits|=slots[count];
944        ++count;
945        excWord|=U_MASK(UCASE_EXC_CLOSURE);
946    }
947
948    /* lengths of full case mapping strings, stored in the last slot */
949    fullLengths=0;
950    if(p->specialCasing!=NULL) {
951        fullLengths=p->specialCasing->lowerCase[0];
952        fullLengths|=p->specialCasing->upperCase[0]<<8;
953        fullLengths|=p->specialCasing->titleCase[0]<<12;
954    }
955    if(p->caseFolding!=NULL) {
956        fullLengths|=p->caseFolding->full[0]<<4;
957    }
958    if(fullLengths!=0) {
959        slots[count]=fullLengths;
960        slotBits|=slots[count];
961        ++count;
962        excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
963    }
964
965    /* write slots */
966    doubleSlots=(UBool)(slotBits>0xffff);
967    if(!doubleSlots) {
968        for(i=0; i<count; ++i) {
969            exceptions[excTop++]=(uint16_t)slots[i];
970        }
971    } else {
972        excWord|=UCASE_EXC_DOUBLE_SLOTS;
973        for(i=0; i<count; ++i) {
974            exceptions[excTop++]=(uint16_t)(slots[i]>>16);
975            exceptions[excTop++]=(uint16_t)slots[i];
976        }
977    }
978
979    /* write the full case mapping strings */
980    if(p->specialCasing!=NULL) {
981        length=(uint16_t)p->specialCasing->lowerCase[0];
982        u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length);
983        excTop+=length;
984    }
985    if(p->caseFolding!=NULL) {
986        length=(uint16_t)p->caseFolding->full[0];
987        u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length);
988        excTop+=length;
989    }
990    if(p->specialCasing!=NULL) {
991        length=(uint16_t)p->specialCasing->upperCase[0];
992        u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length);
993        excTop+=length;
994
995        length=(uint16_t)p->specialCasing->titleCase[0];
996        u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length);
997        excTop+=length;
998    }
999
1000    /* write the closure data */
1001    if(p->closure[0]!=0) {
1002        UChar32 c;
1003
1004        for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) {
1005            U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c);
1006        }
1007    }
1008
1009    exceptionsTop=excTop;
1010
1011    /* write the main exceptions word */
1012    exceptions[excIndex]=excWord;
1013
1014    return excIndex;
1015}
1016
1017extern void
1018makeExceptions() {
1019    uint32_t *row;
1020    uint32_t value;
1021    int32_t i;
1022    uint16_t excIndex;
1023
1024    i=0;
1025    while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) {
1026        value=*row;
1027        if(value&UCASE_EXCEPTION) {
1028            excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT));
1029            *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT);
1030        }
1031        ++i;
1032    }
1033}
1034
1035/* generate output data ----------------------------------------------------- */
1036
1037extern void
1038generateData(const char *dataDir, UBool csource) {
1039    static int32_t indexes[UCASE_IX_TOP]={
1040        UCASE_IX_TOP
1041    };
1042    static uint8_t trieBlock[40000];
1043
1044    const uint32_t *row;
1045    UChar32 start, end;
1046    int32_t i;
1047
1048    UNewDataMemory *pData;
1049    UNewTrie *pTrie;
1050    UErrorCode errorCode=U_ZERO_ERROR;
1051    int32_t trieSize;
1052    long dataLength;
1053
1054    pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
1055    if(pTrie==NULL) {
1056        fprintf(stderr, "gencase error: unable to create a UNewTrie\n");
1057        exit(U_MEMORY_ALLOCATION_ERROR);
1058    }
1059
1060    for(i=0; (row=upvec_getRow(pv, i, &start, &end))!=NULL; ++i) {
1061        if(start<UPVEC_FIRST_SPECIAL_CP && !utrie_setRange32(pTrie, start, end+1, *row, TRUE)) {
1062            fprintf(stderr, "gencase error: unable to set trie value (overflow)\n");
1063            exit(U_BUFFER_OVERFLOW_ERROR);
1064        }
1065    }
1066
1067    trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
1068    if(U_FAILURE(errorCode)) {
1069        fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
1070        exit(errorCode);
1071    }
1072
1073    indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
1074    indexes[UCASE_IX_TRIE_SIZE]=trieSize;
1075    indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop;
1076    indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop;
1077
1078    indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
1079
1080    if(beVerbose) {
1081        printf("trie size in bytes:                    %5d\n", (int)trieSize);
1082        printf("number of code points with exceptions: %5d\n", exceptionsCount);
1083        printf("size in bytes of exceptions:           %5d\n", 2*exceptionsTop);
1084        printf("size in bytes of reverse foldings:     %5d\n", 2*unfoldTop);
1085        printf("data size:                             %5d\n", (int)indexes[UCASE_IX_LENGTH]);
1086    }
1087
1088    if(csource) {
1089        /* write .c file for hardcoded data */
1090        UTrie trie={ NULL };
1091        UTrie2 *trie2;
1092        FILE *f;
1093
1094        utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
1095        if(U_FAILURE(errorCode)) {
1096            fprintf(
1097                stderr,
1098                "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n",
1099                u_errorName(errorCode));
1100            exit(errorCode);
1101        }
1102
1103        /* use UTrie2 */
1104        dataInfo.formatVersion[0]=2;
1105        dataInfo.formatVersion[2]=0;
1106        dataInfo.formatVersion[3]=0;
1107        trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
1108        if(U_FAILURE(errorCode)) {
1109            fprintf(
1110                stderr,
1111                "gencase error: utrie2_fromUTrie() failed - %s\n",
1112                u_errorName(errorCode));
1113            exit(errorCode);
1114        }
1115        {
1116            /* delete lead surrogate code unit values */
1117            UChar lead;
1118            trie2=utrie2_cloneAsThawed(trie2, &errorCode);
1119            for(lead=0xd800; lead<0xdc00; ++lead) {
1120                utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
1121            }
1122            utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
1123            if(U_FAILURE(errorCode)) {
1124                fprintf(
1125                    stderr,
1126                    "gencase error: deleting lead surrogate code unit values failed - %s\n",
1127                    u_errorName(errorCode));
1128                exit(errorCode);
1129            }
1130        }
1131
1132        f=usrc_create(dataDir, "ucase_props_data.c");
1133        if(f!=NULL) {
1134            usrc_writeArray(f,
1135                "static const UVersionInfo ucase_props_dataVersion={",
1136                dataInfo.dataVersion, 8, 4,
1137                "};\n\n");
1138            usrc_writeArray(f,
1139                "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
1140                indexes, 32, UCASE_IX_TOP,
1141                "};\n\n");
1142            usrc_writeUTrie2Arrays(f,
1143                "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL,
1144                trie2,
1145                "\n};\n\n");
1146            usrc_writeArray(f,
1147                "static const uint16_t ucase_props_exceptions[%ld]={\n",
1148                exceptions, 16, exceptionsTop,
1149                "\n};\n\n");
1150            usrc_writeArray(f,
1151                "static const uint16_t ucase_props_unfold[%ld]={\n",
1152                unfold, 16, unfoldTop,
1153                "\n};\n\n");
1154            fputs(
1155                "static const UCaseProps ucase_props_singleton={\n"
1156                "  NULL,\n"
1157                "  ucase_props_indexes,\n"
1158                "  ucase_props_exceptions,\n"
1159                "  ucase_props_unfold,\n",
1160                f);
1161            usrc_writeUTrie2Struct(f,
1162                "  {\n",
1163                trie2, "ucase_props_trieIndex", NULL,
1164                "  },\n");
1165            usrc_writeArray(f, "  { ", dataInfo.formatVersion, 8, 4, " }\n");
1166            fputs("};\n", f);
1167            fclose(f);
1168        }
1169        utrie2_close(trie2);
1170    } else {
1171        /* write the data */
1172        pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo,
1173                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1174        if(U_FAILURE(errorCode)) {
1175            fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode));
1176            exit(errorCode);
1177        }
1178
1179        udata_writeBlock(pData, indexes, sizeof(indexes));
1180        udata_writeBlock(pData, trieBlock, trieSize);
1181        udata_writeBlock(pData, exceptions, 2*exceptionsTop);
1182        udata_writeBlock(pData, unfold, 2*unfoldTop);
1183
1184        /* finish up */
1185        dataLength=udata_finish(pData, &errorCode);
1186        if(U_FAILURE(errorCode)) {
1187            fprintf(stderr, "gencase: error %d writing the output file\n", errorCode);
1188            exit(errorCode);
1189        }
1190
1191        if(dataLength!=indexes[UCASE_IX_LENGTH]) {
1192            fprintf(stderr, "gencase: data length %ld != calculated size %d\n",
1193                dataLength, (int)indexes[UCASE_IX_LENGTH]);
1194            exit(U_INTERNAL_PROGRAM_ERROR);
1195        }
1196    }
1197
1198    utrie_close(pTrie);
1199}
1200
1201/*
1202 * Hey, Emacs, please set the following:
1203 *
1204 * Local Variables:
1205 * indent-tabs-mode: nil
1206 * End:
1207 *
1208 */
1209