1/*
2******************************************************************************
3*
4*   Copyright (C) 1999-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  unames.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 1999oct04
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/putil.h"
19#include "unicode/uchar.h"
20#include "unicode/udata.h"
21#include "ustr_imp.h"
22#include "umutex.h"
23#include "cmemory.h"
24#include "cstring.h"
25#include "ucln_cmn.h"
26#include "udataswp.h"
27#include "uprops.h"
28
29/* prototypes ------------------------------------------------------------- */
30
/* Element count of a real array; do not use with a pointer or an array parameter. */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))
32
/* Item name and type passed to udata_openChoice() in isDataLoaded(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";
35
/*
 * Names are stored in groups of LINES_PER_GROUP (32) consecutive code points.
 * code>>GROUP_SHIFT identifies the group (compared with a group's GROUP_MSB field),
 * code&GROUP_MASK is the line number of the code point inside its group.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
39
40/*
41 * This struct was replaced by explicitly accessing equivalent
42 * fields from triples of uint16_t.
43 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
44 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
45 * would advance by 6 bytes (3 uint16_t).
46 *
47 * We can't just change the data structure because it's loaded from a data file,
48 * and we don't want to make it less compact, so we changed the access code.
49 *
50 * For details see ICU tickets 6331 and 6008.
51typedef struct {
52    uint16_t groupMSB,
53             offsetHigh, offsetLow; / * avoid padding * /
54} Group;
55 */
/*
 * Indexes of the fields inside one group, which is stored as a triple of
 * uint16_t; GROUP_LENGTH is the number of uint16_t per group.
 */
enum {
    GROUP_MSB,
    GROUP_OFFSET_HIGH,
    GROUP_OFFSET_LOW,
    GROUP_LENGTH
};

/*
 * Get the 32-bit group offset.
 * The two halves are combined as unsigned values so that a high half with
 * bit 15 set does not left-shift into the sign bit of a signed integer
 * (which is undefined behavior in C); only the final result is converted.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(uint32_t)(group)[GROUP_OFFSET_LOW]))

/* Step a group pointer forward/backward by one whole group (GROUP_LENGTH uint16_t). */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
72
/*
 * Header of one algorithmic-name range, as read by getAlgName().
 * The range's variable-length data directly follows the struct:
 * getAlgName() reads it starting at (range+1).
 */
typedef struct {
    /* code point range; getAlgName() uses code-start as the index for type 1 */
    uint32_t start, end;
    /*
     * type 0: name = prefix + hex digits, variant = number of hex digits
     * type 1: name = prefix + factorized elements, variant = number of factors
     */
    uint8_t type, variant;
    /* not read in this part of the file; presumably the size of this range record - TODO confirm */
    uint16_t size;
} AlgorithmicRange;
78
/*
 * Index header at the start of the loaded names data.
 * tokenStringOffset, groupsOffset and groupStringOffset are added to the
 * address of this struct as byte offsets (see expandName(), GET_GROUPS()
 * and expandGroupName()).
 * The token table (a uint16_t count followed by the tokens) is read right
 * after these four fields, at (uint16_t *)names+8.
 * algNamesOffset is not used in this part of the file; presumably it locates
 * the AlgorithmicRange data in the same way - TODO confirm.
 */
typedef struct {
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;
82
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be used with any pointer expression and inside larger expressions
 * (for example GET_GROUPS(p)[0]).
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
93
/*
 * Context for a name search: enumGroupNames() compares each expanded name
 * with otherName and stores the matching code point in code.
 */
typedef struct {
    const char *otherName;
    UChar32 code;
} FindName;

/*
 * Pseudo callback value: when the enumeration functions get this instead of
 * a real UEnumCharNamesFn, they search for a name (context is a FindName *)
 * instead of calling an enumerator.
 */
#define DO_FIND_NAME NULL
100
/*
 * State of the loaded names data, set in isDataLoaded():
 * uCharNamesData is the open UDataMemory, uCharNames is the pointer returned
 * by udata_getMemory() for it, and gLoadErrorCode keeps the error code of a
 * failed load attempt so that it is reported again without reloading.
 */
static UDataMemory *uCharNamesData=NULL;
static UCharNames *uCharNames=NULL;
static UErrorCode gLoadErrorCode=U_ZERO_ERROR;

/*
 * Maximum length of character names (regular & 1.0).
 * Reset to 0 by unames_cleanup().
 */
static int32_t gMaxNameLength=0;

/*
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 * 8 words of 32 bits each; presumably one bit per char value - TODO confirm
 * against the code that fills it.
 */
static uint32_t gNameSet[8]={ 0 };
115
/*
 * Extended character categories used for extended names, numbered right
 * after the regular categories (which end before U_CHAR_CATEGORY_COUNT).
 * Every expansion is parenthesized so that the macros are safe inside
 * larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
121
/*
 * Category names used inside extended names ("<category-XXXX>").
 * Indexed by the value returned from getCharCat(): a u_charType() value,
 * or one of U_NONCHARACTER_CODE_POINT, U_LEAD_SURROGATE, U_TRAIL_SURROGATE
 * for the last three entries.
 */
static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
    "unassigned",
    "uppercase letter",
    "lowercase letter",
    "titlecase letter",
    "modifier letter",
    "other letter",
    "non spacing mark",
    "enclosing mark",
    "combining spacing mark",
    "decimal digit number",
    "letter number",
    "other number",
    "space separator",
    "line separator",
    "paragraph separator",
    "control",
    "format",
    "private use area",
    "surrogate",
    "dash punctuation",
    "start punctuation",
    "end punctuation",
    "connector punctuation",
    "other punctuation",
    "math symbol",
    "currency symbol",
    "modifier symbol",
    "other symbol",
    "initial punctuation",
    "final punctuation",
    "noncharacter",
    "lead surrogate",
    "trail surrogate"
};
157
158/* implementation ----------------------------------------------------------- */
159
160static UBool U_CALLCONV unames_cleanup(void)
161{
162    if(uCharNamesData) {
163        udata_close(uCharNamesData);
164        uCharNamesData = NULL;
165    }
166    if(uCharNames) {
167        uCharNames = NULL;
168    }
169    gMaxNameLength=0;
170    return TRUE;
171}
172
173static UBool U_CALLCONV
174isAcceptable(void *context,
175             const char *type, const char *name,
176             const UDataInfo *pInfo) {
177    return (UBool)(
178        pInfo->size>=20 &&
179        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
180        pInfo->charsetFamily==U_CHARSET_FAMILY &&
181        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
182        pInfo->dataFormat[1]==0x6e &&
183        pInfo->dataFormat[2]==0x61 &&
184        pInfo->dataFormat[3]==0x6d &&
185        pInfo->formatVersion[0]==1);
186}
187
188static UBool
189isDataLoaded(UErrorCode *pErrorCode) {
190    /* load UCharNames from file if necessary */
191    UBool isCached;
192
193    /* do this because double-checked locking is broken */
194    UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
195
196    if(!isCached) {
197        UCharNames *names;
198        UDataMemory *data;
199
200        /* check error code from previous attempt */
201        if(U_FAILURE(gLoadErrorCode)) {
202            *pErrorCode=gLoadErrorCode;
203            return FALSE;
204        }
205
206        /* open the data outside the mutex block */
207        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
208        if(U_FAILURE(*pErrorCode)) {
209            gLoadErrorCode=*pErrorCode;
210            return FALSE;
211        }
212
213        names=(UCharNames *)udata_getMemory(data);
214
215        /* in the mutex block, set the data for this process */
216        {
217            umtx_lock(NULL);
218            if(uCharNames==NULL) {
219                uCharNamesData=data;
220                uCharNames=names;
221                data=NULL;
222                names=NULL;
223                ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
224            }
225            umtx_unlock(NULL);
226        }
227
228        /* if a different thread set it first, then close the extra data */
229        if(data!=NULL) {
230            udata_close(data); /* NULL if it was set correctly */
231        }
232    }
233    return TRUE;
234}
235
/*
 * Append one char to an output buffer with truncation:
 * c is stored and buffer/bufferLength are advanced only while
 * bufferLength>0, but bufferPos is always incremented so that it ends up
 * as the untruncated output length.
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
    if((bufferLength)>0) { \
        *(buffer)=(c); \
        ++(buffer); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
}
243
/*
 * Internal name choice value, equal to U_CHAR_NAME_CHOICE_COUNT.
 * expandName() and compareName() map it to field index 2 of the
 * semicolon-separated name data.
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
245
/*
 * Important: expandName() and compareName() are almost the same -
 * apply fixes to both.
 *
 * UnicodeData.txt uses ';' as a field separator, so no
 * field can contain ';' as part of its contents.
 * In unames.dat, it is marked as token[';']==-1 only if the
 * semicolon is used in the data file - which is iff we
 * have Unicode 1.0 names or ISO comments or aliases.
 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
 * although we know that it will never be part of a name.
 *
 * expandName() expands one compressed name line into buffer.
 * Each byte of the line is either a literal character or an index into the
 * token table that follows the UCharNames header:
 * - token value (uint16_t)(-2): the byte is the lead byte of a double-byte token
 * - token value (uint16_t)(-1): the byte is a literal character
 * - any other value: offset of a NUL-terminated word in the token strings
 *
 * @param names the loaded names data
 * @param name start of the compressed line (all fields of one code point)
 * @param nameLength number of bytes in the line
 * @param nameChoice which field of the line to expand
 * @param buffer output buffer; receives at most bufferLength chars and is
 *               NUL-terminated only if there is room left
 * @param bufferLength capacity of buffer
 * @return the length of the full expanded name, which can be larger than
 *         bufferLength if the output was truncated
 */
static uint16_t
expandName(UCharNames *names,
           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
           char *buffer, uint16_t bufferLength) {
    /* the token table is at uint16_t index 8, right after the four uint32_t of UCharNames */
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++, bufferPos=0;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            /* number of semicolon-terminated fields to skip before the requested one */
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
            do {
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            } while(--fieldIndex>0);
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested alternate name here
             */
            nameLength=0;
        }
    }

    /* write each letter directly, and write a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            if(c!=';') {
                /* implicit letter */
                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                if(c!=';') {
                    /* explicit letter */
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* write token word */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                }
            }
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}
345
/*
 * compareName() is almost the same as expandName() except that it compares
 * the currently expanded name to an input name.
 * It returns the match/no match result as soon as possible.
 *
 * The compressed line is decoded in the same way as in expandName()
 * (literal characters, single-byte and double-byte tokens), but each
 * decoded char is compared with the next char of otherName instead of
 * being written to a buffer.
 *
 * @param names the loaded names data
 * @param name start of the compressed line (all fields of one code point)
 * @param nameLength number of bytes in the line
 * @param nameChoice which field of the line to compare
 * @param otherName NUL-terminated name to compare with
 * @return TRUE if the selected field expands to exactly otherName
 */
static UBool
compareName(UCharNames *names,
            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
            const char *otherName) {
    /* the token table is at uint16_t index 8, right after the four uint32_t of UCharNames */
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;
    /* remembered to detect that nothing has been matched yet */
    const char *origOtherName = otherName;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            /* number of semicolon-terminated fields to skip before the requested one */
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
            do {
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            } while(--fieldIndex>0);
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested alternate name here
             */
            nameLength=0;
        }
    }

    /* compare each letter directly, and compare a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            if(c!=';') {
                /* implicit letter */
                if((char)c!=*otherName++) {
                    return FALSE;
                }
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                if(c!=';') {
                    /* explicit letter */
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* compare token word */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                }
            }
        }
    }

    /* complete match? */
    return (UBool)(*otherName==0);
}
441
442static uint8_t getCharCat(UChar32 cp) {
443    uint8_t cat;
444
445    if (UTF_IS_UNICODE_NONCHAR(cp)) {
446        return U_NONCHARACTER_CODE_POINT;
447    }
448
449    if ((cat = u_charType(cp)) == U_SURROGATE) {
450        cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
451    }
452
453    return cat;
454}
455
456static const char *getCharCatName(UChar32 cp) {
457    uint8_t cat = getCharCat(cp);
458
459    /* Return unknown if the table of names above is not up to
460       date. */
461
462    if (cat >= LENGTHOF(charCatNames)) {
463        return "unknown";
464    } else {
465        return charCatNames[cat];
466    }
467}
468
/*
 * Writes the extended name "<category-XXXX>" of a code point, where XXXX is
 * the code point value in uppercase hexadecimal with at least 4 digits.
 *
 * The output is truncated to bufferLength chars and is not NUL-terminated
 * here; callers terminate it themselves. Nothing is ever written beyond
 * buffer[bufferLength-1]: if the hex digits do not fit completely, only the
 * leading digits that fit are stored.
 *
 * @param code the code point
 * @param buffer output buffer
 * @param bufferLength capacity of buffer
 * @return the length of the full extended name, which can be larger than
 *         bufferLength if the output was truncated
 */
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    const char *catname = getCharCatName(code);
    uint16_t length = 0;

    /* unsigned so that the right shifts below always reach 0 */
    uint32_t cp;
    int ndigits, i;

    WRITE_CHAR(buffer, bufferLength, length, '<');
    /* length is 1 after the '<', so catname[length - 1] walks through the category name */
    while (catname[length - 1]) {
        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
    }
    WRITE_CHAR(buffer, bufferLength, length, '-');

    /* count the hex digits of code, with a minimum of 4 */
    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
        ;
    if (ndigits < 4)
        ndigits = 4;

    /* generate the digits from the least significant one, store only those that fit */
    for (cp = code, i = ndigits; i > 0; cp >>= 4) {
        uint8_t v = (uint8_t)(cp & 0xf);
        if (--i < (int)bufferLength) {
            buffer[i] = (char)(v < 10 ? '0' + v : 'A' + v - 10);
        }
    }

    /* advance past the digits that were stored, never past the end of the buffer */
    if (ndigits < (int)bufferLength) {
        buffer += ndigits;
        bufferLength = (uint16_t)(bufferLength - ndigits);
    } else {
        buffer += bufferLength;
        bufferLength = 0;
    }
    length = (uint16_t)(length + ndigits);

    WRITE_CHAR(buffer, bufferLength, length, '>');

    return length;
}
495
496/*
497 * getGroup() does a binary search for the group that contains the
498 * Unicode code point "code".
499 * The return value is always a valid Group* that may contain "code"
500 * or else is the highest group before "code".
501 * If the lowest group is after "code", then that one is returned.
502 */
503static const uint16_t *
504getGroup(UCharNames *names, uint32_t code) {
505    const uint16_t *groups=GET_GROUPS(names);
506    uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
507             start=0,
508             limit=*groups++,
509             number;
510
511    /* binary search for the group of names that contains the one for code */
512    while(start<limit-1) {
513        number=(uint16_t)((start+limit)/2);
514        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
515            limit=number;
516        } else {
517            start=number;
518        }
519    }
520
521    /* return this regardless of whether it is an exact match */
522    return groups+start*GROUP_LENGTH;
523}
524
/*
 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
 * expands them into offsets and lengths for each string.
 * Lengths are stored with a variable-width encoding in consecutive nibbles:
 * If a nibble<0xc, then it is the length itself (0=empty string).
 * If a nibble>=0xc, then it forms a length value with the following nibble.
 * Calculation see below.
 * The offsets and lengths arrays must be at least 33 (one more) long because
 * there is no check here at the end if the last nibble is still used.
 *
 * A two-nibble length n1,n2 has the value ((n1&3)<<4|n2)+12.
 *
 * @param s start of the compressed lengths of one group
 * @param offsets output: offset of each string, relative to the returned pointer
 * @param lengths output: length of each string
 * @return pointer to the first group string, directly after the lengths
 */
static const uint8_t *
expandGroupLengths(const uint8_t *s,
                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
    /* read the lengths of the 32 strings in this group and get each string's offset */
    /*
     * length doubles as state between iterations: a value >=12 left over
     * from an odd nibble means that this nibble starts a two-nibble length
     * which is completed by the next byte
     */
    uint16_t i=0, offset=0, length=0;
    uint8_t lengthByte;

    /* all 32 lengths must be read to get the offset of the first group string */
    while(i<LINES_PER_GROUP) {
        lengthByte=*s++;

        /* read even nibble - MSBs of lengthByte */
        if(length>=12) {
            /* double-nibble length spread across two bytes */
            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
            lengthByte&=0xf;
        } else if((lengthByte /* &0xf0 */)>=0xc0) {
            /* double-nibble length spread across this one byte */
            /* lengthByte keeps its high nibble, which marks the low nibble as consumed below */
            length=(uint16_t)((lengthByte&0x3f)+12);
        } else {
            /* single-nibble length in MSBs */
            length=(uint16_t)(lengthByte>>4);
            lengthByte&=0xf;
        }

        *offsets++=offset;
        *lengths++=length;

        offset+=length;
        ++i;

        /* read odd nibble - LSBs of lengthByte */
        if((lengthByte&0xf0)==0) {
            /* this nibble was not consumed for a double-nibble length above */
            length=lengthByte;
            if(length<12) {
                /* single-nibble length in LSBs */
                *offsets++=offset;
                *lengths++=length;

                offset+=length;
                ++i;
            }
            /* else: length>=12 is kept and completed with the next byte */
        } else {
            length=0;   /* prevent double-nibble detection in the next iteration */
        }
    }

    /* now, s is at the first group string */
    return s;
}
586
587static uint16_t
588expandGroupName(UCharNames *names, const uint16_t *group,
589                uint16_t lineNumber, UCharNameChoice nameChoice,
590                char *buffer, uint16_t bufferLength) {
591    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
592    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
593    s=expandGroupLengths(s, offsets, lengths);
594    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
595                      buffer, bufferLength);
596}
597
598static uint16_t
599getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
600        char *buffer, uint16_t bufferLength) {
601    const uint16_t *group=getGroup(names, code);
602    if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
603        return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
604                               buffer, bufferLength);
605    } else {
606        /* group not found */
607        /* zero-terminate */
608        if(bufferLength>0) {
609            *buffer=0;
610        }
611        return 0;
612    }
613}
614
615/*
616 * enumGroupNames() enumerates all the names in a 32-group
617 * and either calls the enumerator function or finds a given input name.
618 */
619static UBool
620enumGroupNames(UCharNames *names, const uint16_t *group,
621               UChar32 start, UChar32 end,
622               UEnumCharNamesFn *fn, void *context,
623               UCharNameChoice nameChoice) {
624    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
625    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
626
627    s=expandGroupLengths(s, offsets, lengths);
628    if(fn!=DO_FIND_NAME) {
629        char buffer[200];
630        uint16_t length;
631
632        while(start<=end) {
633            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
634            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
635                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
636            }
637            /* here, we assume that the buffer is large enough */
638            if(length>0) {
639                if(!fn(context, start, nameChoice, buffer, length)) {
640                    return FALSE;
641                }
642            }
643            ++start;
644        }
645    } else {
646        const char *otherName=((FindName *)context)->otherName;
647        while(start<=end) {
648            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
649                ((FindName *)context)->code=start;
650                return FALSE;
651            }
652            ++start;
653        }
654    }
655    return TRUE;
656}
657
658/*
659 * enumExtNames enumerate extended names.
660 * It only needs to do it if it is called with a real function and not
661 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
662 * for extended names by itself.
663 */
664static UBool
665enumExtNames(UChar32 start, UChar32 end,
666             UEnumCharNamesFn *fn, void *context)
667{
668    if(fn!=DO_FIND_NAME) {
669        char buffer[200];
670        uint16_t length;
671
672        while(start<=end) {
673            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
674            /* here, we assume that the buffer is large enough */
675            if(length>0) {
676                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
677                    return FALSE;
678                }
679            }
680            ++start;
681        }
682    }
683
684    return TRUE;
685}
686
/*
 * enumNames() enumerates (or, with fn==DO_FIND_NAME, searches) the names of
 * the code points from start to limit-1 using the group data.
 * Stored names are handled per group by enumGroupNames(). For
 * U_EXTENDED_CHAR_NAME, code points in gaps between groups and after the
 * last group are handled by enumExtNames().
 *
 * @param names the loaded names data
 * @param start first code point
 * @param limit one past the last code point
 * @param fn enumerator function, or DO_FIND_NAME
 * @param context passed through to fn, or a FindName * for DO_FIND_NAME
 * @param nameChoice which kind of name to enumerate
 * @return FALSE as soon as enumGroupNames() or enumExtNames() returns FALSE,
 *         otherwise TRUE
 */
static UBool
enumNames(UCharNames *names,
          UChar32 start, UChar32 limit,
          UEnumCharNamesFn *fn, void *context,
          UCharNameChoice nameChoice) {
    uint16_t startGroupMSB, endGroupMSB, groupCount;
    const uint16_t *group, *groupLimit;

    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);

    /* find the group that contains start, or the highest before it */
    group=getGroup(names, start);

    if(startGroupMSB==endGroupMSB) {
        if(startGroupMSB==group[GROUP_MSB]) {
            /* if start and limit-1 are in the same group, then enumerate only in that one */
            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
        }
        /* else: no group for this range, fall through to the extended names below */
    } else {
        const uint16_t *groups=GET_GROUPS(names);
        groupCount=*groups++;
        groupLimit=groups+groupCount*GROUP_LENGTH;

        if(startGroupMSB==group[GROUP_MSB]) {
            /* enumerate characters in the partial start group */
            if((start&GROUP_MASK)!=0) {
                if(!enumGroupNames(names, group,
                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
                                   fn, context, nameChoice)) {
                    return FALSE;
                }
                group=NEXT_GROUP(group); /* continue with the next group */
            }
        } else if(startGroupMSB>group[GROUP_MSB]) {
            /* make sure that we start enumerating with the first group after start */
            const uint16_t *nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* extended names for the gap from start up to the next group (or limit) */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames(start, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate entire groups between the start- and end-groups */
        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
            const uint16_t *nextGroup;
            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
                return FALSE;
            }
            nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* extended names for the gap between this group and the next one (or limit) */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
            /* past the last group: continue below with extended names after that group */
            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
            if (next > start) {
                start = next;
            }
        } else {
            return TRUE;
        }
    }

    /* we have not found a group, which means everything is made of
       extended names. */
    if (nameChoice == U_EXTENDED_CHAR_NAME) {
        if (limit > UCHAR_MAX_VALUE + 1) {
            limit = UCHAR_MAX_VALUE + 1;
        }
        return enumExtNames(start, limit - 1, fn, context);
    }

    return TRUE;
}
780
/*
 * Writes the factorized suffix of an algorithmic name.
 * code is split into one index per factor by modulo arithmetic, from the
 * last factor to the first. The suffix elements are stored as consecutive
 * NUL-terminated strings, factors[i] of them for factor i; the element
 * selected by each index is appended to buffer.
 *
 * @param factors the count factors
 * @param count number of factors
 * @param s suffix elements
 * @param code value to factorize
 * @param indexes output: the index computed for each factor
 * @param elementBases output if not NULL: start of the element strings of each factor
 * @param elements output if not NULL: the selected element string of each factor
 * @param buffer output buffer; truncated to bufferLength chars and
 *               NUL-terminated only if there is room left
 * @param bufferLength capacity of buffer
 * @return the length of the full suffix, which can be larger than bufferLength
 */
static uint16_t
writeFactorSuffix(const uint16_t *factors, uint16_t count,
                  const char *s, /* suffix elements */
                  uint32_t code,
                  uint16_t indexes[8], /* output fields from here */
                  const char *elementBases[8], const char *elements[8],
                  char *buffer, uint16_t bufferLength) {
    uint16_t last=(uint16_t)(count-1);
    uint16_t pos, skip, written=0;
    char ch;

    /* split code into the indexes, last factor first */
    for(pos=last; pos>0; --pos) {
        uint16_t divisor=factors[pos];
        indexes[pos]=(uint16_t)(code%divisor);
        code/=divisor;
    }
    /* no modulus is applied for the first factor: the remaining value is used as is */
    indexes[0]=(uint16_t)code;

    /* write the selected element of each factor */
    for(pos=0;; ++pos) {
        if(elementBases!=NULL) {
            *elementBases++=s;
        }

        /* skip the indexes[pos] strings before the selected one */
        for(skip=indexes[pos]; skip>0; --skip) {
            while(*s++!=0) {}
        }
        if(elements!=NULL) {
            *elements++=s;
        }

        /* write the selected element */
        while((ch=*s++)!=0) {
            WRITE_CHAR(buffer, bufferLength, written, ch);
        }

        /* nothing follows the last factor's element */
        if(pos>=last) {
            break;
        }

        /* skip the remaining strings of this factor */
        for(skip=(uint16_t)(factors[pos]-indexes[pos]-1); skip>0; --skip) {
            while(*s++!=0) {}
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return written;
}
854
855/*
856 * Important:
857 * Parts of findAlgName() are almost the same as some of getAlgName().
858 * Fixes must be applied to both.
859 */
860static uint16_t
861getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
862        char *buffer, uint16_t bufferLength) {
863    uint16_t bufferPos=0;
864
865    /* Only the normative character name can be algorithmic. */
866    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
867        /* zero-terminate */
868        if(bufferLength>0) {
869            *buffer=0;
870        }
871        return 0;
872    }
873
874    switch(range->type) {
875    case 0: {
876        /* name = prefix hex-digits */
877        const char *s=(const char *)(range+1);
878        char c;
879
880        uint16_t i, count;
881
882        /* copy prefix */
883        while((c=*s++)!=0) {
884            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
885        }
886
887        /* write hexadecimal code point value */
888        count=range->variant;
889
890        /* zero-terminate */
891        if(count<bufferLength) {
892            buffer[count]=0;
893        }
894
895        for(i=count; i>0;) {
896            if(--i<bufferLength) {
897                c=(char)(code&0xf);
898                if(c<10) {
899                    c+='0';
900                } else {
901                    c+='A'-10;
902                }
903                buffer[i]=c;
904            }
905            code>>=4;
906        }
907
908        bufferPos+=count;
909        break;
910    }
911    case 1: {
912        /* name = prefix factorized-elements */
913        uint16_t indexes[8];
914        const uint16_t *factors=(const uint16_t *)(range+1);
915        uint16_t count=range->variant;
916        const char *s=(const char *)(factors+count);
917        char c;
918
919        /* copy prefix */
920        while((c=*s++)!=0) {
921            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
922        }
923
924        bufferPos+=writeFactorSuffix(factors, count,
925                                     s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
926        break;
927    }
928    default:
929        /* undefined type */
930        /* zero-terminate */
931        if(bufferLength>0) {
932            *buffer=0;
933        }
934        break;
935    }
936
937    return bufferPos;
938}
939
/*
 * Enumerates the algorithmic names of the code points start..limit-1
 * inside one algorithmic range and hands each one to the callback fn.
 *
 * Important: enumAlgNames() and findAlgName() are almost the same.
 * Any fix must be applied to both.
 *
 * @param range      the algorithmic range that describes how names are built
 * @param start      first code point to enumerate; its name is always
 *                   generated, so the caller is expected to pass start<limit
 * @param limit      one past the last code point to enumerate
 * @param fn         callback invoked as fn(context, code, nameChoice, name, length)
 * @param context    opaque pointer passed through to fn
 * @param nameChoice only U_UNICODE_CHAR_NAME and U_EXTENDED_CHAR_NAME have
 *                   algorithmic names; for any other choice nothing is enumerated
 * @return FALSE as soon as fn returns FALSE, otherwise TRUE
 */
static UBool
enumAlgNames(AlgorithmicRange *range,
             UChar32 start, UChar32 limit,
             UEnumCharNamesFn *fn, void *context,
             UCharNameChoice nameChoice) {
    /* scratch space in which each name is assembled before it is passed to fn */
    char buffer[200];
    uint16_t length;

    /* Only the normative character name can be algorithmic. */
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        return TRUE;
    }

    switch(range->type) {
    case 0: {
        /* name = prefix hex-digits */
        char *s, *end;
        char c;

        /* get the full name of the start character */
        length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
        /* length is unsigned, so this is effectively a test for an empty name */
        if(length<=0) {
            return TRUE;
        }

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* go to the end of the name; all these names have the same length */
        end=buffer;
        while(*end!=0) {
            ++end;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /* increment the hexadecimal number on a character-basis */
            s=end;
            for (;;) {
                /* walk backwards from the least significant digit */
                c=*--s;
                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
                    /* 0..8 and A..E: simply step to the next digit, no carry */
                    *s=(char)(c+1);
                    break;
                } else if(c=='9') {
                    /* 9 is followed by A in uppercase hexadecimal */
                    *s='A';
                    break;
                } else if(c=='F') {
                    /* F wraps to 0; the loop continues to carry into the previous digit */
                    *s='0';
                }
                /*
                 * NOTE(review): any other character keeps the loop walking backwards;
                 * this presumably cannot happen while start stays inside the range - confirm.
                 */
            }

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    case 1: {
        /* per-factor state filled in by writeFactorSuffix() and stepped below */
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        /* the factors follow the range header; the prefix string follows the factors */
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t count=range->variant;
        const char *s=(const char *)(factors+count);
        char *suffix, *t;
        uint16_t prefixLength, i, idx;

        char c;

        /* name = prefix factorized-elements */

        /* copy prefix */
        /* NOTE(review): copied without a check against sizeof(buffer); presumably the data keeps names short - confirm */
        suffix=buffer;
        prefixLength=0;
        while((c=*s++)!=0) {
            *suffix++=c;
            ++prefixLength;
        }

        /* append the suffix of the start character */
        length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
                                              s, (uint32_t)start-range->start,
                                              indexes, elementBases, elements,
                                              suffix, (uint16_t)(sizeof(buffer)-prefixLength)));

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /* increment the indexes in lexical order bound by the factors */
            i=count;
            for (;;) {
                /* start with the last factor and carry towards the first one */
                idx=(uint16_t)(indexes[--i]+1);
                if(idx<factors[i]) {
                    /* skip one index and its element string */
                    indexes[i]=idx;
                    s=elements[i];
                    while(*s++!=0) {
                    }
                    elements[i]=s;
                    break;
                } else {
                    /* reset this index to 0 and its element string to the first one */
                    indexes[i]=0;
                    elements[i]=elementBases[i];
                }
            }

            /* to make matters a little easier, just append all elements to the suffix */
            t=suffix;
            length=prefixLength;
            for(i=0; i<count; ++i) {
                s=elements[i];
                while((c=*s++)!=0) {
                    *t++=c;
                    ++length;
                }
            }
            /* zero-terminate */
            *t=0;

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return TRUE;
}
1080
/*
 * findAlgName() is almost the same as enumAlgNames() except that it
 * returns the code point for a name if it fits into the range.
 * It returns 0xffff otherwise.
 *
 * @param range      the algorithmic range to test the name against
 * @param nameChoice only U_UNICODE_CHAR_NAME and U_EXTENDED_CHAR_NAME can
 *                   match; any other choice yields 0xffff immediately
 * @param otherName  NUL-terminated name to look up; it is compared byte for
 *                   byte, and hex digits must be 0-9 or uppercase A-F
 * @return the matching code point, or 0xffff if the name does not belong
 *         to this range
 */
static UChar32
findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
    UChar32 code;

    /* Only the normative character name can be algorithmic. */
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        return 0xffff;
    }

    switch(range->type) {
    case 0: {
        /* name = prefix hex-digits */
        const char *s=(const char *)(range+1);
        char c;

        uint16_t i, count;

        /* compare prefix */
        while((c=*s++)!=0) {
            if((char)c!=*otherName++) {
                return 0xffff;
            }
        }

        /* read hexadecimal code point value */
        /* exactly range->variant digits are consumed; a NUL or any other non-digit ends the search */
        count=range->variant;
        code=0;
        for(i=0; i<count; ++i) {
            c=*otherName++;
            if('0'<=c && c<='9') {
                code=(code<<4)|(c-'0');
            } else if('A'<=c && c<='F') {
                code=(code<<4)|(c-'A'+10);
            } else {
                return 0xffff;
            }
        }

        /* does it fit into the range? */
        /* the name must also end right after the digits */
        if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
            return code;
        }
        break;
    }
    case 1: {
        /* receives the suffix of the first code point of the range */
        char buffer[64];
        /* per-factor state filled in by writeFactorSuffix() and stepped below */
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        /* the factors follow the range header; the prefix string follows the factors */
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t count=range->variant;
        const char *s=(const char *)(factors+count), *t;
        UChar32 start, limit;
        uint16_t i, idx;

        char c;

        /* name = prefix factorized-elements */

        /* compare prefix */
        while((c=*s++)!=0) {
            if((char)c!=*otherName++) {
                return 0xffff;
            }
        }

        /* from here on otherName points at the suffix that follows the prefix */
        start=(UChar32)range->start;
        limit=(UChar32)(range->end+1);

        /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
        writeFactorSuffix(factors, count, s, 0,
                          indexes, elementBases, elements, buffer, sizeof(buffer));

        /* compare the first suffix */
        if(0==uprv_strcmp(otherName, buffer)) {
            return start;
        }

        /* enumerate and compare the rest of the suffixes */
        while(++start<limit) {
            /* increment the indexes in lexical order bound by the factors */
            i=count;
            for (;;) {
                /* start with the last factor and carry towards the first one */
                idx=(uint16_t)(indexes[--i]+1);
                if(idx<factors[i]) {
                    /* skip one index and its element string */
                    indexes[i]=idx;
                    s=elements[i];
                    while(*s++!=0) {}
                    elements[i]=s;
                    break;
                } else {
                    /* reset this index to 0 and its element string to the first one */
                    indexes[i]=0;
                    elements[i]=elementBases[i];
                }
            }

            /* to make matters a little easier, just compare all elements of the suffix */
            t=otherName;
            for(i=0; i<count; ++i) {
                s=elements[i];
                while((c=*s++)!=0) {
                    if(c!=*t++) {
                        /*
                         * Mismatch: pointing s at an empty string ends the inner loop,
                         * and i=99 both ends the outer loop and marks the failure
                         * for the i<99 test below.
                         */
                        s=""; /* does not match */
                        i=99;
                    }
                }
            }
            /* all elements matched and the other name has no trailing characters */
            if(i<99 && *t==0) {
                return start;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return 0xffff;
}
1206
1207/* sets of name characters, maximum name lengths ---------------------------- */
1208
/*
 * Bit-set helpers for a set of 256 byte values stored in uint32_t[8]:
 * the top 3 bits of the byte select the word, the low 5 bits select the bit.
 * The argument c is parenthesized so that an expression argument
 * (e.g. a conditional or a sum) is converted to uint8_t as a whole;
 * note that c is still evaluated twice, so it must not have side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1211
/*
 * Adds every char of the NUL-terminated string s to the 256-bit set
 * and returns the number of chars in front of the terminator.
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *cursor;

    for(cursor=s; *cursor!=0; ++cursor) {
        char ch=*cursor;
        SET_ADD(set, ch);
    }

    /* the distance walked is the string length */
    return (int32_t)(cursor-s);
}
1223
1224static int32_t
1225calcAlgNameSetsLengths(int32_t maxNameLength) {
1226    AlgorithmicRange *range;
1227    uint32_t *p;
1228    uint32_t rangeCount;
1229    int32_t length;
1230
1231    /* enumerate algorithmic ranges */
1232    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1233    rangeCount=*p;
1234    range=(AlgorithmicRange *)(p+1);
1235    while(rangeCount>0) {
1236        switch(range->type) {
1237        case 0:
1238            /* name = prefix + (range->variant times) hex-digits */
1239            /* prefix */
1240            length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1241            if(length>maxNameLength) {
1242                maxNameLength=length;
1243            }
1244            break;
1245        case 1: {
1246            /* name = prefix factorized-elements */
1247            const uint16_t *factors=(const uint16_t *)(range+1);
1248            const char *s;
1249            int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1250
1251            /* prefix length */
1252            s=(const char *)(factors+count);
1253            length=calcStringSetLength(gNameSet, s);
1254            s+=length+1; /* start of factor suffixes */
1255
1256            /* get the set and maximum factor suffix length for each factor */
1257            for(i=0; i<count; ++i) {
1258                maxFactorLength=0;
1259                for(factor=factors[i]; factor>0; --factor) {
1260                    factorLength=calcStringSetLength(gNameSet, s);
1261                    s+=factorLength+1;
1262                    if(factorLength>maxFactorLength) {
1263                        maxFactorLength=factorLength;
1264                    }
1265                }
1266                length+=maxFactorLength;
1267            }
1268
1269            if(length>maxNameLength) {
1270                maxNameLength=length;
1271            }
1272            break;
1273        }
1274        default:
1275            /* unknown type */
1276            break;
1277        }
1278
1279        range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1280        --rangeCount;
1281    }
1282    return maxNameLength;
1283}
1284
1285static int32_t
1286calcExtNameSetsLengths(int32_t maxNameLength) {
1287    int32_t i, length;
1288
1289    for(i=0; i<LENGTHOF(charCatNames); ++i) {
1290        /*
1291         * for each category, count the length of the category name
1292         * plus 9=
1293         * 2 for <>
1294         * 1 for -
1295         * 6 for most hex digits per code point
1296         */
1297        length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1298        if(length>maxNameLength) {
1299            maxNameLength=length;
1300        }
1301    }
1302    return maxNameLength;
1303}
1304
/*
 * Measures one ';'-terminated name field of a group line without expanding it,
 * adding every character of the expanded name to set.
 *
 * @param tokens       token table, indexed by single bytes or lead<<8|trail
 * @param tokenCount   number of entries in tokens
 * @param tokenStrings base of the token strings that the table entries point into
 * @param tokenLengths optional cache of token word lengths, may be NULL;
 *                     an entry of 0 means "not computed yet"
 * @param set          256-bit set that receives the name characters
 * @param pLine        in: start of the field; out: position after the field
 *                     (after the ';' if one was consumed)
 * @param lineLimit    end of the line
 * @return the length of the expanded name
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *cursor=*pLine;
    int32_t total=0;

    for(;;) {
        uint16_t unit, token;
        int32_t wordLength;

        if(cursor==lineLimit) {
            break;
        }
        unit=*cursor++;
        if(unit==(uint8_t)';') {
            /* field separator, already consumed */
            break;
        }

        if(unit>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, unit);
            ++total;
            continue;
        }

        token=tokens[unit];
        if(token==(uint16_t)(-2)) {
            /* this is a lead byte for a double-byte token */
            unit=(uint16_t)((unit<<8)|*cursor++);
            token=tokens[unit];
        }

        if(token==(uint16_t)(-1)) {
            /* explicit letter */
            SET_ADD(set, unit);
            ++total;
            continue;
        }

        /* count token word */
        if(tokenLengths==NULL) {
            wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
        } else {
            /* use the cached token length, filling the cache on first use */
            wordLength=tokenLengths[unit];
            if(wordLength==0) {
                wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                tokenLengths[unit]=(int8_t)wordLength;
            }
        }
        total+=wordLength;
    }

    *pLine=cursor;
    return total;
}
1349
1350static void
1351calcGroupNameSetsLengths(int32_t maxNameLength) {
1352    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1353
1354    uint16_t *tokens=(uint16_t *)uCharNames+8;
1355    uint16_t tokenCount=*tokens++;
1356    uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1357
1358    int8_t *tokenLengths;
1359
1360    const uint16_t *group;
1361    const uint8_t *s, *line, *lineLimit;
1362
1363    int32_t groupCount, lineNumber, length;
1364
1365    tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1366    if(tokenLengths!=NULL) {
1367        uprv_memset(tokenLengths, 0, tokenCount);
1368    }
1369
1370    group=GET_GROUPS(uCharNames);
1371    groupCount=*group++;
1372
1373    /* enumerate all groups */
1374    while(groupCount>0) {
1375        s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1376        s=expandGroupLengths(s, offsets, lengths);
1377
1378        /* enumerate all lines in each group */
1379        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1380            line=s+offsets[lineNumber];
1381            length=lengths[lineNumber];
1382            if(length==0) {
1383                continue;
1384            }
1385
1386            lineLimit=line+length;
1387
1388            /* read regular name */
1389            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390            if(length>maxNameLength) {
1391                maxNameLength=length;
1392            }
1393            if(line==lineLimit) {
1394                continue;
1395            }
1396
1397            /* read Unicode 1.0 name */
1398            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1399            if(length>maxNameLength) {
1400                maxNameLength=length;
1401            }
1402            if(line==lineLimit) {
1403                continue;
1404            }
1405
1406            /* read ISO comment */
1407            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1408        }
1409
1410        group=NEXT_GROUP(group);
1411        --groupCount;
1412    }
1413
1414    if(tokenLengths!=NULL) {
1415        uprv_free(tokenLengths);
1416    }
1417
1418    /* set gMax... - name length last for threading */
1419    gMaxNameLength=maxNameLength;
1420}
1421
1422static UBool
1423calcNameSetsLengths(UErrorCode *pErrorCode) {
1424    static const char extChars[]="0123456789ABCDEF<>-";
1425    int32_t i, maxNameLength;
1426
1427    if(gMaxNameLength!=0) {
1428        return TRUE;
1429    }
1430
1431    if(!isDataLoaded(pErrorCode)) {
1432        return FALSE;
1433    }
1434
1435    /* set hex digits, used in various names, and <>-, used in extended names */
1436    for(i=0; i<sizeof(extChars)-1; ++i) {
1437        SET_ADD(gNameSet, extChars[i]);
1438    }
1439
1440    /* set sets and lengths from algorithmic names */
1441    maxNameLength=calcAlgNameSetsLengths(0);
1442
1443    /* set sets and lengths from extended names */
1444    maxNameLength=calcExtNameSetsLengths(maxNameLength);
1445
1446    /* set sets and lengths from group names, set global maximum values */
1447    calcGroupNameSetsLengths(maxNameLength);
1448
1449    return TRUE;
1450}
1451
1452/* public API --------------------------------------------------------------- */
1453
1454U_CAPI int32_t U_EXPORT2
1455u_charName(UChar32 code, UCharNameChoice nameChoice,
1456           char *buffer, int32_t bufferLength,
1457           UErrorCode *pErrorCode) {
1458    AlgorithmicRange *algRange;
1459    uint32_t *p;
1460    uint32_t i;
1461    int32_t length;
1462
1463    /* check the argument values */
1464    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1465        return 0;
1466    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1467              bufferLength<0 || (bufferLength>0 && buffer==NULL)
1468    ) {
1469        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1470        return 0;
1471    }
1472
1473    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1474        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1475    }
1476
1477    length=0;
1478
1479    /* try algorithmic names first */
1480    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1481    i=*p;
1482    algRange=(AlgorithmicRange *)(p+1);
1483    while(i>0) {
1484        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1485            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1486            break;
1487        }
1488        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1489        --i;
1490    }
1491
1492    if(i==0) {
1493        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1494            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1495            if (!length) {
1496                /* extended character name */
1497                length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1498            }
1499        } else {
1500            /* normal character name */
1501            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1502        }
1503    }
1504
1505    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1506}
1507
1508U_CAPI int32_t U_EXPORT2
1509u_getISOComment(UChar32 c,
1510                char *dest, int32_t destCapacity,
1511                UErrorCode *pErrorCode) {
1512    int32_t length;
1513
1514    /* check the argument values */
1515    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1516        return 0;
1517    } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1518        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1519        return 0;
1520    }
1521
1522    if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1523        return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1524    }
1525
1526    /* the ISO comment is stored like a normal character name */
1527    length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);
1528    return u_terminateChars(dest, destCapacity, length, pErrorCode);
1529}
1530
1531U_CAPI UChar32 U_EXPORT2
1532u_charFromName(UCharNameChoice nameChoice,
1533               const char *name,
1534               UErrorCode *pErrorCode) {
1535    char upper[120], lower[120];
1536    FindName findName;
1537    AlgorithmicRange *algRange;
1538    uint32_t *p;
1539    uint32_t i;
1540    UChar32 cp = 0;
1541    char c0;
1542    UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1543
1544    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1545        return error;
1546    }
1547
1548    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1549        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1550        return error;
1551    }
1552
1553    if(!isDataLoaded(pErrorCode)) {
1554        return error;
1555    }
1556
1557    /* construct the uppercase and lowercase of the name first */
1558    for(i=0; i<sizeof(upper); ++i) {
1559        if((c0=*name++)!=0) {
1560            upper[i]=uprv_toupper(c0);
1561            lower[i]=uprv_tolower(c0);
1562        } else {
1563            upper[i]=lower[i]=0;
1564            break;
1565        }
1566    }
1567    if(i==sizeof(upper)) {
1568        /* name too long, there is no such character */
1569        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1570        return error;
1571    }
1572
1573    /* try extended names first */
1574    if (lower[0] == '<') {
1575        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1576            if (lower[--i] == '>') {
1577                for (--i; lower[i] && lower[i] != '-'; --i) {
1578                }
1579
1580                if (lower[i] == '-') { /* We've got a category. */
1581                    uint32_t cIdx;
1582
1583                    lower[i] = 0;
1584
1585                    for (++i; lower[i] != '>'; ++i) {
1586                        if (lower[i] >= '0' && lower[i] <= '9') {
1587                            cp = (cp << 4) + lower[i] - '0';
1588                        } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1589                            cp = (cp << 4) + lower[i] - 'a' + 10;
1590                        } else {
1591                            *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1592                            return error;
1593                        }
1594                    }
1595
1596                    /* Now validate the category name.
1597                       We could use a binary search, or a trie, if
1598                       we really wanted to. */
1599
1600                    for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1601
1602                        if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1603                            if (getCharCat(cp) == cIdx) {
1604                                return cp;
1605                            }
1606                            break;
1607                        }
1608                    }
1609                }
1610            }
1611        }
1612
1613        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1614        return error;
1615    }
1616
1617    /* try algorithmic names now */
1618    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1619    i=*p;
1620    algRange=(AlgorithmicRange *)(p+1);
1621    while(i>0) {
1622        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1623            return cp;
1624        }
1625        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1626        --i;
1627    }
1628
1629    /* normal character name */
1630    findName.otherName=upper;
1631    findName.code=error;
1632    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1633    if (findName.code == error) {
1634         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1635    }
1636    return findName.code;
1637}
1638
1639U_CAPI void U_EXPORT2
1640u_enumCharNames(UChar32 start, UChar32 limit,
1641                UEnumCharNamesFn *fn,
1642                void *context,
1643                UCharNameChoice nameChoice,
1644                UErrorCode *pErrorCode) {
1645    AlgorithmicRange *algRange;
1646    uint32_t *p;
1647    uint32_t i;
1648
1649    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1650        return;
1651    }
1652
1653    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1654        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1655        return;
1656    }
1657
1658    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1659        limit = UCHAR_MAX_VALUE + 1;
1660    }
1661    if((uint32_t)start>=(uint32_t)limit) {
1662        return;
1663    }
1664
1665    if(!isDataLoaded(pErrorCode)) {
1666        return;
1667    }
1668
1669    /* interleave the data-driven ones with the algorithmic ones */
1670    /* iterate over all algorithmic ranges; assume that they are in ascending order */
1671    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1672    i=*p;
1673    algRange=(AlgorithmicRange *)(p+1);
1674    while(i>0) {
1675        /* enumerate the character names before the current algorithmic range */
1676        /* here: start<limit */
1677        if((uint32_t)start<algRange->start) {
1678            if((uint32_t)limit<=algRange->start) {
1679                enumNames(uCharNames, start, limit, fn, context, nameChoice);
1680                return;
1681            }
1682            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1683                return;
1684            }
1685            start=(UChar32)algRange->start;
1686        }
1687        /* enumerate the character names in the current algorithmic range */
1688        /* here: algRange->start<=start<limit */
1689        if((uint32_t)start<=algRange->end) {
1690            if((uint32_t)limit<=(algRange->end+1)) {
1691                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1692                return;
1693            }
1694            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1695                return;
1696            }
1697            start=(UChar32)algRange->end+1;
1698        }
1699        /* continue to the next algorithmic range (here: start<limit) */
1700        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1701        --i;
1702    }
1703    /* enumerate the character names after the last algorithmic range */
1704    enumNames(uCharNames, start, limit, fn, context, nameChoice);
1705}
1706
1707U_CAPI int32_t U_EXPORT2
1708uprv_getMaxCharNameLength() {
1709    UErrorCode errorCode=U_ZERO_ERROR;
1710    if(calcNameSetsLengths(&errorCode)) {
1711        return gMaxNameLength;
1712    } else {
1713        return 0;
1714    }
1715}
1716
1717/**
1718 * Converts the char set cset into a Unicode set uset.
1719 * @param cset Set of 256 bit flags corresponding to a set of chars.
1720 * @param uset USet to receive characters. Existing contents are deleted.
1721 */
1722static void
1723charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1724    UChar us[256];
1725    char cs[256];
1726
1727    int32_t i, length;
1728    UErrorCode errorCode;
1729
1730    errorCode=U_ZERO_ERROR;
1731
1732    if(!calcNameSetsLengths(&errorCode)) {
1733        return;
1734    }
1735
1736    /* build a char string with all chars that are used in character names */
1737    length=0;
1738    for(i=0; i<256; ++i) {
1739        if(SET_CONTAINS(cset, i)) {
1740            cs[length++]=(char)i;
1741        }
1742    }
1743
1744    /* convert the char string to a UChar string */
1745    u_charsToUChars(cs, us, length);
1746
1747    /* add each UChar to the USet */
1748    for(i=0; i<length; ++i) {
1749        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1750            sa->add(sa->set, us[i]);
1751        }
1752    }
1753}
1754
/**
 * Adds the characters that are used in Unicode character names,
 * as collected in gNameSet, via the given USetAdder.
 * @param sa USetAdder that receives the characters; it is passed on
 *           to charSetToUSet() unchanged.
 */
U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(const USetAdder *sa) {
    charSetToUSet(gNameSet, sa);
}
1763
1764/* data swapping ------------------------------------------------------------ */
1765
1766/*
1767 * The token table contains non-negative entries for token bytes,
1768 * and -1 for bytes that represent themselves in the data file's charset.
1769 * -2 entries are used for lead bytes.
1770 *
1771 * Direct bytes (-1 entries) must be translated from the input charset family
1772 * to the output charset family.
1773 * makeTokenMap() writes a permutation mapping for this.
1774 * Use it once for single-/lead-byte tokens and once more for all trail byte
1775 * tokens. (';' is an unused trail byte marked with -1.)
1776 */
1777static void
1778makeTokenMap(const UDataSwapper *ds,
1779             int16_t tokens[], uint16_t tokenCount,
1780             uint8_t map[256],
1781             UErrorCode *pErrorCode) {
1782    UBool usedOutChar[256];
1783    uint16_t i, j;
1784    uint8_t c1, c2;
1785
1786    if(U_FAILURE(*pErrorCode)) {
1787        return;
1788    }
1789
1790    if(ds->inCharset==ds->outCharset) {
1791        /* Same charset family: identity permutation */
1792        for(i=0; i<256; ++i) {
1793            map[i]=(uint8_t)i;
1794        }
1795    } else {
1796        uprv_memset(map, 0, 256);
1797        uprv_memset(usedOutChar, 0, 256);
1798
1799        if(tokenCount>256) {
1800            tokenCount=256;
1801        }
1802
1803        /* set the direct bytes (byte 0 always maps to itself) */
1804        for(i=1; i<tokenCount; ++i) {
1805            if(tokens[i]==-1) {
1806                /* convert the direct byte character */
1807                c1=(uint8_t)i;
1808                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1809                if(U_FAILURE(*pErrorCode)) {
1810                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1811                                     i, ds->inCharset);
1812                    return;
1813                }
1814
1815                /* enter the converted character into the map and mark it used */
1816                map[c1]=c2;
1817                usedOutChar[c2]=TRUE;
1818            }
1819        }
1820
1821        /* set the mappings for the rest of the permutation */
1822        for(i=j=1; i<tokenCount; ++i) {
1823            /* set mappings that were not set for direct bytes */
1824            if(map[i]==0) {
1825                /* set an output byte value that was not used as an output byte above */
1826                while(usedOutChar[j]) {
1827                    ++j;
1828                }
1829                map[i]=(uint8_t)j++;
1830            }
1831        }
1832
1833        /*
1834         * leave mappings at tokenCount and above unset if tokenCount<256
1835         * because they won't be used
1836         */
1837    }
1838}
1839
1840U_CAPI int32_t U_EXPORT2
1841uchar_swapNames(const UDataSwapper *ds,
1842                const void *inData, int32_t length, void *outData,
1843                UErrorCode *pErrorCode) {
1844    const UDataInfo *pInfo;
1845    int32_t headerSize;
1846
1847    const uint8_t *inBytes;
1848    uint8_t *outBytes;
1849
1850    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1851             offset, i, count, stringsCount;
1852
1853    const AlgorithmicRange *inRange;
1854    AlgorithmicRange *outRange;
1855
1856    /* udata_swapDataHeader checks the arguments */
1857    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1858    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1859        return 0;
1860    }
1861
1862    /* check data format and format version */
1863    pInfo=(const UDataInfo *)((const char *)inData+4);
1864    if(!(
1865        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
1866        pInfo->dataFormat[1]==0x6e &&
1867        pInfo->dataFormat[2]==0x61 &&
1868        pInfo->dataFormat[3]==0x6d &&
1869        pInfo->formatVersion[0]==1
1870    )) {
1871        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1872                         pInfo->dataFormat[0], pInfo->dataFormat[1],
1873                         pInfo->dataFormat[2], pInfo->dataFormat[3],
1874                         pInfo->formatVersion[0]);
1875        *pErrorCode=U_UNSUPPORTED_ERROR;
1876        return 0;
1877    }
1878
1879    inBytes=(const uint8_t *)inData+headerSize;
1880    outBytes=(uint8_t *)outData+headerSize;
1881    if(length<0) {
1882        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1883    } else {
1884        length-=headerSize;
1885        if( length<20 ||
1886            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1887        ) {
1888            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1889                             length);
1890            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1891            return 0;
1892        }
1893    }
1894
1895    if(length<0) {
1896        /* preflighting: iterate through algorithmic ranges */
1897        offset=algNamesOffset;
1898        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1899        offset+=4;
1900
1901        for(i=0; i<count; ++i) {
1902            inRange=(const AlgorithmicRange *)(inBytes+offset);
1903            offset+=ds->readUInt16(inRange->size);
1904        }
1905    } else {
1906        /* swap data */
1907        const uint16_t *p;
1908        uint16_t *q, *temp;
1909
1910        int16_t tokens[512];
1911        uint16_t tokenCount;
1912
1913        uint8_t map[256], trailMap[256];
1914
1915        /* copy the data for inaccessible bytes */
1916        if(inBytes!=outBytes) {
1917            uprv_memcpy(outBytes, inBytes, length);
1918        }
1919
1920        /* the initial 4 offsets first */
1921        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1922        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1923        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1924        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1925
1926        /*
1927         * now the tokens table
1928         * it needs to be permutated along with the compressed name strings
1929         */
1930        p=(const uint16_t *)(inBytes+16);
1931        q=(uint16_t *)(outBytes+16);
1932
1933        /* read and swap the tokenCount */
1934        tokenCount=ds->readUInt16(*p);
1935        ds->swapArray16(ds, p, 2, q, pErrorCode);
1936        ++p;
1937        ++q;
1938
1939        /* read the first 512 tokens and make the token maps */
1940        if(tokenCount<=512) {
1941            count=tokenCount;
1942        } else {
1943            count=512;
1944        }
1945        for(i=0; i<count; ++i) {
1946            tokens[i]=udata_readInt16(ds, p[i]);
1947        }
1948        for(; i<512; ++i) {
1949            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1950        }
1951        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1952        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1953        if(U_FAILURE(*pErrorCode)) {
1954            return 0;
1955        }
1956
1957        /*
1958         * swap and permutate the tokens
1959         * go through a temporary array to support in-place swapping
1960         */
1961        temp=(uint16_t *)uprv_malloc(tokenCount*2);
1962        if(temp==NULL) {
1963            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1964                             tokenCount);
1965            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1966            return 0;
1967        }
1968
1969        /* swap and permutate single-/lead-byte tokens */
1970        for(i=0; i<tokenCount && i<256; ++i) {
1971            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1972        }
1973
1974        /* swap and permutate trail-byte tokens */
1975        for(; i<tokenCount; ++i) {
1976            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1977        }
1978
1979        /* copy the result into the output and free the temporary array */
1980        uprv_memcpy(q, temp, tokenCount*2);
1981        uprv_free(temp);
1982
1983        /*
1984         * swap the token strings but not a possible padding byte after
1985         * the terminating NUL of the last string
1986         */
1987        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1988                                    outBytes+tokenStringOffset, pErrorCode);
1989        if(U_FAILURE(*pErrorCode)) {
1990            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1991            return 0;
1992        }
1993
1994        /* swap the group table */
1995        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1996        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1997                           outBytes+groupsOffset, pErrorCode);
1998
1999        /*
2000         * swap the group strings
2001         * swap the string bytes but not the nibble-encoded string lengths
2002         */
2003        if(ds->inCharset!=ds->outCharset) {
2004            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
2005
2006            const uint8_t *inStrings, *nextInStrings;
2007            uint8_t *outStrings;
2008
2009            uint8_t c;
2010
2011            inStrings=inBytes+groupStringOffset;
2012            outStrings=outBytes+groupStringOffset;
2013
2014            stringsCount=algNamesOffset-groupStringOffset;
2015
2016            /* iterate through string groups until only a few padding bytes are left */
2017            while(stringsCount>32) {
2018                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2019
2020                /* move past the length bytes */
2021                stringsCount-=(uint32_t)(nextInStrings-inStrings);
2022                outStrings+=nextInStrings-inStrings;
2023                inStrings=nextInStrings;
2024
2025                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2026                stringsCount-=count;
2027
2028                /* swap the string bytes using map[] and trailMap[] */
2029                while(count>0) {
2030                    c=*inStrings++;
2031                    *outStrings++=map[c];
2032                    if(tokens[c]!=-2) {
2033                        --count;
2034                    } else {
2035                        /* token lead byte: swap the trail byte, too */
2036                        *outStrings++=trailMap[*inStrings++];
2037                        count-=2;
2038                    }
2039                }
2040            }
2041        }
2042
2043        /* swap the algorithmic ranges */
2044        offset=algNamesOffset;
2045        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2046        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2047        offset+=4;
2048
2049        for(i=0; i<count; ++i) {
2050            if(offset>(uint32_t)length) {
2051                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2052                                 length, i);
2053                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2054                return 0;
2055            }
2056
2057            inRange=(const AlgorithmicRange *)(inBytes+offset);
2058            outRange=(AlgorithmicRange *)(outBytes+offset);
2059            offset+=ds->readUInt16(inRange->size);
2060
2061            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2062            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2063            switch(inRange->type) {
2064            case 0:
2065                /* swap prefix string */
2066                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2067                                    outRange+1, pErrorCode);
2068                if(U_FAILURE(*pErrorCode)) {
2069                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2070                                     i);
2071                    return 0;
2072                }
2073                break;
2074            case 1:
2075                {
2076                    /* swap factors and the prefix and factor strings */
2077                    uint32_t factorsCount;
2078
2079                    factorsCount=inRange->variant;
2080                    p=(const uint16_t *)(inRange+1);
2081                    q=(uint16_t *)(outRange+1);
2082                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2083
2084                    /* swap the strings, up to the last terminating NUL */
2085                    p+=factorsCount;
2086                    q+=factorsCount;
2087                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2088                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2089                        --stringsCount;
2090                    }
2091                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2092                }
2093                break;
2094            default:
2095                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2096                                 inRange->type, i);
2097                *pErrorCode=U_UNSUPPORTED_ERROR;
2098                return 0;
2099            }
2100        }
2101    }
2102
2103    return headerSize+(int32_t)offset;
2104}
2105
2106/*
2107 * Hey, Emacs, please set the following:
2108 *
2109 * Local Variables:
2110 * indent-tabs-mode: nil
2111 * End:
2112 *
2113 */
2114