1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6*   Copyright (C) 1999-2014, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9******************************************************************************
10*   file name:  unames.c
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 1999oct04
16*   created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20#include "unicode/putil.h"
21#include "unicode/uchar.h"
22#include "unicode/udata.h"
23#include "unicode/utf.h"
24#include "unicode/utf16.h"
25#include "uassert.h"
26#include "ustr_imp.h"
27#include "umutex.h"
28#include "cmemory.h"
29#include "cstring.h"
30#include "ucln_cmn.h"
31#include "udataswp.h"
32#include "uprops.h"
33
34U_NAMESPACE_BEGIN
35
/* constants, types, and file-level state ----------------------------------- */
37
/*
 * Name and type of the character-names data item.
 * Both are passed to udata_openChoice() in loadCharNames().
 */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";
40
/*
 * Names are stored in groups of LINES_PER_GROUP (1<<5 == 32) consecutive
 * code points.
 * code>>GROUP_SHIFT is the value compared with a group's GROUP_MSB field,
 * and code&GROUP_MASK is the line (name index) inside that group.
 * Note that LINES_PER_GROUP and GROUP_MASK have type long (1L).
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
44
45/*
46 * This struct was replaced by explicitly accessing equivalent
47 * fields from triples of uint16_t.
48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50 * would advance by 6 bytes (3 uint16_t).
51 *
52 * We can't just change the data structure because it's loaded from a data file,
53 * and we don't want to make it less compact, so we changed the access code.
54 *
55 * For details see ICU tickets 6331 and 6008.
56typedef struct {
57    uint16_t groupMSB,
58             offsetHigh, offsetLow; / * avoid padding * /
59} Group;
60 */
enum {
    GROUP_MSB,          /* field 0: code point >> GROUP_SHIFT of the group */
    GROUP_OFFSET_HIGH,  /* field 1: upper 16 bits of the group offset */
    GROUP_OFFSET_LOW,   /* field 2: lower 16 bits of the group offset */
    GROUP_LENGTH        /* number of uint16_t fields per group (3) */
};

/*
 * Get the 32-bit group offset.
 * The two 16-bit halves are combined as an unsigned value and only then
 * converted to int32_t, so that a high half >=0x8000 does not left-shift
 * a signed integer into its sign bit (undefined behavior).
 *
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))

/* Step one Group triple (GROUP_LENGTH uint16_t) forward or backward. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
77
/*
 * Header of one range of algorithmically named code points.
 * The data that belongs to the range follows the struct directly
 * (the code reads it from range+1).
 */
typedef struct {
    /* code point range; getAlgName() passes code-start to writeFactorSuffix() */
    uint32_t start, end;
    /*
     * type 0: name = prefix + hex digits, variant = number of hex digits
     * type 1: name = prefix + factorized elements, variant = number of factors
     */
    uint8_t type, variant;
    /* not read in this part of the file; presumably the byte size of the whole range record - TODO confirm */
    uint16_t size;
} AlgorithmicRange;
83
/*
 * Header of the loaded names data.
 * Each field is a byte offset from the start of this struct:
 * tokenStringOffset -> token strings, groupsOffset -> groups table,
 * groupStringOffset -> group strings.
 * algNamesOffset is not read in this part of the file; presumably it leads
 * to the algorithmic ranges - TODO confirm.
 * The token table is read starting directly behind these 16 bytes.
 */
typedef struct {
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;

/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * also works with pointer expressions and inside larger expressions.
 * Note that the argument is evaluated twice.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
98
/*
 * Context for a name lookup (enumerator function == DO_FIND_NAME).
 */
typedef struct {
    const char *otherName;  /* input: the name to compare against */
    UChar32 code;           /* output: set by enumGroupNames() to the matching code point */
} FindName;
103
/*
 * Passed in place of an enumerator function to request a name lookup:
 * the enumeration code then compares names with FindName.otherName
 * instead of calling a callback.
 */
#define DO_FIND_NAME NULL

/* Loaded data item and pointer to its contents; set once by loadCharNames(). */
static UDataMemory *uCharNamesData=NULL;
static UCharNames *uCharNames=NULL;
/* Guards the one-time call of loadCharNames(), see isDataLoaded(). */
static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;

/*
 * Maximum length of character names (regular & 1.0).
 * Reset to 0 by unames_cleanup().
 */
static int32_t gMaxNameLength=0;

/*
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
static uint32_t gNameSet[8]={ 0 };
120
/*
 * Additional character categories used only for extended names.
 * They continue the numbering after the regular categories, and index the
 * last three entries of charCatNames[].
 * Each expansion is parenthesized so that the macros can be used safely
 * inside larger expressions (e.g. 2*U_LEAD_SURROGATE).
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
126
/*
 * Category names used in extended names like "<control-0009>".
 * Indexed by the value that getCharCat() returns: a u_charType() result,
 * or one of the three extended categories (noncharacter, lead surrogate,
 * trail surrogate) which occupy the last three entries.
 */
static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
    "unassigned",
    "uppercase letter",
    "lowercase letter",
    "titlecase letter",
    "modifier letter",
    "other letter",
    "non spacing mark",
    "enclosing mark",
    "combining spacing mark",
    "decimal digit number",
    "letter number",
    "other number",
    "space separator",
    "line separator",
    "paragraph separator",
    "control",
    "format",
    "private use area",
    "surrogate",
    "dash punctuation",
    "start punctuation",
    "end punctuation",
    "connector punctuation",
    "other punctuation",
    "math symbol",
    "currency symbol",
    "modifier symbol",
    "other symbol",
    "initial punctuation",
    "final punctuation",
    /* extended categories, see U_NONCHARACTER_CODE_POINT etc. */
    "noncharacter",
    "lead surrogate",
    "trail surrogate"
};
162
163/* implementation ----------------------------------------------------------- */
164
165static UBool U_CALLCONV unames_cleanup(void)
166{
167    if(uCharNamesData) {
168        udata_close(uCharNamesData);
169        uCharNamesData = NULL;
170    }
171    if(uCharNames) {
172        uCharNames = NULL;
173    }
174    gCharNamesInitOnce.reset();
175    gMaxNameLength=0;
176    return TRUE;
177}
178
179static UBool U_CALLCONV
180isAcceptable(void * /*context*/,
181             const char * /*type*/, const char * /*name*/,
182             const UDataInfo *pInfo) {
183    return (UBool)(
184        pInfo->size>=20 &&
185        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
186        pInfo->charsetFamily==U_CHARSET_FAMILY &&
187        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
188        pInfo->dataFormat[1]==0x6e &&
189        pInfo->dataFormat[2]==0x61 &&
190        pInfo->dataFormat[3]==0x6d &&
191        pInfo->formatVersion[0]==1);
192}
193
194static void U_CALLCONV
195loadCharNames(UErrorCode &status) {
196    U_ASSERT(uCharNamesData == NULL);
197    U_ASSERT(uCharNames == NULL);
198
199    uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
200    if(U_FAILURE(status)) {
201        uCharNamesData = NULL;
202    } else {
203        uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
204    }
205    ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
206}
207
208
209static UBool
210isDataLoaded(UErrorCode *pErrorCode) {
211    umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
212    return U_SUCCESS(*pErrorCode);
213}
214
/*
 * Append one char to an output buffer with preflighting:
 * if there is room left (bufferLength>0), c is stored, the buffer pointer
 * is advanced and bufferLength is decremented;
 * bufferPos is incremented in every case, so that it ends up as the length
 * that the complete output needs.
 *
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 * c is evaluated only when there is room left, so it must not have side
 * effects that the caller relies on (for example *p++).
 * The expansion is a plain { } block, not a do { } while(0) statement.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
    if((bufferLength)>0) { \
        *(buffer)++=c; \
        --(bufferLength); \
    } \
    ++(bufferPos); \
}
222
/*
 * Internal name choice value just past the public UCharNameChoice values.
 * expandName() and compareName() map it to the semicolon-separated field
 * with index 2 of a stored name line.
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224
/*
 * expandName() expands one stored (tokenized) name line into buffer.
 *
 * Important: expandName() and compareName() are almost the same -
 * apply fixes to both.
 *
 * UnicodeData.txt uses ';' as a field separator, so no
 * field can contain ';' as part of its contents.
 * In unames.dat, it is marked as token[';']==-1 only if the
 * semicolon is used in the data file - which is iff we
 * have Unicode 1.0 names or ISO comments or aliases.
 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
 * although we know that it will never be part of a name.
 *
 * @param names        the loaded names data (token table and token strings)
 * @param name         pointer to the stored bytes of the name line
 * @param nameLength   number of stored bytes
 * @param nameChoice   which semicolon-separated field of the line to expand
 * @param buffer       output buffer; zero-terminated if there is room
 * @param bufferLength capacity of buffer
 * @return the length of the expanded name, counted even for chars that
 *         did not fit into the buffer
 */
static uint16_t
expandName(UCharNames *names,
           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
           char *buffer, uint16_t bufferLength) {
    /* the token table follows the 16-byte header: a count, then the tokens */
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++, bufferPos=0;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            /* number of fields (semicolons) to skip before the requested one */
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
            do {
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            } while(--fieldIndex>0);
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested alternate name here
             */
            nameLength=0;
        }
    }

    /* write each letter directly, and write a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            /* byte values beyond the token table stand for themselves */
            if(c!=';') {
                /* implicit letter */
                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                /* NOTE(review): the trail byte is read without checking nameLength>0; relies on well-formed data */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                if(c!=';') {
                    /* explicit letter */
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* write token word */
                /* token is the offset of a zero-terminated string in tokenStrings */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                }
            }
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}
324
/*
 * compareName() is almost the same as expandName() except that it compares
 * the currently expanded name to an input name.
 * It returns the match/no match result as soon as possible.
 * Fixes must be applied to both functions.
 *
 * @param names      the loaded names data (token table and token strings)
 * @param name       pointer to the stored bytes of the name line
 * @param nameLength number of stored bytes
 * @param nameChoice which semicolon-separated field of the line to compare
 * @param otherName  zero-terminated name to compare with
 * @return TRUE if the expanded name equals otherName completely
 */
static UBool
compareName(UCharNames *names,
            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
            const char *otherName) {
    /* the token table follows the 16-byte header: a count, then the tokens */
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;
    /* remembered to detect that nothing has been matched yet */
    const char *origOtherName = otherName;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            /* number of fields (semicolons) to skip before the requested one */
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
            do {
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            } while(--fieldIndex>0);
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested alternate name here
             */
            nameLength=0;
        }
    }

    /* compare each letter directly, and compare a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            /* byte values beyond the token table stand for themselves */
            if(c!=';') {
                /* implicit letter */
                if((char)c!=*otherName++) {
                    return FALSE;
                }
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                /* NOTE(review): the trail byte is read without checking nameLength>0; relies on well-formed data */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                if(c!=';') {
                    /* explicit letter */
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* compare token word */
                /* a mismatch returns before the terminating NUL of otherName can be passed */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                }
            }
        }
    }

    /* complete match? */
    return (UBool)(*otherName==0);
}
420
421static uint8_t getCharCat(UChar32 cp) {
422    uint8_t cat;
423
424    if (U_IS_UNICODE_NONCHAR(cp)) {
425        return U_NONCHARACTER_CODE_POINT;
426    }
427
428    if ((cat = u_charType(cp)) == U_SURROGATE) {
429        cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
430    }
431
432    return cat;
433}
434
435static const char *getCharCatName(UChar32 cp) {
436    uint8_t cat = getCharCat(cp);
437
438    /* Return unknown if the table of names above is not up to
439       date. */
440
441    if (cat >= UPRV_LENGTHOF(charCatNames)) {
442        return "unknown";
443    } else {
444        return charCatNames[cat];
445    }
446}
447
448static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449    const char *catname = getCharCatName(code);
450    uint16_t length = 0;
451
452    UChar32 cp;
453    int ndigits, i;
454
455    WRITE_CHAR(buffer, bufferLength, length, '<');
456    while (catname[length - 1]) {
457        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
458    }
459    WRITE_CHAR(buffer, bufferLength, length, '-');
460    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
461        ;
462    if (ndigits < 4)
463        ndigits = 4;
464    for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465        uint8_t v = (uint8_t)(cp & 0xf);
466        buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
467    }
468    buffer += ndigits;
469    length += ndigits;
470    WRITE_CHAR(buffer, bufferLength, length, '>');
471
472    return length;
473}
474
475/*
476 * getGroup() does a binary search for the group that contains the
477 * Unicode code point "code".
478 * The return value is always a valid Group* that may contain "code"
479 * or else is the highest group before "code".
480 * If the lowest group is after "code", then that one is returned.
481 */
482static const uint16_t *
483getGroup(UCharNames *names, uint32_t code) {
484    const uint16_t *groups=GET_GROUPS(names);
485    uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
486             start=0,
487             limit=*groups++,
488             number;
489
490    /* binary search for the group of names that contains the one for code */
491    while(start<limit-1) {
492        number=(uint16_t)((start+limit)/2);
493        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
494            limit=number;
495        } else {
496            start=number;
497        }
498    }
499
500    /* return this regardless of whether it is an exact match */
501    return groups+start*GROUP_LENGTH;
502}
503
/*
 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
 * expands them into offsets and lengths for each string.
 * Lengths are stored with a variable-width encoding in consecutive nibbles:
 * If a nibble<0xc, then it is the length itself (0=empty string).
 * If a nibble>=0xc, then it forms a length value with the following nibble.
 * Calculation see below.
 * The offsets and lengths arrays must be at least 33 (one more) long because
 * there is no check here at the end if the last nibble is still used.
 *
 * @param s       pointer to the compressed lengths at the start of a group
 * @param offsets output: offset of each string from the returned pointer
 * @param lengths output: length of each string
 * @return pointer to the first group string, just behind the lengths
 */
static const uint8_t *
expandGroupLengths(const uint8_t *s,
                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
    /* read the lengths of the 32 strings in this group and get each string's offset */
    /* length>=12 between iterations means: a double-nibble length started in the previous byte */
    uint16_t i=0, offset=0, length=0;
    uint8_t lengthByte;

    /* all 32 lengths must be read to get the offset of the first group string */
    while(i<LINES_PER_GROUP) {
        lengthByte=*s++;

        /* read even nibble - MSBs of lengthByte */
        if(length>=12) {
            /* double-nibble length spread across two bytes */
            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
            lengthByte&=0xf;
        } else if((lengthByte /* &0xf0 */)>=0xc0) {
            /* double-nibble length spread across this one byte */
            /* lengthByte keeps its upper bits, which marks the low nibble as consumed below */
            length=(uint16_t)((lengthByte&0x3f)+12);
        } else {
            /* single-nibble length in MSBs */
            length=(uint16_t)(lengthByte>>4);
            lengthByte&=0xf;
        }

        *offsets++=offset;
        *lengths++=length;

        offset+=length;
        ++i;

        /* read odd nibble - LSBs of lengthByte */
        if((lengthByte&0xf0)==0) {
            /* this nibble was not consumed for a double-nibble length above */
            length=lengthByte;
            if(length<12) {
                /* single-nibble length in LSBs */
                /* this can store a 33rd entry when i was already 32 */
                *offsets++=offset;
                *lengths++=length;

                offset+=length;
                ++i;
            }
            /* else: length>=12 is completed with the first nibble of the next byte */
        } else {
            length=0;   /* prevent double-nibble detection in the next iteration */
        }
    }

    /* now, s is at the first group string */
    return s;
}
565
566static uint16_t
567expandGroupName(UCharNames *names, const uint16_t *group,
568                uint16_t lineNumber, UCharNameChoice nameChoice,
569                char *buffer, uint16_t bufferLength) {
570    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
572    s=expandGroupLengths(s, offsets, lengths);
573    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574                      buffer, bufferLength);
575}
576
577static uint16_t
578getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579        char *buffer, uint16_t bufferLength) {
580    const uint16_t *group=getGroup(names, code);
581    if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
582        return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
583                               buffer, bufferLength);
584    } else {
585        /* group not found */
586        /* zero-terminate */
587        if(bufferLength>0) {
588            *buffer=0;
589        }
590        return 0;
591    }
592}
593
594/*
595 * enumGroupNames() enumerates all the names in a 32-group
596 * and either calls the enumerator function or finds a given input name.
597 */
598static UBool
599enumGroupNames(UCharNames *names, const uint16_t *group,
600               UChar32 start, UChar32 end,
601               UEnumCharNamesFn *fn, void *context,
602               UCharNameChoice nameChoice) {
603    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
605
606    s=expandGroupLengths(s, offsets, lengths);
607    if(fn!=DO_FIND_NAME) {
608        char buffer[200];
609        uint16_t length;
610
611        while(start<=end) {
612            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
615            }
616            /* here, we assume that the buffer is large enough */
617            if(length>0) {
618                if(!fn(context, start, nameChoice, buffer, length)) {
619                    return FALSE;
620                }
621            }
622            ++start;
623        }
624    } else {
625        const char *otherName=((FindName *)context)->otherName;
626        while(start<=end) {
627            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628                ((FindName *)context)->code=start;
629                return FALSE;
630            }
631            ++start;
632        }
633    }
634    return TRUE;
635}
636
637/*
638 * enumExtNames enumerate extended names.
639 * It only needs to do it if it is called with a real function and not
640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641 * for extended names by itself.
642 */
643static UBool
644enumExtNames(UChar32 start, UChar32 end,
645             UEnumCharNamesFn *fn, void *context)
646{
647    if(fn!=DO_FIND_NAME) {
648        char buffer[200];
649        uint16_t length;
650
651        while(start<=end) {
652            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653            /* here, we assume that the buffer is large enough */
654            if(length>0) {
655                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
656                    return FALSE;
657                }
658            }
659            ++start;
660        }
661    }
662
663    return TRUE;
664}
665
/*
 * enumNames() enumerates the names of the code points start..limit-1
 * from the groups table, or looks up a name if fn==DO_FIND_NAME.
 * For U_EXTENDED_CHAR_NAME, ranges of code points that are not covered by
 * any group are enumerated with their synthetic extended names.
 *
 * @param names      the loaded names data
 * @param start      first code point
 * @param limit      code point after the last one (exclusive end)
 * @param fn         enumerator function, or DO_FIND_NAME
 * @param context    passed to fn, or a FindName * for DO_FIND_NAME
 * @param nameChoice which kind of name to enumerate
 * @return FALSE if the enumeration was stopped (fn returned FALSE, or the
 *         name was found), otherwise TRUE
 */
static UBool
enumNames(UCharNames *names,
          UChar32 start, UChar32 limit,
          UEnumCharNamesFn *fn, void *context,
          UCharNameChoice nameChoice) {
    uint16_t startGroupMSB, endGroupMSB, groupCount;
    const uint16_t *group, *groupLimit;

    /* group numbers of the first and the last code point of the range */
    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);

    /* find the group that contains start, or the highest before it */
    group=getGroup(names, start);

    if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
        /* enumerate synthetic names between start and the group start */
        UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
        if(extLimit>limit) {
            extLimit=limit;
        }
        if(!enumExtNames(start, extLimit-1, fn, context)) {
            return FALSE;
        }
        start=extLimit;
    }

    if(startGroupMSB==endGroupMSB) {
        if(startGroupMSB==group[GROUP_MSB]) {
            /* if start and limit-1 are in the same group, then enumerate only in that one */
            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
        }
        /* otherwise fall through to the extended names at the end of the function */
    } else {
        const uint16_t *groups=GET_GROUPS(names);
        groupCount=*groups++;
        /* groupLimit points just behind the last group */
        groupLimit=groups+groupCount*GROUP_LENGTH;

        if(startGroupMSB==group[GROUP_MSB]) {
            /* enumerate characters in the partial start group */
            /* a start at a group boundary is handled by the whole-group loop below */
            if((start&GROUP_MASK)!=0) {
                if(!enumGroupNames(names, group,
                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
                                   fn, context, nameChoice)) {
                    return FALSE;
                }
                group=NEXT_GROUP(group); /* continue with the next group */
            }
        } else if(startGroupMSB>group[GROUP_MSB]) {
            /* make sure that we start enumerating with the first group after start */
            const uint16_t *nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* synthetic names for the gap from start up to the next group */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames(start, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate entire groups between the start- and end-groups */
        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
            const uint16_t *nextGroup;
            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
                return FALSE;
            }
            nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* synthetic names for the gap between this group and the next one */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
            /* behind the last group: continue after it with extended names below */
            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
            if (next > start) {
                start = next;
            }
        } else {
            return TRUE;
        }
    }

    /* we have not found a group, which means everything is made of
       extended names. */
    if (nameChoice == U_EXTENDED_CHAR_NAME) {
        /* do not go beyond the last Unicode code point */
        if (limit > UCHAR_MAX_VALUE + 1) {
            limit = UCHAR_MAX_VALUE + 1;
        }
        return enumExtNames(start, limit - 1, fn, context);
    }

    return TRUE;
}
771
772static uint16_t
773writeFactorSuffix(const uint16_t *factors, uint16_t count,
774                  const char *s, /* suffix elements */
775                  uint32_t code,
776                  uint16_t indexes[8], /* output fields from here */
777                  const char *elementBases[8], const char *elements[8],
778                  char *buffer, uint16_t bufferLength) {
779    uint16_t i, factor, bufferPos=0;
780    char c;
781
782    /* write elements according to the factors */
783
784    /*
785     * the factorized elements are determined by modulo arithmetic
786     * with the factors of this algorithm
787     *
788     * note that for fewer operations, count is decremented here
789     */
790    --count;
791    for(i=count; i>0; --i) {
792        factor=factors[i];
793        indexes[i]=(uint16_t)(code%factor);
794        code/=factor;
795    }
796    /*
797     * we don't need to calculate the last modulus because start<=code<=end
798     * guarantees here that code<=factors[0]
799     */
800    indexes[0]=(uint16_t)code;
801
802    /* write each element */
803    for(;;) {
804        if(elementBases!=NULL) {
805            *elementBases++=s;
806        }
807
808        /* skip indexes[i] strings */
809        factor=indexes[i];
810        while(factor>0) {
811            while(*s++!=0) {}
812            --factor;
813        }
814        if(elements!=NULL) {
815            *elements++=s;
816        }
817
818        /* write element */
819        while((c=*s++)!=0) {
820            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
821        }
822
823        /* we do not need to perform the rest of this loop for i==count - break here */
824        if(i>=count) {
825            break;
826        }
827
828        /* skip the rest of the strings for this factors[i] */
829        factor=(uint16_t)(factors[i]-indexes[i]-1);
830        while(factor>0) {
831            while(*s++!=0) {}
832            --factor;
833        }
834
835        ++i;
836    }
837
838    /* zero-terminate */
839    if(bufferLength>0) {
840        *buffer=0;
841    }
842
843    return bufferPos;
844}
845
846/*
847 * Important:
848 * Parts of findAlgName() are almost the same as some of getAlgName().
849 * Fixes must be applied to both.
850 */
851static uint16_t
852getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
853        char *buffer, uint16_t bufferLength) {
854    uint16_t bufferPos=0;
855
856    /* Only the normative character name can be algorithmic. */
857    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
858        /* zero-terminate */
859        if(bufferLength>0) {
860            *buffer=0;
861        }
862        return 0;
863    }
864
865    switch(range->type) {
866    case 0: {
867        /* name = prefix hex-digits */
868        const char *s=(const char *)(range+1);
869        char c;
870
871        uint16_t i, count;
872
873        /* copy prefix */
874        while((c=*s++)!=0) {
875            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
876        }
877
878        /* write hexadecimal code point value */
879        count=range->variant;
880
881        /* zero-terminate */
882        if(count<bufferLength) {
883            buffer[count]=0;
884        }
885
886        for(i=count; i>0;) {
887            if(--i<bufferLength) {
888                c=(char)(code&0xf);
889                if(c<10) {
890                    c+='0';
891                } else {
892                    c+='A'-10;
893                }
894                buffer[i]=c;
895            }
896            code>>=4;
897        }
898
899        bufferPos+=count;
900        break;
901    }
902    case 1: {
903        /* name = prefix factorized-elements */
904        uint16_t indexes[8];
905        const uint16_t *factors=(const uint16_t *)(range+1);
906        uint16_t count=range->variant;
907        const char *s=(const char *)(factors+count);
908        char c;
909
910        /* copy prefix */
911        while((c=*s++)!=0) {
912            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
913        }
914
915        bufferPos+=writeFactorSuffix(factors, count,
916                                     s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
917        break;
918    }
919    default:
920        /* undefined type */
921        /* zero-terminate */
922        if(bufferLength>0) {
923            *buffer=0;
924        }
925        break;
926    }
927
928    return bufferPos;
929}
930
931/*
932 * Important: enumAlgNames() and findAlgName() are almost the same.
933 * Any fix must be applied to both.
934 */
/*
 * Enumerates algorithmic names: calls fn once for start and once for each
 * following code point below limit, passing the generated name.
 *
 * range      the algorithmic range whose type selects the naming scheme
 * start      first code point to enumerate
 * limit      one past the last code point to enumerate
 * fn         callback; enumeration stops as soon as it returns false
 * context    passed through to fn unchanged
 * nameChoice only U_UNICODE_CHAR_NAME and U_EXTENDED_CHAR_NAME produce names
 *
 * Returns FALSE if fn asked to stop, TRUE otherwise (including when nothing
 * was enumerated).
 * NOTE(review): start/limit are not checked against range->start/range->end
 * here; presumably the caller guarantees they lie inside the range.
 */
static UBool
enumAlgNames(AlgorithmicRange *range,
             UChar32 start, UChar32 limit,
             UEnumCharNamesFn *fn, void *context,
             UCharNameChoice nameChoice) {
    char buffer[200];
    uint16_t length;

    /* other name choices have no algorithmic names: nothing to enumerate */
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        return TRUE;
    }

    switch(range->type) {
    case 0: {
        /* name = prefix hex-digits */
        char *s, *end;
        char c;

        /* get the full name of the start character */
        length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
        /* length is unsigned, so this only catches length==0 */
        if(length<=0) {
            return TRUE;
        }

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* go to the end of the name; all these names have the same length */
        end=buffer;
        while(*end!=0) {
            ++end;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /*
             * increment the hexadecimal number on a character-basis:
             * walk backwards from the last digit, turning each 'F' into '0'
             * (carry) until a digit that can be bumped is found
             */
            s=end;
            for (;;) {
                c=*--s;
                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
                    *s=(char)(c+1);
                    break;
                } else if(c=='9') {
                    *s='A';
                    break;
                } else if(c=='F') {
                    /* carry into the next more significant digit */
                    *s='0';
                }
            }

            /* the name length does not change, so length is reused */
            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    case 1: {
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        /* the factors follow the range header, then the prefix string */
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t count=range->variant;
        const char *s=(const char *)(factors+count);
        char *suffix, *t;
        uint16_t prefixLength, i, idx;

        char c;

        /* name = prefix factorized-elements */

        /* copy prefix; afterwards s points just past the prefix's terminator */
        suffix=buffer;
        prefixLength=0;
        while((c=*s++)!=0) {
            *suffix++=c;
            ++prefixLength;
        }

        /* append the suffix of the start character */
        /* this call also fills indexes, elementBases and elements for the loop below */
        length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
                                              s, (uint32_t)start-range->start,
                                              indexes, elementBases, elements,
                                              suffix, (uint16_t)(sizeof(buffer)-prefixLength)));

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /* increment the indexes in lexical order bound by the factors */
            /* works like an odometer, starting at the last (fastest) index */
            i=count;
            for (;;) {
                idx=(uint16_t)(indexes[--i]+1);
                if(idx<factors[i]) {
                    /* skip one index and its element string */
                    indexes[i]=idx;
                    s=elements[i];
                    while(*s++!=0) {
                    }
                    elements[i]=s;
                    break;
                } else {
                    /* reset this index to 0 and its element string to the first one */
                    indexes[i]=0;
                    elements[i]=elementBases[i];
                }
            }

            /* to make matters a little easier, just append all elements to the suffix */
            t=suffix;
            length=prefixLength;
            for(i=0; i<count; ++i) {
                s=elements[i];
                while((c=*s++)!=0) {
                    *t++=c;
                    ++length;
                }
            }
            /* zero-terminate */
            *t=0;

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return TRUE;
}
1071
1072/*
1073 * findAlgName() is almost the same as enumAlgNames() except that it
1074 * returns the code point for a name if it fits into the range.
1075 * It returns 0xffff otherwise.
1076 */
/*
 * range      the algorithmic range to test the name against
 * nameChoice only U_UNICODE_CHAR_NAME and U_EXTENDED_CHAR_NAME can match
 * otherName  NUL-terminated candidate name; it is compared byte for byte,
 *            so the caller has to pass it in the same case as the data
 */
static UChar32
findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
    UChar32 code;

    /* other name choices have no algorithmic names */
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        return 0xffff;
    }

    switch(range->type) {
    case 0: {
        /* name = prefix hex-digits */
        const char *s=(const char *)(range+1);
        char c;

        uint16_t i, count;

        /* compare prefix */
        while((c=*s++)!=0) {
            if((char)c!=*otherName++) {
                return 0xffff;
            }
        }

        /* read hexadecimal code point value */
        /* exactly range->variant uppercase hex digits are required */
        count=range->variant;
        code=0;
        for(i=0; i<count; ++i) {
            c=*otherName++;
            if('0'<=c && c<='9') {
                code=(code<<4)|(c-'0');
            } else if('A'<=c && c<='F') {
                code=(code<<4)|(c-'A'+10);
            } else {
                return 0xffff;
            }
        }

        /* does it fit into the range? */
        /* the name must also end right after the digits */
        if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
            return code;
        }
        break;
    }
    case 1: {
        char buffer[64];
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        /* the factors follow the range header, then the prefix string */
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t count=range->variant;
        const char *s=(const char *)(factors+count), *t;
        UChar32 start, limit;
        uint16_t i, idx;

        char c;

        /* name = prefix factorized-elements */

        /* compare prefix; afterwards otherName points at the candidate's suffix */
        while((c=*s++)!=0) {
            if((char)c!=*otherName++) {
                return 0xffff;
            }
        }

        start=(UChar32)range->start;
        limit=(UChar32)(range->end+1);

        /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
        writeFactorSuffix(factors, count, s, 0,
                          indexes, elementBases, elements, buffer, sizeof(buffer));

        /* compare the first suffix */
        if(0==uprv_strcmp(otherName, buffer)) {
            return start;
        }

        /* enumerate and compare the rest of the suffixes */
        while(++start<limit) {
            /* increment the indexes in lexical order bound by the factors */
            /* works like an odometer, starting at the last (fastest) index */
            i=count;
            for (;;) {
                idx=(uint16_t)(indexes[--i]+1);
                if(idx<factors[i]) {
                    /* skip one index and its element string */
                    indexes[i]=idx;
                    s=elements[i];
                    while(*s++!=0) {}
                    elements[i]=s;
                    break;
                } else {
                    /* reset this index to 0 and its element string to the first one */
                    indexes[i]=0;
                    elements[i]=elementBases[i];
                }
            }

            /* to make matters a little easier, just compare all elements of the suffix */
            t=otherName;
            for(i=0; i<count; ++i) {
                s=elements[i];
                while((c=*s++)!=0) {
                    if(c!=*t++) {
                        /*
                         * mismatch: pointing s at an empty string ends the
                         * inner loop on its next read, and i=99 both ends the
                         * outer loop and flags the failure for the test below
                         */
                        s=""; /* does not match */
                        i=99;
                    }
                }
            }
            /* matched every element and the candidate has no trailing characters */
            if(i<99 && *t==0) {
                return start;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return 0xffff;
}
1197
1198/* sets of name characters, maximum name lengths ---------------------------- */
1199
/*
 * A character set here is a 256-bit bitmap held in uint32_t[8]:
 * the top 3 bits of the byte value select the word, the low 5 bits the bit.
 * The argument c is parenthesized so that an expression such as x+1 is
 * converted to uint8_t as a whole before it is shifted or masked.
 * Note that c is evaluated twice; do not pass expressions with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202
1203static int32_t
1204calcStringSetLength(uint32_t set[8], const char *s) {
1205    int32_t length=0;
1206    char c;
1207
1208    while((c=*s++)!=0) {
1209        SET_ADD(set, c);
1210        ++length;
1211    }
1212    return length;
1213}
1214
1215static int32_t
1216calcAlgNameSetsLengths(int32_t maxNameLength) {
1217    AlgorithmicRange *range;
1218    uint32_t *p;
1219    uint32_t rangeCount;
1220    int32_t length;
1221
1222    /* enumerate algorithmic ranges */
1223    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1224    rangeCount=*p;
1225    range=(AlgorithmicRange *)(p+1);
1226    while(rangeCount>0) {
1227        switch(range->type) {
1228        case 0:
1229            /* name = prefix + (range->variant times) hex-digits */
1230            /* prefix */
1231            length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1232            if(length>maxNameLength) {
1233                maxNameLength=length;
1234            }
1235            break;
1236        case 1: {
1237            /* name = prefix factorized-elements */
1238            const uint16_t *factors=(const uint16_t *)(range+1);
1239            const char *s;
1240            int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1241
1242            /* prefix length */
1243            s=(const char *)(factors+count);
1244            length=calcStringSetLength(gNameSet, s);
1245            s+=length+1; /* start of factor suffixes */
1246
1247            /* get the set and maximum factor suffix length for each factor */
1248            for(i=0; i<count; ++i) {
1249                maxFactorLength=0;
1250                for(factor=factors[i]; factor>0; --factor) {
1251                    factorLength=calcStringSetLength(gNameSet, s);
1252                    s+=factorLength+1;
1253                    if(factorLength>maxFactorLength) {
1254                        maxFactorLength=factorLength;
1255                    }
1256                }
1257                length+=maxFactorLength;
1258            }
1259
1260            if(length>maxNameLength) {
1261                maxNameLength=length;
1262            }
1263            break;
1264        }
1265        default:
1266            /* unknown type */
1267            break;
1268        }
1269
1270        range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1271        --rangeCount;
1272    }
1273    return maxNameLength;
1274}
1275
1276static int32_t
1277calcExtNameSetsLengths(int32_t maxNameLength) {
1278    int32_t i, length;
1279
1280    for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
1281        /*
1282         * for each category, count the length of the category name
1283         * plus 9=
1284         * 2 for <>
1285         * 1 for -
1286         * 6 for most hex digits per code point
1287         */
1288        length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1289        if(length>maxNameLength) {
1290            maxNameLength=length;
1291        }
1292    }
1293    return maxNameLength;
1294}
1295
1296static int32_t
1297calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1298                  uint32_t set[8],
1299                  const uint8_t **pLine, const uint8_t *lineLimit) {
1300    const uint8_t *line=*pLine;
1301    int32_t length=0, tokenLength;
1302    uint16_t c, token;
1303
1304    while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1305        if(c>=tokenCount) {
1306            /* implicit letter */
1307            SET_ADD(set, c);
1308            ++length;
1309        } else {
1310            token=tokens[c];
1311            if(token==(uint16_t)(-2)) {
1312                /* this is a lead byte for a double-byte token */
1313                c=c<<8|*line++;
1314                token=tokens[c];
1315            }
1316            if(token==(uint16_t)(-1)) {
1317                /* explicit letter */
1318                SET_ADD(set, c);
1319                ++length;
1320            } else {
1321                /* count token word */
1322                if(tokenLengths!=NULL) {
1323                    /* use cached token length */
1324                    tokenLength=tokenLengths[c];
1325                    if(tokenLength==0) {
1326                        tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1327                        tokenLengths[c]=(int8_t)tokenLength;
1328                    }
1329                } else {
1330                    tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1331                }
1332                length+=tokenLength;
1333            }
1334        }
1335    }
1336
1337    *pLine=line;
1338    return length;
1339}
1340
1341static void
1342calcGroupNameSetsLengths(int32_t maxNameLength) {
1343    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1344
1345    uint16_t *tokens=(uint16_t *)uCharNames+8;
1346    uint16_t tokenCount=*tokens++;
1347    uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1348
1349    int8_t *tokenLengths;
1350
1351    const uint16_t *group;
1352    const uint8_t *s, *line, *lineLimit;
1353
1354    int32_t groupCount, lineNumber, length;
1355
1356    tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1357    if(tokenLengths!=NULL) {
1358        uprv_memset(tokenLengths, 0, tokenCount);
1359    }
1360
1361    group=GET_GROUPS(uCharNames);
1362    groupCount=*group++;
1363
1364    /* enumerate all groups */
1365    while(groupCount>0) {
1366        s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1367        s=expandGroupLengths(s, offsets, lengths);
1368
1369        /* enumerate all lines in each group */
1370        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371            line=s+offsets[lineNumber];
1372            length=lengths[lineNumber];
1373            if(length==0) {
1374                continue;
1375            }
1376
1377            lineLimit=line+length;
1378
1379            /* read regular name */
1380            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381            if(length>maxNameLength) {
1382                maxNameLength=length;
1383            }
1384            if(line==lineLimit) {
1385                continue;
1386            }
1387
1388            /* read Unicode 1.0 name */
1389            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390            if(length>maxNameLength) {
1391                maxNameLength=length;
1392            }
1393            if(line==lineLimit) {
1394                continue;
1395            }
1396
1397            /* read ISO comment */
1398            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399        }
1400
1401        group=NEXT_GROUP(group);
1402        --groupCount;
1403    }
1404
1405    if(tokenLengths!=NULL) {
1406        uprv_free(tokenLengths);
1407    }
1408
1409    /* set gMax... - name length last for threading */
1410    gMaxNameLength=maxNameLength;
1411}
1412
1413static UBool
1414calcNameSetsLengths(UErrorCode *pErrorCode) {
1415    static const char extChars[]="0123456789ABCDEF<>-";
1416    int32_t i, maxNameLength;
1417
1418    if(gMaxNameLength!=0) {
1419        return TRUE;
1420    }
1421
1422    if(!isDataLoaded(pErrorCode)) {
1423        return FALSE;
1424    }
1425
1426    /* set hex digits, used in various names, and <>-, used in extended names */
1427    for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1428        SET_ADD(gNameSet, extChars[i]);
1429    }
1430
1431    /* set sets and lengths from algorithmic names */
1432    maxNameLength=calcAlgNameSetsLengths(0);
1433
1434    /* set sets and lengths from extended names */
1435    maxNameLength=calcExtNameSetsLengths(maxNameLength);
1436
1437    /* set sets and lengths from group names, set global maximum values */
1438    calcGroupNameSetsLengths(maxNameLength);
1439
1440    return TRUE;
1441}
1442
1443U_NAMESPACE_END
1444
1445/* public API --------------------------------------------------------------- */
1446
1447U_NAMESPACE_USE
1448
1449U_CAPI int32_t U_EXPORT2
1450u_charName(UChar32 code, UCharNameChoice nameChoice,
1451           char *buffer, int32_t bufferLength,
1452           UErrorCode *pErrorCode) {
1453     AlgorithmicRange *algRange;
1454    uint32_t *p;
1455    uint32_t i;
1456    int32_t length;
1457
1458    /* check the argument values */
1459    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1460        return 0;
1461    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1462              bufferLength<0 || (bufferLength>0 && buffer==NULL)
1463    ) {
1464        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1465        return 0;
1466    }
1467
1468    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1469        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1470    }
1471
1472    length=0;
1473
1474    /* try algorithmic names first */
1475    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1476    i=*p;
1477    algRange=(AlgorithmicRange *)(p+1);
1478    while(i>0) {
1479        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1480            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1481            break;
1482        }
1483        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1484        --i;
1485    }
1486
1487    if(i==0) {
1488        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1489            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1490            if (!length) {
1491                /* extended character name */
1492                length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1493            }
1494        } else {
1495            /* normal character name */
1496            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1497        }
1498    }
1499
1500    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1501}
1502
1503U_CAPI int32_t U_EXPORT2
1504u_getISOComment(UChar32 /*c*/,
1505                char *dest, int32_t destCapacity,
1506                UErrorCode *pErrorCode) {
1507    /* check the argument values */
1508    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1509        return 0;
1510    } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1511        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1512        return 0;
1513    }
1514
1515    return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1516}
1517
1518U_CAPI UChar32 U_EXPORT2
1519u_charFromName(UCharNameChoice nameChoice,
1520               const char *name,
1521               UErrorCode *pErrorCode) {
1522    char upper[120], lower[120];
1523    FindName findName;
1524    AlgorithmicRange *algRange;
1525    uint32_t *p;
1526    uint32_t i;
1527    UChar32 cp = 0;
1528    char c0;
1529    UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1530
1531    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1532        return error;
1533    }
1534
1535    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1536        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1537        return error;
1538    }
1539
1540    if(!isDataLoaded(pErrorCode)) {
1541        return error;
1542    }
1543
1544    /* construct the uppercase and lowercase of the name first */
1545    for(i=0; i<sizeof(upper); ++i) {
1546        if((c0=*name++)!=0) {
1547            upper[i]=uprv_toupper(c0);
1548            lower[i]=uprv_tolower(c0);
1549        } else {
1550            upper[i]=lower[i]=0;
1551            break;
1552        }
1553    }
1554    if(i==sizeof(upper)) {
1555        /* name too long, there is no such character */
1556        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1557        return error;
1558    }
1559    // i==strlen(name)==strlen(lower)==strlen(upper)
1560
1561    /* try extended names first */
1562    if (lower[0] == '<') {
1563        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1564            // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
1565            if (lower[--i] == '>' && i >= 3 && lower[--i] != '-') {
1566                while (i >= 3 && lower[--i] != '-') {}
1567
1568                if (i >= 2 && lower[i] == '-') {
1569                    uint32_t cIdx;
1570
1571                    lower[i] = 0;
1572
1573                    for (++i; lower[i] != '>'; ++i) {
1574                        if (lower[i] >= '0' && lower[i] <= '9') {
1575                            cp = (cp << 4) + lower[i] - '0';
1576                        } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1577                            cp = (cp << 4) + lower[i] - 'a' + 10;
1578                        } else {
1579                            *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1580                            return error;
1581                        }
1582                    }
1583
1584                    /* Now validate the category name.
1585                       We could use a binary search, or a trie, if
1586                       we really wanted to. */
1587
1588                    for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
1589
1590                        if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1591                            if (getCharCat(cp) == cIdx) {
1592                                return cp;
1593                            }
1594                            break;
1595                        }
1596                    }
1597                }
1598            }
1599        }
1600
1601        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1602        return error;
1603    }
1604
1605    /* try algorithmic names now */
1606    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1607    i=*p;
1608    algRange=(AlgorithmicRange *)(p+1);
1609    while(i>0) {
1610        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1611            return cp;
1612        }
1613        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1614        --i;
1615    }
1616
1617    /* normal character name */
1618    findName.otherName=upper;
1619    findName.code=error;
1620    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1621    if (findName.code == error) {
1622         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1623    }
1624    return findName.code;
1625}
1626
1627U_CAPI void U_EXPORT2
1628u_enumCharNames(UChar32 start, UChar32 limit,
1629                UEnumCharNamesFn *fn,
1630                void *context,
1631                UCharNameChoice nameChoice,
1632                UErrorCode *pErrorCode) {
1633    AlgorithmicRange *algRange;
1634    uint32_t *p;
1635    uint32_t i;
1636
1637    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1638        return;
1639    }
1640
1641    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1642        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1643        return;
1644    }
1645
1646    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1647        limit = UCHAR_MAX_VALUE + 1;
1648    }
1649    if((uint32_t)start>=(uint32_t)limit) {
1650        return;
1651    }
1652
1653    if(!isDataLoaded(pErrorCode)) {
1654        return;
1655    }
1656
1657    /* interleave the data-driven ones with the algorithmic ones */
1658    /* iterate over all algorithmic ranges; assume that they are in ascending order */
1659    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1660    i=*p;
1661    algRange=(AlgorithmicRange *)(p+1);
1662    while(i>0) {
1663        /* enumerate the character names before the current algorithmic range */
1664        /* here: start<limit */
1665        if((uint32_t)start<algRange->start) {
1666            if((uint32_t)limit<=algRange->start) {
1667                enumNames(uCharNames, start, limit, fn, context, nameChoice);
1668                return;
1669            }
1670            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1671                return;
1672            }
1673            start=(UChar32)algRange->start;
1674        }
1675        /* enumerate the character names in the current algorithmic range */
1676        /* here: algRange->start<=start<limit */
1677        if((uint32_t)start<=algRange->end) {
1678            if((uint32_t)limit<=(algRange->end+1)) {
1679                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1680                return;
1681            }
1682            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1683                return;
1684            }
1685            start=(UChar32)algRange->end+1;
1686        }
1687        /* continue to the next algorithmic range (here: start<limit) */
1688        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1689        --i;
1690    }
1691    /* enumerate the character names after the last algorithmic range */
1692    enumNames(uCharNames, start, limit, fn, context, nameChoice);
1693}
1694
1695U_CAPI int32_t U_EXPORT2
1696uprv_getMaxCharNameLength() {
1697    UErrorCode errorCode=U_ZERO_ERROR;
1698    if(calcNameSetsLengths(&errorCode)) {
1699        return gMaxNameLength;
1700    } else {
1701        return 0;
1702    }
1703}
1704
1705/**
1706 * Converts the char set cset into a Unicode set uset.
1707 * @param cset Set of 256 bit flags corresponding to a set of chars.
1708 * @param uset USet to receive characters. Existing contents are deleted.
1709 */
1710static void
1711charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1712    UChar us[256];
1713    char cs[256];
1714
1715    int32_t i, length;
1716    UErrorCode errorCode;
1717
1718    errorCode=U_ZERO_ERROR;
1719
1720    if(!calcNameSetsLengths(&errorCode)) {
1721        return;
1722    }
1723
1724    /* build a char string with all chars that are used in character names */
1725    length=0;
1726    for(i=0; i<256; ++i) {
1727        if(SET_CONTAINS(cset, i)) {
1728            cs[length++]=(char)i;
1729        }
1730    }
1731
1732    /* convert the char string to a UChar string */
1733    u_charsToUChars(cs, us, length);
1734
1735    /* add each UChar to the USet */
1736    for(i=0; i<length; ++i) {
1737        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1738            sa->add(sa->set, us[i]);
1739        }
1740    }
1741}
1742
1743/**
1744 * Fills set with characters that are used in Unicode character names.
1745 * @param set USet to receive characters.
1746 */
1747U_CAPI void U_EXPORT2
1748uprv_getCharNameCharacters(const USetAdder *sa) {
1749    charSetToUSet(gNameSet, sa);
1750}
1751
1752/* data swapping ------------------------------------------------------------ */
1753
1754/*
1755 * The token table contains non-negative entries for token bytes,
1756 * and -1 for bytes that represent themselves in the data file's charset.
1757 * -2 entries are used for lead bytes.
1758 *
1759 * Direct bytes (-1 entries) must be translated from the input charset family
1760 * to the output charset family.
1761 * makeTokenMap() writes a permutation mapping for this.
1762 * Use it once for single-/lead-byte tokens and once more for all trail byte
1763 * tokens. (';' is an unused trail byte marked with -1.)
1764 */
1765static void
1766makeTokenMap(const UDataSwapper *ds,
1767             int16_t tokens[], uint16_t tokenCount,
1768             uint8_t map[256],
1769             UErrorCode *pErrorCode) {
1770    UBool usedOutChar[256];
1771    uint16_t i, j;
1772    uint8_t c1, c2;
1773
1774    if(U_FAILURE(*pErrorCode)) {
1775        return;
1776    }
1777
1778    if(ds->inCharset==ds->outCharset) {
1779        /* Same charset family: identity permutation */
1780        for(i=0; i<256; ++i) {
1781            map[i]=(uint8_t)i;
1782        }
1783    } else {
1784        uprv_memset(map, 0, 256);
1785        uprv_memset(usedOutChar, 0, 256);
1786
1787        if(tokenCount>256) {
1788            tokenCount=256;
1789        }
1790
1791        /* set the direct bytes (byte 0 always maps to itself) */
1792        for(i=1; i<tokenCount; ++i) {
1793            if(tokens[i]==-1) {
1794                /* convert the direct byte character */
1795                c1=(uint8_t)i;
1796                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1797                if(U_FAILURE(*pErrorCode)) {
1798                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1799                                     i, ds->inCharset);
1800                    return;
1801                }
1802
1803                /* enter the converted character into the map and mark it used */
1804                map[c1]=c2;
1805                usedOutChar[c2]=TRUE;
1806            }
1807        }
1808
1809        /* set the mappings for the rest of the permutation */
1810        for(i=j=1; i<tokenCount; ++i) {
1811            /* set mappings that were not set for direct bytes */
1812            if(map[i]==0) {
1813                /* set an output byte value that was not used as an output byte above */
1814                while(usedOutChar[j]) {
1815                    ++j;
1816                }
1817                map[i]=(uint8_t)j++;
1818            }
1819        }
1820
1821        /*
1822         * leave mappings at tokenCount and above unset if tokenCount<256
1823         * because they won't be used
1824         */
1825    }
1826}
1827
1828U_CAPI int32_t U_EXPORT2
1829uchar_swapNames(const UDataSwapper *ds,
1830                const void *inData, int32_t length, void *outData,
1831                UErrorCode *pErrorCode) {
1832    const UDataInfo *pInfo;
1833    int32_t headerSize;
1834
1835    const uint8_t *inBytes;
1836    uint8_t *outBytes;
1837
1838    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1839             offset, i, count, stringsCount;
1840
1841    const AlgorithmicRange *inRange;
1842    AlgorithmicRange *outRange;
1843
1844    /* udata_swapDataHeader checks the arguments */
1845    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1846    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1847        return 0;
1848    }
1849
1850    /* check data format and format version */
1851    pInfo=(const UDataInfo *)((const char *)inData+4);
1852    if(!(
1853        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
1854        pInfo->dataFormat[1]==0x6e &&
1855        pInfo->dataFormat[2]==0x61 &&
1856        pInfo->dataFormat[3]==0x6d &&
1857        pInfo->formatVersion[0]==1
1858    )) {
1859        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1860                         pInfo->dataFormat[0], pInfo->dataFormat[1],
1861                         pInfo->dataFormat[2], pInfo->dataFormat[3],
1862                         pInfo->formatVersion[0]);
1863        *pErrorCode=U_UNSUPPORTED_ERROR;
1864        return 0;
1865    }
1866
1867    inBytes=(const uint8_t *)inData+headerSize;
1868    outBytes=(uint8_t *)outData+headerSize;
1869    if(length<0) {
1870        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1871    } else {
1872        length-=headerSize;
1873        if( length<20 ||
1874            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1875        ) {
1876            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1877                             length);
1878            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1879            return 0;
1880        }
1881    }
1882
1883    if(length<0) {
1884        /* preflighting: iterate through algorithmic ranges */
1885        offset=algNamesOffset;
1886        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1887        offset+=4;
1888
1889        for(i=0; i<count; ++i) {
1890            inRange=(const AlgorithmicRange *)(inBytes+offset);
1891            offset+=ds->readUInt16(inRange->size);
1892        }
1893    } else {
1894        /* swap data */
1895        const uint16_t *p;
1896        uint16_t *q, *temp;
1897
1898        int16_t tokens[512];
1899        uint16_t tokenCount;
1900
1901        uint8_t map[256], trailMap[256];
1902
1903        /* copy the data for inaccessible bytes */
1904        if(inBytes!=outBytes) {
1905            uprv_memcpy(outBytes, inBytes, length);
1906        }
1907
1908        /* the initial 4 offsets first */
1909        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1910        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1911        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1912        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1913
1914        /*
1915         * now the tokens table
1916         * it needs to be permutated along with the compressed name strings
1917         */
1918        p=(const uint16_t *)(inBytes+16);
1919        q=(uint16_t *)(outBytes+16);
1920
1921        /* read and swap the tokenCount */
1922        tokenCount=ds->readUInt16(*p);
1923        ds->swapArray16(ds, p, 2, q, pErrorCode);
1924        ++p;
1925        ++q;
1926
1927        /* read the first 512 tokens and make the token maps */
1928        if(tokenCount<=512) {
1929            count=tokenCount;
1930        } else {
1931            count=512;
1932        }
1933        for(i=0; i<count; ++i) {
1934            tokens[i]=udata_readInt16(ds, p[i]);
1935        }
1936        for(; i<512; ++i) {
1937            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1938        }
1939        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1940        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1941        if(U_FAILURE(*pErrorCode)) {
1942            return 0;
1943        }
1944
1945        /*
1946         * swap and permutate the tokens
1947         * go through a temporary array to support in-place swapping
1948         */
1949        temp=(uint16_t *)uprv_malloc(tokenCount*2);
1950        if(temp==NULL) {
1951            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1952                             tokenCount);
1953            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1954            return 0;
1955        }
1956
1957        /* swap and permutate single-/lead-byte tokens */
1958        for(i=0; i<tokenCount && i<256; ++i) {
1959            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1960        }
1961
1962        /* swap and permutate trail-byte tokens */
1963        for(; i<tokenCount; ++i) {
1964            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1965        }
1966
1967        /* copy the result into the output and free the temporary array */
1968        uprv_memcpy(q, temp, tokenCount*2);
1969        uprv_free(temp);
1970
1971        /*
1972         * swap the token strings but not a possible padding byte after
1973         * the terminating NUL of the last string
1974         */
1975        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1976                                    outBytes+tokenStringOffset, pErrorCode);
1977        if(U_FAILURE(*pErrorCode)) {
1978            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1979            return 0;
1980        }
1981
1982        /* swap the group table */
1983        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1984        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1985                           outBytes+groupsOffset, pErrorCode);
1986
1987        /*
1988         * swap the group strings
1989         * swap the string bytes but not the nibble-encoded string lengths
1990         */
1991        if(ds->inCharset!=ds->outCharset) {
1992            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1993
1994            const uint8_t *inStrings, *nextInStrings;
1995            uint8_t *outStrings;
1996
1997            uint8_t c;
1998
1999            inStrings=inBytes+groupStringOffset;
2000            outStrings=outBytes+groupStringOffset;
2001
2002            stringsCount=algNamesOffset-groupStringOffset;
2003
2004            /* iterate through string groups until only a few padding bytes are left */
2005            while(stringsCount>32) {
2006                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2007
2008                /* move past the length bytes */
2009                stringsCount-=(uint32_t)(nextInStrings-inStrings);
2010                outStrings+=nextInStrings-inStrings;
2011                inStrings=nextInStrings;
2012
2013                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2014                stringsCount-=count;
2015
2016                /* swap the string bytes using map[] and trailMap[] */
2017                while(count>0) {
2018                    c=*inStrings++;
2019                    *outStrings++=map[c];
2020                    if(tokens[c]!=-2) {
2021                        --count;
2022                    } else {
2023                        /* token lead byte: swap the trail byte, too */
2024                        *outStrings++=trailMap[*inStrings++];
2025                        count-=2;
2026                    }
2027                }
2028            }
2029        }
2030
2031        /* swap the algorithmic ranges */
2032        offset=algNamesOffset;
2033        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2034        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2035        offset+=4;
2036
2037        for(i=0; i<count; ++i) {
2038            if(offset>(uint32_t)length) {
2039                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2040                                 length, i);
2041                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2042                return 0;
2043            }
2044
2045            inRange=(const AlgorithmicRange *)(inBytes+offset);
2046            outRange=(AlgorithmicRange *)(outBytes+offset);
2047            offset+=ds->readUInt16(inRange->size);
2048
2049            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2050            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2051            switch(inRange->type) {
2052            case 0:
2053                /* swap prefix string */
2054                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2055                                    outRange+1, pErrorCode);
2056                if(U_FAILURE(*pErrorCode)) {
2057                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2058                                     i);
2059                    return 0;
2060                }
2061                break;
2062            case 1:
2063                {
2064                    /* swap factors and the prefix and factor strings */
2065                    uint32_t factorsCount;
2066
2067                    factorsCount=inRange->variant;
2068                    p=(const uint16_t *)(inRange+1);
2069                    q=(uint16_t *)(outRange+1);
2070                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2071
2072                    /* swap the strings, up to the last terminating NUL */
2073                    p+=factorsCount;
2074                    q+=factorsCount;
2075                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2076                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2077                        --stringsCount;
2078                    }
2079                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2080                }
2081                break;
2082            default:
2083                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2084                                 inRange->type, i);
2085                *pErrorCode=U_UNSUPPORTED_ERROR;
2086                return 0;
2087            }
2088        }
2089    }
2090
2091    return headerSize+(int32_t)offset;
2092}
2093
2094/*
2095 * Hey, Emacs, please set the following:
2096 *
2097 * Local Variables:
2098 * indent-tabs-mode: nil
2099 * End:
2100 *
2101 */
2102