1/*
2******************************************************************************
3*
4*   Copyright (C) 1999-2013, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  unames.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 1999oct04
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/putil.h"
19#include "unicode/uchar.h"
20#include "unicode/udata.h"
21#include "unicode/utf.h"
22#include "unicode/utf16.h"
23#include "uassert.h"
24#include "ustr_imp.h"
25#include "umutex.h"
26#include "cmemory.h"
27#include "cstring.h"
28#include "ucln_cmn.h"
29#include "udataswp.h"
30#include "uprops.h"
31
32U_NAMESPACE_BEGIN
33
34/* prototypes ------------------------------------------------------------- */
35
/* Number of elements of a true array (not a pointer), as an int32_t. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
37
/* Type and name of the data item that loadCharNames() opens via udata_openChoice(). */
static const char DATA_TYPE[] = "icu";
static const char DATA_NAME[] = "unames";
40
/* code>>GROUP_SHIFT selects a group, code&GROUP_MASK the line (name) within it */
#define GROUP_SHIFT 5
/* number of lines (names) per group */
#define LINES_PER_GROUP (1L << GROUP_SHIFT)
/* mask for the line number within a group */
#define GROUP_MASK (LINES_PER_GROUP - 1)
44
45/*
46 * This struct was replaced by explicitly accessing equivalent
47 * fields from triples of uint16_t.
48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50 * would advance by 6 bytes (3 uint16_t).
51 *
52 * We can't just change the data structure because it's loaded from a data file,
53 * and we don't want to make it less compact, so we changed the access code.
54 *
55 * For details see ICU tickets 6331 and 6008.
56typedef struct {
57    uint16_t groupMSB,
58             offsetHigh, offsetLow; / * avoid padding * /
59} Group;
60 */
/* Indexes of the fields in a Group triple of uint16_t, and the triple's length. */
enum {
    GROUP_MSB = 0,          /* code point >> GROUP_SHIFT */
    GROUP_OFFSET_HIGH = 1,  /* upper 16 bits of the group offset */
    GROUP_OFFSET_LOW = 2,   /* lower 16 bits of the group offset */
    GROUP_LENGTH = 3        /* number of uint16_t per group */
};
67
68/*
69 * Get the 32-bit group offset.
70 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
71 * @return group offset (int32_t)
72 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(group)[GROUP_OFFSET_HIGH] << 16 | (group)[GROUP_OFFSET_LOW])

/* Step a (const uint16_t *) group pointer forward/backward by one Group triple. */
#define NEXT_GROUP(group) ((group) + GROUP_LENGTH)
#define PREV_GROUP(group) ((group) - GROUP_LENGTH)
77
/*
 * Header of one range of algorithmically named code points.
 * getAlgName() reads the type-specific data directly after this struct (range+1).
 */
typedef struct {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range */
    uint8_t type;       /* algorithm selector: 0=prefix+hex digits, 1=prefix+factorized elements */
    uint8_t variant;    /* type 0: number of hex digits; type 1: number of factors */
    uint16_t size;
} AlgorithmicRange;
83
/*
 * Indexes at the start of the names data.
 * Each offset is in bytes from the start of this struct.
 */
typedef struct {
    uint32_t tokenStringOffset;
    uint32_t groupsOffset;
    uint32_t groupStringOffset;
    uint32_t algNamesOffset;
} UCharNames;
87
88/*
89 * Get the groups table from a UCharNames struct.
90 * The groups table consists of one uint16_t groupCount followed by
91 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
92 * and the comment for the old struct Group above.
93 *
94 * @param names (const UCharNames *) pointer to the UCharNames indexes
95 * @return (const uint16_t *) pointer to the groups table
96 */
/* The argument and the whole expansion are parenthesized so that any pointer expression works. */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
98
99typedef struct {
100    const char *otherName;
101    UChar32 code;
102} FindName;
103
104#define DO_FIND_NAME NULL
105
106static UDataMemory *uCharNamesData=NULL;
107static UCharNames *uCharNames=NULL;
108static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
109
/*
 * Maximum length of character names (regular & 1.0).
 * Reset to 0 by unames_cleanup().
 */
static int32_t gMaxNameLength = 0;

/*
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
static uint32_t gNameSet[8] = { 0 };
120
/*
 * Extended category values, appended after the regular general categories.
 * getCharCat() returns them for noncharacters and for lead/trail surrogates;
 * they index the last three entries of charCatNames[].
 * Each expansion is parenthesized so that it is safe inside larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
126
127static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
128    "unassigned",
129    "uppercase letter",
130    "lowercase letter",
131    "titlecase letter",
132    "modifier letter",
133    "other letter",
134    "non spacing mark",
135    "enclosing mark",
136    "combining spacing mark",
137    "decimal digit number",
138    "letter number",
139    "other number",
140    "space separator",
141    "line separator",
142    "paragraph separator",
143    "control",
144    "format",
145    "private use area",
146    "surrogate",
147    "dash punctuation",
148    "start punctuation",
149    "end punctuation",
150    "connector punctuation",
151    "other punctuation",
152    "math symbol",
153    "currency symbol",
154    "modifier symbol",
155    "other symbol",
156    "initial punctuation",
157    "final punctuation",
158    "noncharacter",
159    "lead surrogate",
160    "trail surrogate"
161};
162
163/* implementation ----------------------------------------------------------- */
164
165static UBool U_CALLCONV unames_cleanup(void)
166{
167    if(uCharNamesData) {
168        udata_close(uCharNamesData);
169        uCharNamesData = NULL;
170    }
171    if(uCharNames) {
172        uCharNames = NULL;
173    }
174    gCharNamesInitOnce.reset();
175    gMaxNameLength=0;
176    return TRUE;
177}
178
179static UBool U_CALLCONV
180isAcceptable(void * /*context*/,
181             const char * /*type*/, const char * /*name*/,
182             const UDataInfo *pInfo) {
183    return (UBool)(
184        pInfo->size>=20 &&
185        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
186        pInfo->charsetFamily==U_CHARSET_FAMILY &&
187        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
188        pInfo->dataFormat[1]==0x6e &&
189        pInfo->dataFormat[2]==0x61 &&
190        pInfo->dataFormat[3]==0x6d &&
191        pInfo->formatVersion[0]==1);
192}
193
194static void U_CALLCONV
195loadCharNames(UErrorCode &status) {
196    U_ASSERT(uCharNamesData == NULL);
197    U_ASSERT(uCharNames == NULL);
198
199    uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
200    if(U_FAILURE(status)) {
201        uCharNamesData = NULL;
202    } else {
203        uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
204    }
205    ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
206}
207
208
209static UBool
210isDataLoaded(UErrorCode *pErrorCode) {
211    umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
212    return U_SUCCESS(*pErrorCode);
213}
214
/*
 * Append c to the output if there is room, and always count it.
 * buffer, bufferLength and bufferPos are modified in place.
 * c is evaluated at most once, only if there is room, and before bufferPos
 * is incremented - callers may therefore use bufferPos inside c.
 * Expands to a brace block, so use it as a statement of its own.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
}

/* pseudo name choice, one past the public UCharNameChoice values */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224
225/*
226 * Important: expandName() and compareName() are almost the same -
227 * apply fixes to both.
228 *
229 * UnicodeData.txt uses ';' as a field separator, so no
230 * field can contain ';' as part of its contents.
231 * In unames.dat, it is marked as token[';']==-1 only if the
232 * semicolon is used in the data file - which is iff we
233 * have Unicode 1.0 names or ISO comments or aliases.
234 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235 * although we know that it will never be part of a name.
236 */
237static uint16_t
238expandName(UCharNames *names,
239           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
240           char *buffer, uint16_t bufferLength) {
241    uint16_t *tokens=(uint16_t *)names+8;
242    uint16_t token, tokenCount=*tokens++, bufferPos=0;
243    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
244    uint8_t c;
245
246    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
247        /*
248         * skip the modern name if it is not requested _and_
249         * if the semicolon byte value is a character, not a token number
250         */
251        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
252            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
253            do {
254                while(nameLength>0) {
255                    --nameLength;
256                    if(*name++==';') {
257                        break;
258                    }
259                }
260            } while(--fieldIndex>0);
261        } else {
262            /*
263             * the semicolon byte value is a token number, therefore
264             * only modern names are stored in unames.dat and there is no
265             * such requested alternate name here
266             */
267            nameLength=0;
268        }
269    }
270
271    /* write each letter directly, and write a token word per token */
272    while(nameLength>0) {
273        --nameLength;
274        c=*name++;
275
276        if(c>=tokenCount) {
277            if(c!=';') {
278                /* implicit letter */
279                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
280            } else {
281                /* finished */
282                break;
283            }
284        } else {
285            token=tokens[c];
286            if(token==(uint16_t)(-2)) {
287                /* this is a lead byte for a double-byte token */
288                token=tokens[c<<8|*name++];
289                --nameLength;
290            }
291            if(token==(uint16_t)(-1)) {
292                if(c!=';') {
293                    /* explicit letter */
294                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
295                } else {
296                    /* stop, but skip the semicolon if we are seeking
297                       extended names and there was no 2.0 name but there
298                       is a 1.0 name. */
299                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
300                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
301                            continue;
302                        }
303                    }
304                    /* finished */
305                    break;
306                }
307            } else {
308                /* write token word */
309                uint8_t *tokenString=tokenStrings+token;
310                while((c=*tokenString++)!=0) {
311                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
312                }
313            }
314        }
315    }
316
317    /* zero-terminate */
318    if(bufferLength>0) {
319        *buffer=0;
320    }
321
322    return bufferPos;
323}
324
325/*
326 * compareName() is almost the same as expandName() except that it compares
327 * the currently expanded name to an input name.
328 * It returns the match/no match result as soon as possible.
329 */
330static UBool
331compareName(UCharNames *names,
332            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
333            const char *otherName) {
334    uint16_t *tokens=(uint16_t *)names+8;
335    uint16_t token, tokenCount=*tokens++;
336    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
337    uint8_t c;
338    const char *origOtherName = otherName;
339
340    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
341        /*
342         * skip the modern name if it is not requested _and_
343         * if the semicolon byte value is a character, not a token number
344         */
345        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
346            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
347            do {
348                while(nameLength>0) {
349                    --nameLength;
350                    if(*name++==';') {
351                        break;
352                    }
353                }
354            } while(--fieldIndex>0);
355        } else {
356            /*
357             * the semicolon byte value is a token number, therefore
358             * only modern names are stored in unames.dat and there is no
359             * such requested alternate name here
360             */
361            nameLength=0;
362        }
363    }
364
365    /* compare each letter directly, and compare a token word per token */
366    while(nameLength>0) {
367        --nameLength;
368        c=*name++;
369
370        if(c>=tokenCount) {
371            if(c!=';') {
372                /* implicit letter */
373                if((char)c!=*otherName++) {
374                    return FALSE;
375                }
376            } else {
377                /* finished */
378                break;
379            }
380        } else {
381            token=tokens[c];
382            if(token==(uint16_t)(-2)) {
383                /* this is a lead byte for a double-byte token */
384                token=tokens[c<<8|*name++];
385                --nameLength;
386            }
387            if(token==(uint16_t)(-1)) {
388                if(c!=';') {
389                    /* explicit letter */
390                    if((char)c!=*otherName++) {
391                        return FALSE;
392                    }
393                } else {
394                    /* stop, but skip the semicolon if we are seeking
395                       extended names and there was no 2.0 name but there
396                       is a 1.0 name. */
397                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
398                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
399                            continue;
400                        }
401                    }
402                    /* finished */
403                    break;
404                }
405            } else {
406                /* write token word */
407                uint8_t *tokenString=tokenStrings+token;
408                while((c=*tokenString++)!=0) {
409                    if((char)c!=*otherName++) {
410                        return FALSE;
411                    }
412                }
413            }
414        }
415    }
416
417    /* complete match? */
418    return (UBool)(*otherName==0);
419}
420
421static uint8_t getCharCat(UChar32 cp) {
422    uint8_t cat;
423
424    if (U_IS_UNICODE_NONCHAR(cp)) {
425        return U_NONCHARACTER_CODE_POINT;
426    }
427
428    if ((cat = u_charType(cp)) == U_SURROGATE) {
429        cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
430    }
431
432    return cat;
433}
434
435static const char *getCharCatName(UChar32 cp) {
436    uint8_t cat = getCharCat(cp);
437
438    /* Return unknown if the table of names above is not up to
439       date. */
440
441    if (cat >= LENGTHOF(charCatNames)) {
442        return "unknown";
443    } else {
444        return charCatNames[cat];
445    }
446}
447
448static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449    const char *catname = getCharCatName(code);
450    uint16_t length = 0;
451
452    UChar32 cp;
453    int ndigits, i;
454
455    WRITE_CHAR(buffer, bufferLength, length, '<');
456    while (catname[length - 1]) {
457        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
458    }
459    WRITE_CHAR(buffer, bufferLength, length, '-');
460    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
461        ;
462    if (ndigits < 4)
463        ndigits = 4;
464    for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465        uint8_t v = (uint8_t)(cp & 0xf);
466        buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
467    }
468    buffer += ndigits;
469    length += ndigits;
470    WRITE_CHAR(buffer, bufferLength, length, '>');
471
472    return length;
473}
474
475/*
476 * getGroup() does a binary search for the group that contains the
477 * Unicode code point "code".
478 * The return value is always a valid Group* that may contain "code"
479 * or else is the highest group before "code".
480 * If the lowest group is after "code", then that one is returned.
481 */
482static const uint16_t *
483getGroup(UCharNames *names, uint32_t code) {
484    const uint16_t *groups=GET_GROUPS(names);
485    uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
486             start=0,
487             limit=*groups++,
488             number;
489
490    /* binary search for the group of names that contains the one for code */
491    while(start<limit-1) {
492        number=(uint16_t)((start+limit)/2);
493        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
494            limit=number;
495        } else {
496            start=number;
497        }
498    }
499
500    /* return this regardless of whether it is an exact match */
501    return groups+start*GROUP_LENGTH;
502}
503
504/*
505 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506 * expands them into offsets and lengths for each string.
507 * Lengths are stored with a variable-width encoding in consecutive nibbles:
508 * If a nibble<0xc, then it is the length itself (0=empty string).
509 * If a nibble>=0xc, then it forms a length value with the following nibble.
510 * Calculation see below.
511 * The offsets and lengths arrays must be at least 33 (one more) long because
512 * there is no check here at the end if the last nibble is still used.
513 */
514static const uint8_t *
515expandGroupLengths(const uint8_t *s,
516                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
517    /* read the lengths of the 32 strings in this group and get each string's offset */
518    uint16_t i=0, offset=0, length=0;
519    uint8_t lengthByte;
520
521    /* all 32 lengths must be read to get the offset of the first group string */
522    while(i<LINES_PER_GROUP) {
523        lengthByte=*s++;
524
525        /* read even nibble - MSBs of lengthByte */
526        if(length>=12) {
527            /* double-nibble length spread across two bytes */
528            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
529            lengthByte&=0xf;
530        } else if((lengthByte /* &0xf0 */)>=0xc0) {
531            /* double-nibble length spread across this one byte */
532            length=(uint16_t)((lengthByte&0x3f)+12);
533        } else {
534            /* single-nibble length in MSBs */
535            length=(uint16_t)(lengthByte>>4);
536            lengthByte&=0xf;
537        }
538
539        *offsets++=offset;
540        *lengths++=length;
541
542        offset+=length;
543        ++i;
544
545        /* read odd nibble - LSBs of lengthByte */
546        if((lengthByte&0xf0)==0) {
547            /* this nibble was not consumed for a double-nibble length above */
548            length=lengthByte;
549            if(length<12) {
550                /* single-nibble length in LSBs */
551                *offsets++=offset;
552                *lengths++=length;
553
554                offset+=length;
555                ++i;
556            }
557        } else {
558            length=0;   /* prevent double-nibble detection in the next iteration */
559        }
560    }
561
562    /* now, s is at the first group string */
563    return s;
564}
565
566static uint16_t
567expandGroupName(UCharNames *names, const uint16_t *group,
568                uint16_t lineNumber, UCharNameChoice nameChoice,
569                char *buffer, uint16_t bufferLength) {
570    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
572    s=expandGroupLengths(s, offsets, lengths);
573    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574                      buffer, bufferLength);
575}
576
577static uint16_t
578getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579        char *buffer, uint16_t bufferLength) {
580    const uint16_t *group=getGroup(names, code);
581    if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
582        return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
583                               buffer, bufferLength);
584    } else {
585        /* group not found */
586        /* zero-terminate */
587        if(bufferLength>0) {
588            *buffer=0;
589        }
590        return 0;
591    }
592}
593
594/*
 * enumGroupNames() enumerates the names of the code points start..end
 * within one group of 32 names,
 * and either calls the enumerator function or finds a given input name.
597 */
598static UBool
599enumGroupNames(UCharNames *names, const uint16_t *group,
600               UChar32 start, UChar32 end,
601               UEnumCharNamesFn *fn, void *context,
602               UCharNameChoice nameChoice) {
603    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
605
606    s=expandGroupLengths(s, offsets, lengths);
607    if(fn!=DO_FIND_NAME) {
608        char buffer[200];
609        uint16_t length;
610
611        while(start<=end) {
612            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
615            }
616            /* here, we assume that the buffer is large enough */
617            if(length>0) {
618                if(!fn(context, start, nameChoice, buffer, length)) {
619                    return FALSE;
620                }
621            }
622            ++start;
623        }
624    } else {
625        const char *otherName=((FindName *)context)->otherName;
626        while(start<=end) {
627            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628                ((FindName *)context)->code=start;
629                return FALSE;
630            }
631            ++start;
632        }
633    }
634    return TRUE;
635}
636
637/*
 * enumExtNames() enumerates extended names.
639 * It only needs to do it if it is called with a real function and not
640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641 * for extended names by itself.
642 */
643static UBool
644enumExtNames(UChar32 start, UChar32 end,
645             UEnumCharNamesFn *fn, void *context)
646{
647    if(fn!=DO_FIND_NAME) {
648        char buffer[200];
649        uint16_t length;
650
651        while(start<=end) {
652            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653            /* here, we assume that the buffer is large enough */
654            if(length>0) {
655                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
656                    return FALSE;
657                }
658            }
659            ++start;
660        }
661    }
662
663    return TRUE;
664}
665
666static UBool
667enumNames(UCharNames *names,
668          UChar32 start, UChar32 limit,
669          UEnumCharNamesFn *fn, void *context,
670          UCharNameChoice nameChoice) {
671    uint16_t startGroupMSB, endGroupMSB, groupCount;
672    const uint16_t *group, *groupLimit;
673
674    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
675    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
676
677    /* find the group that contains start, or the highest before it */
678    group=getGroup(names, start);
679
680    if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
681        /* enumerate synthetic names between start and the group start */
682        UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
683        if(extLimit>limit) {
684            extLimit=limit;
685        }
686        if(!enumExtNames(start, extLimit-1, fn, context)) {
687            return FALSE;
688        }
689        start=extLimit;
690    }
691
692    if(startGroupMSB==endGroupMSB) {
693        if(startGroupMSB==group[GROUP_MSB]) {
694            /* if start and limit-1 are in the same group, then enumerate only in that one */
695            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
696        }
697    } else {
698        const uint16_t *groups=GET_GROUPS(names);
699        groupCount=*groups++;
700        groupLimit=groups+groupCount*GROUP_LENGTH;
701
702        if(startGroupMSB==group[GROUP_MSB]) {
703            /* enumerate characters in the partial start group */
704            if((start&GROUP_MASK)!=0) {
705                if(!enumGroupNames(names, group,
706                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
707                                   fn, context, nameChoice)) {
708                    return FALSE;
709                }
710                group=NEXT_GROUP(group); /* continue with the next group */
711            }
712        } else if(startGroupMSB>group[GROUP_MSB]) {
713            /* make sure that we start enumerating with the first group after start */
714            const uint16_t *nextGroup=NEXT_GROUP(group);
715            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
716                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
717                if (end > limit) {
718                    end = limit;
719                }
720                if (!enumExtNames(start, end - 1, fn, context)) {
721                    return FALSE;
722                }
723            }
724            group=nextGroup;
725        }
726
727        /* enumerate entire groups between the start- and end-groups */
728        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
729            const uint16_t *nextGroup;
730            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
731            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
732                return FALSE;
733            }
734            nextGroup=NEXT_GROUP(group);
735            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
736                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
737                if (end > limit) {
738                    end = limit;
739                }
740                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
741                    return FALSE;
742                }
743            }
744            group=nextGroup;
745        }
746
747        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
748        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
749            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
750        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
751            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
752            if (next > start) {
753                start = next;
754            }
755        } else {
756            return TRUE;
757        }
758    }
759
760    /* we have not found a group, which means everything is made of
761       extended names. */
762    if (nameChoice == U_EXTENDED_CHAR_NAME) {
763        if (limit > UCHAR_MAX_VALUE + 1) {
764            limit = UCHAR_MAX_VALUE + 1;
765        }
766        return enumExtNames(start, limit - 1, fn, context);
767    }
768
769    return TRUE;
770}
771
772static uint16_t
773writeFactorSuffix(const uint16_t *factors, uint16_t count,
774                  const char *s, /* suffix elements */
775                  uint32_t code,
776                  uint16_t indexes[8], /* output fields from here */
777                  const char *elementBases[8], const char *elements[8],
778                  char *buffer, uint16_t bufferLength) {
779    uint16_t i, factor, bufferPos=0;
780    char c;
781
782    /* write elements according to the factors */
783
784    /*
785     * the factorized elements are determined by modulo arithmetic
786     * with the factors of this algorithm
787     *
788     * note that for fewer operations, count is decremented here
789     */
790    --count;
791    for(i=count; i>0; --i) {
792        factor=factors[i];
793        indexes[i]=(uint16_t)(code%factor);
794        code/=factor;
795    }
796    /*
797     * we don't need to calculate the last modulus because start<=code<=end
798     * guarantees here that code<=factors[0]
799     */
800    indexes[0]=(uint16_t)code;
801
802    /* write each element */
803    for(;;) {
804        if(elementBases!=NULL) {
805            *elementBases++=s;
806        }
807
808        /* skip indexes[i] strings */
809        factor=indexes[i];
810        while(factor>0) {
811            while(*s++!=0) {}
812            --factor;
813        }
814        if(elements!=NULL) {
815            *elements++=s;
816        }
817
818        /* write element */
819        while((c=*s++)!=0) {
820            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
821        }
822
823        /* we do not need to perform the rest of this loop for i==count - break here */
824        if(i>=count) {
825            break;
826        }
827
828        /* skip the rest of the strings for this factors[i] */
829        factor=(uint16_t)(factors[i]-indexes[i]-1);
830        while(factor>0) {
831            while(*s++!=0) {}
832            --factor;
833        }
834
835        ++i;
836    }
837
838    /* zero-terminate */
839    if(bufferLength>0) {
840        *buffer=0;
841    }
842
843    return bufferPos;
844}
845
846/*
847 * Important:
848 * Parts of findAlgName() are almost the same as some of getAlgName().
849 * Fixes must be applied to both.
850 */
851static uint16_t
852getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
853        char *buffer, uint16_t bufferLength) {
854    uint16_t bufferPos=0;
855
856    /* Only the normative character name can be algorithmic. */
857    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
858        /* zero-terminate */
859        if(bufferLength>0) {
860            *buffer=0;
861        }
862        return 0;
863    }
864
865    switch(range->type) {
866    case 0: {
867        /* name = prefix hex-digits */
868        const char *s=(const char *)(range+1);
869        char c;
870
871        uint16_t i, count;
872
873        /* copy prefix */
874        while((c=*s++)!=0) {
875            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
876        }
877
878        /* write hexadecimal code point value */
879        count=range->variant;
880
881        /* zero-terminate */
882        if(count<bufferLength) {
883            buffer[count]=0;
884        }
885
886        for(i=count; i>0;) {
887            if(--i<bufferLength) {
888                c=(char)(code&0xf);
889                if(c<10) {
890                    c+='0';
891                } else {
892                    c+='A'-10;
893                }
894                buffer[i]=c;
895            }
896            code>>=4;
897        }
898
899        bufferPos+=count;
900        break;
901    }
902    case 1: {
903        /* name = prefix factorized-elements */
904        uint16_t indexes[8];
905        const uint16_t *factors=(const uint16_t *)(range+1);
906        uint16_t count=range->variant;
907        const char *s=(const char *)(factors+count);
908        char c;
909
910        /* copy prefix */
911        while((c=*s++)!=0) {
912            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
913        }
914
915        bufferPos+=writeFactorSuffix(factors, count,
916                                     s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
917        break;
918    }
919    default:
920        /* undefined type */
921        /* zero-terminate */
922        if(bufferLength>0) {
923            *buffer=0;
924        }
925        break;
926    }
927
928    return bufferPos;
929}
930
/*
 * Important: enumAlgNames() and findAlgName() are almost the same.
 * Any fix must be applied to both.
 *
 * enumAlgNames() calls fn once for each code point in [start, limit[ with
 * that code point's algorithmic name from the given range.
 *
 * For name choices other than U_UNICODE_CHAR_NAME and U_EXTENDED_CHAR_NAME
 * nothing is enumerated and TRUE is returned.
 *
 * @param range      the algorithmic range; its type selects the naming scheme
 * @param start      first code point to enumerate
 * @param limit      one past the last code point to enumerate
 * @param fn         callback invoked as fn(context, code, nameChoice, name, length)
 * @param context    opaque pointer passed through to fn
 * @param nameChoice which kind of name is requested
 * @return FALSE as soon as fn returns FALSE, otherwise TRUE
 */
static UBool
enumAlgNames(AlgorithmicRange *range,
             UChar32 start, UChar32 limit,
             UEnumCharNamesFn *fn, void *context,
             UCharNameChoice nameChoice) {
    char buffer[200];
    uint16_t length;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        return TRUE;
    }

    switch(range->type) {
    case 0: {
        /* type 0: name = prefix hex-digits */
        char *s, *end;
        char c;

        /* get the full name of the start character */
        length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
        /* length is unsigned, so this test is true only for an empty name */
        if(length<=0) {
            return TRUE;
        }

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* go to the end of the name; all these names have the same length */
        end=buffer;
        while(*end!=0) {
            ++end;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /* increment the hexadecimal number on a character-basis */
            s=end;
            for (;;) {
                /* walk from the last digit towards the front, carrying as needed */
                c=*--s;
                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
                    /* digit can simply be bumped: done */
                    *s=(char)(c+1);
                    break;
                } else if(c=='9') {
                    /* 9 -> A, no carry */
                    *s='A';
                    break;
                } else if(c=='F') {
                    /* F -> 0 and carry into the next more significant digit */
                    *s='0';
                }
            }

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    case 1: {
        /*
         * type 1: the data after the range header is count factors (uint16_t),
         * then the NUL-terminated prefix, then the element strings
         */
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t count=range->variant;
        const char *s=(const char *)(factors+count);
        char *suffix, *t;
        uint16_t prefixLength, i, idx;

        char c;

        /* name = prefix factorized-elements */

        /* copy prefix (not bounds-checked against sizeof(buffer)) */
        suffix=buffer;
        prefixLength=0;
        while((c=*s++)!=0) {
            *suffix++=c;
            ++prefixLength;
        }

        /*
         * append the suffix of the start character;
         * the loop below relies on indexes[], elementBases[] and elements[]
         * having been filled in by this call
         */
        length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
                                              s, (uint32_t)start-range->start,
                                              indexes, elementBases, elements,
                                              suffix, (uint16_t)(sizeof(buffer)-prefixLength)));

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /* increment the indexes in lexical order bound by the factors */
            i=count;
            for (;;) {
                /* start with the last factor and carry towards the first one */
                idx=(uint16_t)(indexes[--i]+1);
                if(idx<factors[i]) {
                    /* skip one index and its element string */
                    indexes[i]=idx;
                    s=elements[i];
                    while(*s++!=0) {
                    }
                    elements[i]=s;
                    break;
                } else {
                    /* reset this index to 0 and its element string to the first one */
                    indexes[i]=0;
                    elements[i]=elementBases[i];
                }
            }

            /* to make matters a little easier, just append all elements to the suffix */
            t=suffix;
            length=prefixLength;
            for(i=0; i<count; ++i) {
                s=elements[i];
                while((c=*s++)!=0) {
                    *t++=c;
                    ++length;
                }
            }
            /* zero-terminate */
            *t=0;

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return TRUE;
}
1071
/*
 * findAlgName() is almost the same as enumAlgNames() except that it
 * returns the code point for a name if it fits into the range.
 * It returns 0xffff otherwise.
 *
 * The comparison is an exact character-by-character match against otherName.
 * For name choices other than U_UNICODE_CHAR_NAME and U_EXTENDED_CHAR_NAME
 * 0xffff is returned without looking at the range.
 *
 * @param range      the algorithmic range whose names are searched
 * @param nameChoice which kind of name is requested
 * @param otherName  NUL-terminated name to look up
 * @return the matching code point, or 0xffff if this range has no such name
 */
static UChar32
findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
    UChar32 code;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        return 0xffff;
    }

    switch(range->type) {
    case 0: {
        /* name = prefix hex-digits */
        const char *s=(const char *)(range+1);
        char c;

        uint16_t i, count;

        /* compare prefix */
        while((c=*s++)!=0) {
            if((char)c!=*otherName++) {
                return 0xffff;
            }
        }

        /* read hexadecimal code point value: exactly range->variant uppercase hex digits */
        count=range->variant;
        code=0;
        for(i=0; i<count; ++i) {
            c=*otherName++;
            if('0'<=c && c<='9') {
                code=(code<<4)|(c-'0');
            } else if('A'<=c && c<='F') {
                code=(code<<4)|(c-'A'+10);
            } else {
                return 0xffff;
            }
        }

        /* does it fit into the range? (the name must also end right after the digits) */
        if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
            return code;
        }
        break;
    }
    case 1: {
        char buffer[64];
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t count=range->variant;
        const char *s=(const char *)(factors+count), *t;
        UChar32 start, limit;
        uint16_t i, idx;

        char c;

        /* name = prefix factorized-elements */

        /* compare prefix; afterwards otherName points at the suffix to be matched */
        while((c=*s++)!=0) {
            if((char)c!=*otherName++) {
                return 0xffff;
            }
        }

        start=(UChar32)range->start;
        limit=(UChar32)(range->end+1);

        /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
        writeFactorSuffix(factors, count, s, 0,
                          indexes, elementBases, elements, buffer, sizeof(buffer));

        /* compare the first suffix */
        if(0==uprv_strcmp(otherName, buffer)) {
            return start;
        }

        /* enumerate and compare the rest of the suffixes */
        while(++start<limit) {
            /* increment the indexes in lexical order bound by the factors */
            i=count;
            for (;;) {
                /* start with the last factor and carry towards the first one */
                idx=(uint16_t)(indexes[--i]+1);
                if(idx<factors[i]) {
                    /* skip one index and its element string */
                    indexes[i]=idx;
                    s=elements[i];
                    while(*s++!=0) {}
                    elements[i]=s;
                    break;
                } else {
                    /* reset this index to 0 and its element string to the first one */
                    indexes[i]=0;
                    elements[i]=elementBases[i];
                }
            }

            /* to make matters a little easier, just compare all elements of the suffix */
            t=otherName;
            for(i=0; i<count; ++i) {
                s=elements[i];
                while((c=*s++)!=0) {
                    if(c!=*t++) {
                        /*
                         * mismatch: the empty string ends the inner loop and
                         * i=99 both ends the outer loop and flags the failure
                         */
                        s=""; /* does not match */
                        i=99;
                    }
                }
            }
            /* match only if no mismatch was flagged and otherName is fully consumed */
            if(i<99 && *t==0) {
                return start;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return 0xffff;
}
1197
1198/* sets of name characters, maximum name lengths ---------------------------- */
1199
/*
 * Bit-set helpers for a set of 256 byte values stored as uint32_t[8]:
 * the upper 3 bits of the byte select the word, the lower 5 bits the bit.
 * The argument c is parenthesized so that an expression argument (e.g. a|b)
 * is converted to uint8_t as a whole rather than only its first operand.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202
1203static int32_t
1204calcStringSetLength(uint32_t set[8], const char *s) {
1205    int32_t length=0;
1206    char c;
1207
1208    while((c=*s++)!=0) {
1209        SET_ADD(set, c);
1210        ++length;
1211    }
1212    return length;
1213}
1214
1215static int32_t
1216calcAlgNameSetsLengths(int32_t maxNameLength) {
1217    AlgorithmicRange *range;
1218    uint32_t *p;
1219    uint32_t rangeCount;
1220    int32_t length;
1221
1222    /* enumerate algorithmic ranges */
1223    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1224    rangeCount=*p;
1225    range=(AlgorithmicRange *)(p+1);
1226    while(rangeCount>0) {
1227        switch(range->type) {
1228        case 0:
1229            /* name = prefix + (range->variant times) hex-digits */
1230            /* prefix */
1231            length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1232            if(length>maxNameLength) {
1233                maxNameLength=length;
1234            }
1235            break;
1236        case 1: {
1237            /* name = prefix factorized-elements */
1238            const uint16_t *factors=(const uint16_t *)(range+1);
1239            const char *s;
1240            int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1241
1242            /* prefix length */
1243            s=(const char *)(factors+count);
1244            length=calcStringSetLength(gNameSet, s);
1245            s+=length+1; /* start of factor suffixes */
1246
1247            /* get the set and maximum factor suffix length for each factor */
1248            for(i=0; i<count; ++i) {
1249                maxFactorLength=0;
1250                for(factor=factors[i]; factor>0; --factor) {
1251                    factorLength=calcStringSetLength(gNameSet, s);
1252                    s+=factorLength+1;
1253                    if(factorLength>maxFactorLength) {
1254                        maxFactorLength=factorLength;
1255                    }
1256                }
1257                length+=maxFactorLength;
1258            }
1259
1260            if(length>maxNameLength) {
1261                maxNameLength=length;
1262            }
1263            break;
1264        }
1265        default:
1266            /* unknown type */
1267            break;
1268        }
1269
1270        range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1271        --rangeCount;
1272    }
1273    return maxNameLength;
1274}
1275
1276static int32_t
1277calcExtNameSetsLengths(int32_t maxNameLength) {
1278    int32_t i, length;
1279
1280    for(i=0; i<LENGTHOF(charCatNames); ++i) {
1281        /*
1282         * for each category, count the length of the category name
1283         * plus 9=
1284         * 2 for <>
1285         * 1 for -
1286         * 6 for most hex digits per code point
1287         */
1288        length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1289        if(length>maxNameLength) {
1290            maxNameLength=length;
1291        }
1292    }
1293    return maxNameLength;
1294}
1295
1296static int32_t
1297calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1298                  uint32_t set[8],
1299                  const uint8_t **pLine, const uint8_t *lineLimit) {
1300    const uint8_t *line=*pLine;
1301    int32_t length=0, tokenLength;
1302    uint16_t c, token;
1303
1304    while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1305        if(c>=tokenCount) {
1306            /* implicit letter */
1307            SET_ADD(set, c);
1308            ++length;
1309        } else {
1310            token=tokens[c];
1311            if(token==(uint16_t)(-2)) {
1312                /* this is a lead byte for a double-byte token */
1313                c=c<<8|*line++;
1314                token=tokens[c];
1315            }
1316            if(token==(uint16_t)(-1)) {
1317                /* explicit letter */
1318                SET_ADD(set, c);
1319                ++length;
1320            } else {
1321                /* count token word */
1322                if(tokenLengths!=NULL) {
1323                    /* use cached token length */
1324                    tokenLength=tokenLengths[c];
1325                    if(tokenLength==0) {
1326                        tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1327                        tokenLengths[c]=(int8_t)tokenLength;
1328                    }
1329                } else {
1330                    tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1331                }
1332                length+=tokenLength;
1333            }
1334        }
1335    }
1336
1337    *pLine=line;
1338    return length;
1339}
1340
1341static void
1342calcGroupNameSetsLengths(int32_t maxNameLength) {
1343    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1344
1345    uint16_t *tokens=(uint16_t *)uCharNames+8;
1346    uint16_t tokenCount=*tokens++;
1347    uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1348
1349    int8_t *tokenLengths;
1350
1351    const uint16_t *group;
1352    const uint8_t *s, *line, *lineLimit;
1353
1354    int32_t groupCount, lineNumber, length;
1355
1356    tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1357    if(tokenLengths!=NULL) {
1358        uprv_memset(tokenLengths, 0, tokenCount);
1359    }
1360
1361    group=GET_GROUPS(uCharNames);
1362    groupCount=*group++;
1363
1364    /* enumerate all groups */
1365    while(groupCount>0) {
1366        s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1367        s=expandGroupLengths(s, offsets, lengths);
1368
1369        /* enumerate all lines in each group */
1370        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371            line=s+offsets[lineNumber];
1372            length=lengths[lineNumber];
1373            if(length==0) {
1374                continue;
1375            }
1376
1377            lineLimit=line+length;
1378
1379            /* read regular name */
1380            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381            if(length>maxNameLength) {
1382                maxNameLength=length;
1383            }
1384            if(line==lineLimit) {
1385                continue;
1386            }
1387
1388            /* read Unicode 1.0 name */
1389            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390            if(length>maxNameLength) {
1391                maxNameLength=length;
1392            }
1393            if(line==lineLimit) {
1394                continue;
1395            }
1396
1397            /* read ISO comment */
1398            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399        }
1400
1401        group=NEXT_GROUP(group);
1402        --groupCount;
1403    }
1404
1405    if(tokenLengths!=NULL) {
1406        uprv_free(tokenLengths);
1407    }
1408
1409    /* set gMax... - name length last for threading */
1410    gMaxNameLength=maxNameLength;
1411}
1412
1413static UBool
1414calcNameSetsLengths(UErrorCode *pErrorCode) {
1415    static const char extChars[]="0123456789ABCDEF<>-";
1416    int32_t i, maxNameLength;
1417
1418    if(gMaxNameLength!=0) {
1419        return TRUE;
1420    }
1421
1422    if(!isDataLoaded(pErrorCode)) {
1423        return FALSE;
1424    }
1425
1426    /* set hex digits, used in various names, and <>-, used in extended names */
1427    for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1428        SET_ADD(gNameSet, extChars[i]);
1429    }
1430
1431    /* set sets and lengths from algorithmic names */
1432    maxNameLength=calcAlgNameSetsLengths(0);
1433
1434    /* set sets and lengths from extended names */
1435    maxNameLength=calcExtNameSetsLengths(maxNameLength);
1436
1437    /* set sets and lengths from group names, set global maximum values */
1438    calcGroupNameSetsLengths(maxNameLength);
1439
1440    return TRUE;
1441}
1442
1443/* public API --------------------------------------------------------------- */
1444
1445U_CAPI int32_t U_EXPORT2
1446u_charName(UChar32 code, UCharNameChoice nameChoice,
1447           char *buffer, int32_t bufferLength,
1448           UErrorCode *pErrorCode) {
1449    AlgorithmicRange *algRange;
1450    uint32_t *p;
1451    uint32_t i;
1452    int32_t length;
1453
1454    /* check the argument values */
1455    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1456        return 0;
1457    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1458              bufferLength<0 || (bufferLength>0 && buffer==NULL)
1459    ) {
1460        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1461        return 0;
1462    }
1463
1464    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1465        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1466    }
1467
1468    length=0;
1469
1470    /* try algorithmic names first */
1471    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1472    i=*p;
1473    algRange=(AlgorithmicRange *)(p+1);
1474    while(i>0) {
1475        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1476            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1477            break;
1478        }
1479        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1480        --i;
1481    }
1482
1483    if(i==0) {
1484        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1485            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1486            if (!length) {
1487                /* extended character name */
1488                length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1489            }
1490        } else {
1491            /* normal character name */
1492            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1493        }
1494    }
1495
1496    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1497}
1498
1499U_CAPI int32_t U_EXPORT2
1500u_getISOComment(UChar32 /*c*/,
1501                char *dest, int32_t destCapacity,
1502                UErrorCode *pErrorCode) {
1503    /* check the argument values */
1504    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1505        return 0;
1506    } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1507        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1508        return 0;
1509    }
1510
1511    return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1512}
1513
1514U_CAPI UChar32 U_EXPORT2
1515u_charFromName(UCharNameChoice nameChoice,
1516               const char *name,
1517               UErrorCode *pErrorCode) {
1518    char upper[120], lower[120];
1519    FindName findName;
1520    AlgorithmicRange *algRange;
1521    uint32_t *p;
1522    uint32_t i;
1523    UChar32 cp = 0;
1524    char c0;
1525    UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1526
1527    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1528        return error;
1529    }
1530
1531    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1532        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1533        return error;
1534    }
1535
1536    if(!isDataLoaded(pErrorCode)) {
1537        return error;
1538    }
1539
1540    /* construct the uppercase and lowercase of the name first */
1541    for(i=0; i<sizeof(upper); ++i) {
1542        if((c0=*name++)!=0) {
1543            upper[i]=uprv_toupper(c0);
1544            lower[i]=uprv_tolower(c0);
1545        } else {
1546            upper[i]=lower[i]=0;
1547            break;
1548        }
1549    }
1550    if(i==sizeof(upper)) {
1551        /* name too long, there is no such character */
1552        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1553        return error;
1554    }
1555
1556    /* try extended names first */
1557    if (lower[0] == '<') {
1558        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1559            if (lower[--i] == '>') {
1560                for (--i; lower[i] && lower[i] != '-'; --i) {
1561                }
1562
1563                if (lower[i] == '-') { /* We've got a category. */
1564                    uint32_t cIdx;
1565
1566                    lower[i] = 0;
1567
1568                    for (++i; lower[i] != '>'; ++i) {
1569                        if (lower[i] >= '0' && lower[i] <= '9') {
1570                            cp = (cp << 4) + lower[i] - '0';
1571                        } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1572                            cp = (cp << 4) + lower[i] - 'a' + 10;
1573                        } else {
1574                            *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1575                            return error;
1576                        }
1577                    }
1578
1579                    /* Now validate the category name.
1580                       We could use a binary search, or a trie, if
1581                       we really wanted to. */
1582
1583                    for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1584
1585                        if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1586                            if (getCharCat(cp) == cIdx) {
1587                                return cp;
1588                            }
1589                            break;
1590                        }
1591                    }
1592                }
1593            }
1594        }
1595
1596        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1597        return error;
1598    }
1599
1600    /* try algorithmic names now */
1601    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1602    i=*p;
1603    algRange=(AlgorithmicRange *)(p+1);
1604    while(i>0) {
1605        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1606            return cp;
1607        }
1608        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1609        --i;
1610    }
1611
1612    /* normal character name */
1613    findName.otherName=upper;
1614    findName.code=error;
1615    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1616    if (findName.code == error) {
1617         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1618    }
1619    return findName.code;
1620}
1621
1622U_CAPI void U_EXPORT2
1623u_enumCharNames(UChar32 start, UChar32 limit,
1624                UEnumCharNamesFn *fn,
1625                void *context,
1626                UCharNameChoice nameChoice,
1627                UErrorCode *pErrorCode) {
1628    AlgorithmicRange *algRange;
1629    uint32_t *p;
1630    uint32_t i;
1631
1632    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1633        return;
1634    }
1635
1636    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1637        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1638        return;
1639    }
1640
1641    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1642        limit = UCHAR_MAX_VALUE + 1;
1643    }
1644    if((uint32_t)start>=(uint32_t)limit) {
1645        return;
1646    }
1647
1648    if(!isDataLoaded(pErrorCode)) {
1649        return;
1650    }
1651
1652    /* interleave the data-driven ones with the algorithmic ones */
1653    /* iterate over all algorithmic ranges; assume that they are in ascending order */
1654    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1655    i=*p;
1656    algRange=(AlgorithmicRange *)(p+1);
1657    while(i>0) {
1658        /* enumerate the character names before the current algorithmic range */
1659        /* here: start<limit */
1660        if((uint32_t)start<algRange->start) {
1661            if((uint32_t)limit<=algRange->start) {
1662                enumNames(uCharNames, start, limit, fn, context, nameChoice);
1663                return;
1664            }
1665            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1666                return;
1667            }
1668            start=(UChar32)algRange->start;
1669        }
1670        /* enumerate the character names in the current algorithmic range */
1671        /* here: algRange->start<=start<limit */
1672        if((uint32_t)start<=algRange->end) {
1673            if((uint32_t)limit<=(algRange->end+1)) {
1674                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1675                return;
1676            }
1677            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1678                return;
1679            }
1680            start=(UChar32)algRange->end+1;
1681        }
1682        /* continue to the next algorithmic range (here: start<limit) */
1683        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1684        --i;
1685    }
1686    /* enumerate the character names after the last algorithmic range */
1687    enumNames(uCharNames, start, limit, fn, context, nameChoice);
1688}
1689
1690U_CAPI int32_t U_EXPORT2
1691uprv_getMaxCharNameLength() {
1692    UErrorCode errorCode=U_ZERO_ERROR;
1693    if(calcNameSetsLengths(&errorCode)) {
1694        return gMaxNameLength;
1695    } else {
1696        return 0;
1697    }
1698}
1699
1700/**
1701 * Converts the char set cset into a Unicode set uset.
1702 * @param cset Set of 256 bit flags corresponding to a set of chars.
1703 * @param uset USet to receive characters. Existing contents are deleted.
1704 */
1705static void
1706charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1707    UChar us[256];
1708    char cs[256];
1709
1710    int32_t i, length;
1711    UErrorCode errorCode;
1712
1713    errorCode=U_ZERO_ERROR;
1714
1715    if(!calcNameSetsLengths(&errorCode)) {
1716        return;
1717    }
1718
1719    /* build a char string with all chars that are used in character names */
1720    length=0;
1721    for(i=0; i<256; ++i) {
1722        if(SET_CONTAINS(cset, i)) {
1723            cs[length++]=(char)i;
1724        }
1725    }
1726
1727    /* convert the char string to a UChar string */
1728    u_charsToUChars(cs, us, length);
1729
1730    /* add each UChar to the USet */
1731    for(i=0; i<length; ++i) {
1732        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1733            sa->add(sa->set, us[i]);
1734        }
1735    }
1736}
1737
/**
 * Passes the characters recorded in gNameSet, the set of characters
 * that are used in Unicode character names, to the given USetAdder.
 * @param sa USetAdder that receives the characters.
 */
U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(const USetAdder *sa) {
    charSetToUSet(gNameSet, sa);
}
1746
1747/* data swapping ------------------------------------------------------------ */
1748
1749/*
1750 * The token table contains non-negative entries for token bytes,
1751 * and -1 for bytes that represent themselves in the data file's charset.
1752 * -2 entries are used for lead bytes.
1753 *
1754 * Direct bytes (-1 entries) must be translated from the input charset family
1755 * to the output charset family.
1756 * makeTokenMap() writes a permutation mapping for this.
1757 * Use it once for single-/lead-byte tokens and once more for all trail byte
1758 * tokens. (';' is an unused trail byte marked with -1.)
1759 */
1760static void
1761makeTokenMap(const UDataSwapper *ds,
1762             int16_t tokens[], uint16_t tokenCount,
1763             uint8_t map[256],
1764             UErrorCode *pErrorCode) {
1765    UBool usedOutChar[256];
1766    uint16_t i, j;
1767    uint8_t c1, c2;
1768
1769    if(U_FAILURE(*pErrorCode)) {
1770        return;
1771    }
1772
1773    if(ds->inCharset==ds->outCharset) {
1774        /* Same charset family: identity permutation */
1775        for(i=0; i<256; ++i) {
1776            map[i]=(uint8_t)i;
1777        }
1778    } else {
1779        uprv_memset(map, 0, 256);
1780        uprv_memset(usedOutChar, 0, 256);
1781
1782        if(tokenCount>256) {
1783            tokenCount=256;
1784        }
1785
1786        /* set the direct bytes (byte 0 always maps to itself) */
1787        for(i=1; i<tokenCount; ++i) {
1788            if(tokens[i]==-1) {
1789                /* convert the direct byte character */
1790                c1=(uint8_t)i;
1791                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1792                if(U_FAILURE(*pErrorCode)) {
1793                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1794                                     i, ds->inCharset);
1795                    return;
1796                }
1797
1798                /* enter the converted character into the map and mark it used */
1799                map[c1]=c2;
1800                usedOutChar[c2]=TRUE;
1801            }
1802        }
1803
1804        /* set the mappings for the rest of the permutation */
1805        for(i=j=1; i<tokenCount; ++i) {
1806            /* set mappings that were not set for direct bytes */
1807            if(map[i]==0) {
1808                /* set an output byte value that was not used as an output byte above */
1809                while(usedOutChar[j]) {
1810                    ++j;
1811                }
1812                map[i]=(uint8_t)j++;
1813            }
1814        }
1815
1816        /*
1817         * leave mappings at tokenCount and above unset if tokenCount<256
1818         * because they won't be used
1819         */
1820    }
1821}
1822
1823U_CAPI int32_t U_EXPORT2
1824uchar_swapNames(const UDataSwapper *ds,
1825                const void *inData, int32_t length, void *outData,
1826                UErrorCode *pErrorCode) {
1827    const UDataInfo *pInfo;
1828    int32_t headerSize;
1829
1830    const uint8_t *inBytes;
1831    uint8_t *outBytes;
1832
1833    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1834             offset, i, count, stringsCount;
1835
1836    const AlgorithmicRange *inRange;
1837    AlgorithmicRange *outRange;
1838
1839    /* udata_swapDataHeader checks the arguments */
1840    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1841    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1842        return 0;
1843    }
1844
1845    /* check data format and format version */
1846    pInfo=(const UDataInfo *)((const char *)inData+4);
1847    if(!(
1848        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
1849        pInfo->dataFormat[1]==0x6e &&
1850        pInfo->dataFormat[2]==0x61 &&
1851        pInfo->dataFormat[3]==0x6d &&
1852        pInfo->formatVersion[0]==1
1853    )) {
1854        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1855                         pInfo->dataFormat[0], pInfo->dataFormat[1],
1856                         pInfo->dataFormat[2], pInfo->dataFormat[3],
1857                         pInfo->formatVersion[0]);
1858        *pErrorCode=U_UNSUPPORTED_ERROR;
1859        return 0;
1860    }
1861
1862    inBytes=(const uint8_t *)inData+headerSize;
1863    outBytes=(uint8_t *)outData+headerSize;
1864    if(length<0) {
1865        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1866    } else {
1867        length-=headerSize;
1868        if( length<20 ||
1869            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1870        ) {
1871            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1872                             length);
1873            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1874            return 0;
1875        }
1876    }
1877
1878    if(length<0) {
1879        /* preflighting: iterate through algorithmic ranges */
1880        offset=algNamesOffset;
1881        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1882        offset+=4;
1883
1884        for(i=0; i<count; ++i) {
1885            inRange=(const AlgorithmicRange *)(inBytes+offset);
1886            offset+=ds->readUInt16(inRange->size);
1887        }
1888    } else {
1889        /* swap data */
1890        const uint16_t *p;
1891        uint16_t *q, *temp;
1892
1893        int16_t tokens[512];
1894        uint16_t tokenCount;
1895
1896        uint8_t map[256], trailMap[256];
1897
1898        /* copy the data for inaccessible bytes */
1899        if(inBytes!=outBytes) {
1900            uprv_memcpy(outBytes, inBytes, length);
1901        }
1902
1903        /* the initial 4 offsets first */
1904        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1905        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1906        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1907        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1908
1909        /*
1910         * now the tokens table
1911         * it needs to be permutated along with the compressed name strings
1912         */
1913        p=(const uint16_t *)(inBytes+16);
1914        q=(uint16_t *)(outBytes+16);
1915
1916        /* read and swap the tokenCount */
1917        tokenCount=ds->readUInt16(*p);
1918        ds->swapArray16(ds, p, 2, q, pErrorCode);
1919        ++p;
1920        ++q;
1921
1922        /* read the first 512 tokens and make the token maps */
1923        if(tokenCount<=512) {
1924            count=tokenCount;
1925        } else {
1926            count=512;
1927        }
1928        for(i=0; i<count; ++i) {
1929            tokens[i]=udata_readInt16(ds, p[i]);
1930        }
1931        for(; i<512; ++i) {
1932            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1933        }
1934        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1935        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1936        if(U_FAILURE(*pErrorCode)) {
1937            return 0;
1938        }
1939
1940        /*
1941         * swap and permutate the tokens
1942         * go through a temporary array to support in-place swapping
1943         */
1944        temp=(uint16_t *)uprv_malloc(tokenCount*2);
1945        if(temp==NULL) {
1946            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1947                             tokenCount);
1948            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1949            return 0;
1950        }
1951
1952        /* swap and permutate single-/lead-byte tokens */
1953        for(i=0; i<tokenCount && i<256; ++i) {
1954            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1955        }
1956
1957        /* swap and permutate trail-byte tokens */
1958        for(; i<tokenCount; ++i) {
1959            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1960        }
1961
1962        /* copy the result into the output and free the temporary array */
1963        uprv_memcpy(q, temp, tokenCount*2);
1964        uprv_free(temp);
1965
1966        /*
1967         * swap the token strings but not a possible padding byte after
1968         * the terminating NUL of the last string
1969         */
1970        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1971                                    outBytes+tokenStringOffset, pErrorCode);
1972        if(U_FAILURE(*pErrorCode)) {
1973            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1974            return 0;
1975        }
1976
1977        /* swap the group table */
1978        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1979        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1980                           outBytes+groupsOffset, pErrorCode);
1981
1982        /*
1983         * swap the group strings
1984         * swap the string bytes but not the nibble-encoded string lengths
1985         */
1986        if(ds->inCharset!=ds->outCharset) {
1987            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1988
1989            const uint8_t *inStrings, *nextInStrings;
1990            uint8_t *outStrings;
1991
1992            uint8_t c;
1993
1994            inStrings=inBytes+groupStringOffset;
1995            outStrings=outBytes+groupStringOffset;
1996
1997            stringsCount=algNamesOffset-groupStringOffset;
1998
1999            /* iterate through string groups until only a few padding bytes are left */
2000            while(stringsCount>32) {
2001                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2002
2003                /* move past the length bytes */
2004                stringsCount-=(uint32_t)(nextInStrings-inStrings);
2005                outStrings+=nextInStrings-inStrings;
2006                inStrings=nextInStrings;
2007
2008                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2009                stringsCount-=count;
2010
2011                /* swap the string bytes using map[] and trailMap[] */
2012                while(count>0) {
2013                    c=*inStrings++;
2014                    *outStrings++=map[c];
2015                    if(tokens[c]!=-2) {
2016                        --count;
2017                    } else {
2018                        /* token lead byte: swap the trail byte, too */
2019                        *outStrings++=trailMap[*inStrings++];
2020                        count-=2;
2021                    }
2022                }
2023            }
2024        }
2025
2026        /* swap the algorithmic ranges */
2027        offset=algNamesOffset;
2028        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2029        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2030        offset+=4;
2031
2032        for(i=0; i<count; ++i) {
2033            if(offset>(uint32_t)length) {
2034                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2035                                 length, i);
2036                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2037                return 0;
2038            }
2039
2040            inRange=(const AlgorithmicRange *)(inBytes+offset);
2041            outRange=(AlgorithmicRange *)(outBytes+offset);
2042            offset+=ds->readUInt16(inRange->size);
2043
2044            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2045            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2046            switch(inRange->type) {
2047            case 0:
2048                /* swap prefix string */
2049                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2050                                    outRange+1, pErrorCode);
2051                if(U_FAILURE(*pErrorCode)) {
2052                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2053                                     i);
2054                    return 0;
2055                }
2056                break;
2057            case 1:
2058                {
2059                    /* swap factors and the prefix and factor strings */
2060                    uint32_t factorsCount;
2061
2062                    factorsCount=inRange->variant;
2063                    p=(const uint16_t *)(inRange+1);
2064                    q=(uint16_t *)(outRange+1);
2065                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2066
2067                    /* swap the strings, up to the last terminating NUL */
2068                    p+=factorsCount;
2069                    q+=factorsCount;
2070                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2071                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2072                        --stringsCount;
2073                    }
2074                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2075                }
2076                break;
2077            default:
2078                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2079                                 inRange->type, i);
2080                *pErrorCode=U_UNSUPPORTED_ERROR;
2081                return 0;
2082            }
2083        }
2084    }
2085
2086    return headerSize+(int32_t)offset;
2087}
2088
2089U_NAMESPACE_END
2090
2091/*
2092 * Hey, Emacs, please set the following:
2093 *
2094 * Local Variables:
2095 * indent-tabs-mode: nil
2096 * End:
2097 *
2098 */
2099