1/*
2******************************************************************************
3*
4*   Copyright (C) 1999-2013, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  unames.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 1999oct04
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/putil.h"
19#include "unicode/uchar.h"
20#include "unicode/udata.h"
21#include "unicode/utf.h"
22#include "unicode/utf16.h"
23#include "uassert.h"
24#include "ustr_imp.h"
25#include "umutex.h"
26#include "cmemory.h"
27#include "cstring.h"
28#include "ucln_cmn.h"
29#include "udataswp.h"
30#include "uprops.h"
31
32U_NAMESPACE_BEGIN
33
34/* prototypes ------------------------------------------------------------- */
35
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro can be used as an
 * operand of any operator, including sizeof.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* name and type of the data item that is opened with udata_openChoice() */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of LINES_PER_GROUP consecutive code points:
 * code>>GROUP_SHIFT selects the group, code&GROUP_MASK the line in the group.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
44
45/*
46 * This struct was replaced by explicitly accessing equivalent
47 * fields from triples of uint16_t.
48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50 * would advance by 6 bytes (3 uint16_t).
51 *
52 * We can't just change the data structure because it's loaded from a data file,
53 * and we don't want to make it less compact, so we changed the access code.
54 *
55 * For details see ICU tickets 6331 and 6008.
56typedef struct {
57    uint16_t groupMSB,
58             offsetHigh, offsetLow; / * avoid padding * /
59} Group;
60 */
/*
 * Indexes of the uint16_t fields inside one group triple;
 * GROUP_LENGTH is the number of uint16_t per group.
 */
enum {
    GROUP_MSB,
    GROUP_OFFSET_HIGH,
    GROUP_OFFSET_LOW,
    GROUP_LENGTH
};

/*
 * Get the 32-bit group offset.
 * The high half is widened to uint32_t before it is shifted so that a
 * value >=0x8000 is never left-shifted into the sign bit of a signed integer.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))

/* step forward/backward by one group triple */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
77
/*
 * Header of one algorithmic-name range.
 * The range-specific data (prefix, factors, element strings) directly
 * follows this header in memory.
 */
typedef struct {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range */
    uint8_t type;       /* 0: prefix + hex digits, 1: prefix + factorized elements */
    uint8_t variant;    /* type 0: number of hex digits, type 1: number of factors */
    uint16_t size;      /* presumably the byte size of header plus data - confirm against the data format */
} AlgorithmicRange;
83
/*
 * Header at the start of the names data.
 * Each field is a byte offset that is added to the address of this header.
 */
typedef struct {
    uint32_t tokenStringOffset;     /* start of the token strings */
    uint32_t groupsOffset;          /* start of the groups table */
    uint32_t groupStringOffset;     /* start of the group strings */
    uint32_t algNamesOffset;        /* start of the algorithmic names data */
} UCharNames;
87
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be indexed directly and can take any pointer expression.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
98
99typedef struct {
100    const char *otherName;
101    UChar32 code;
102} FindName;
103
104#define DO_FIND_NAME NULL
105
106static UDataMemory *uCharNamesData=NULL;
107static UCharNames *uCharNames=NULL;
108static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
109
110/*
111 * Maximum length of character names (regular & 1.0).
112 */
113static int32_t gMaxNameLength=0;
114
115/*
116 * Set of chars used in character names (regular & 1.0).
117 * Chars are platform-dependent (can be EBCDIC).
118 */
119static uint32_t gNameSet[8]={ 0 };
120
/*
 * Pseudo categories that extend the general categories for extended names.
 * Each expansion is parenthesized so that the macros are safe inside
 * larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* number of general categories plus the three pseudo categories above */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
126
127static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
128    "unassigned",
129    "uppercase letter",
130    "lowercase letter",
131    "titlecase letter",
132    "modifier letter",
133    "other letter",
134    "non spacing mark",
135    "enclosing mark",
136    "combining spacing mark",
137    "decimal digit number",
138    "letter number",
139    "other number",
140    "space separator",
141    "line separator",
142    "paragraph separator",
143    "control",
144    "format",
145    "private use area",
146    "surrogate",
147    "dash punctuation",
148    "start punctuation",
149    "end punctuation",
150    "connector punctuation",
151    "other punctuation",
152    "math symbol",
153    "currency symbol",
154    "modifier symbol",
155    "other symbol",
156    "initial punctuation",
157    "final punctuation",
158    "noncharacter",
159    "lead surrogate",
160    "trail surrogate"
161};
162
163/* implementation ----------------------------------------------------------- */
164
165static UBool U_CALLCONV unames_cleanup(void)
166{
167    if(uCharNamesData) {
168        udata_close(uCharNamesData);
169        uCharNamesData = NULL;
170    }
171    if(uCharNames) {
172        uCharNames = NULL;
173    }
174    gCharNamesInitOnce.reset();
175    gMaxNameLength=0;
176    return TRUE;
177}
178
179static UBool U_CALLCONV
180isAcceptable(void * /*context*/,
181             const char * /*type*/, const char * /*name*/,
182             const UDataInfo *pInfo) {
183    return (UBool)(
184        pInfo->size>=20 &&
185        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
186        pInfo->charsetFamily==U_CHARSET_FAMILY &&
187        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
188        pInfo->dataFormat[1]==0x6e &&
189        pInfo->dataFormat[2]==0x61 &&
190        pInfo->dataFormat[3]==0x6d &&
191        pInfo->formatVersion[0]==1);
192}
193
194static void U_CALLCONV
195loadCharNames(UErrorCode &status) {
196    U_ASSERT(uCharNamesData == NULL);
197    U_ASSERT(uCharNames == NULL);
198
199    uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
200    if(U_FAILURE(status)) {
201        uCharNamesData = NULL;
202    } else {
203        uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
204    }
205    ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
206}
207
208
209static UBool
210isDataLoaded(UErrorCode *pErrorCode) {
211    umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
212    return U_SUCCESS(*pErrorCode);
213}
214
/*
 * Append one char to an output buffer with truncation:
 * c is stored and the buffer pointer/remaining length are adjusted only
 * while there is room; bufferPos is always incremented so that it counts
 * the full, untruncated length.
 *
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 * The body is wrapped in do { } while(0) so that "WRITE_CHAR(...);" is a
 * single statement (safe in an unbraced if/else), and c is parenthesized.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/*
 * Internal pseudo name choice, one past the regular name choices;
 * expandName() and compareName() map it to data field index 2.
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224
225/*
226 * Important: expandName() and compareName() are almost the same -
227 * apply fixes to both.
228 *
229 * UnicodeData.txt uses ';' as a field separator, so no
230 * field can contain ';' as part of its contents.
231 * In unames.dat, it is marked as token[';']==-1 only if the
232 * semicolon is used in the data file - which is iff we
233 * have Unicode 1.0 names or ISO comments or aliases.
234 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235 * although we know that it will never be part of a name.
236 */
237static uint16_t
238expandName(UCharNames *names,
239           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
240           char *buffer, uint16_t bufferLength) {
241    uint16_t *tokens=(uint16_t *)names+8;
242    uint16_t token, tokenCount=*tokens++, bufferPos=0;
243    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
244    uint8_t c;
245
246    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
247        /*
248         * skip the modern name if it is not requested _and_
249         * if the semicolon byte value is a character, not a token number
250         */
251        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
252            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
253            do {
254                while(nameLength>0) {
255                    --nameLength;
256                    if(*name++==';') {
257                        break;
258                    }
259                }
260            } while(--fieldIndex>0);
261        } else {
262            /*
263             * the semicolon byte value is a token number, therefore
264             * only modern names are stored in unames.dat and there is no
265             * such requested alternate name here
266             */
267            nameLength=0;
268        }
269    }
270
271    /* write each letter directly, and write a token word per token */
272    while(nameLength>0) {
273        --nameLength;
274        c=*name++;
275
276        if(c>=tokenCount) {
277            if(c!=';') {
278                /* implicit letter */
279                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
280            } else {
281                /* finished */
282                break;
283            }
284        } else {
285            token=tokens[c];
286            if(token==(uint16_t)(-2)) {
287                /* this is a lead byte for a double-byte token */
288                token=tokens[c<<8|*name++];
289                --nameLength;
290            }
291            if(token==(uint16_t)(-1)) {
292                if(c!=';') {
293                    /* explicit letter */
294                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
295                } else {
296                    /* stop, but skip the semicolon if we are seeking
297                       extended names and there was no 2.0 name but there
298                       is a 1.0 name. */
299                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
300                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
301                            continue;
302                        }
303                    }
304                    /* finished */
305                    break;
306                }
307            } else {
308                /* write token word */
309                uint8_t *tokenString=tokenStrings+token;
310                while((c=*tokenString++)!=0) {
311                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
312                }
313            }
314        }
315    }
316
317    /* zero-terminate */
318    if(bufferLength>0) {
319        *buffer=0;
320    }
321
322    return bufferPos;
323}
324
325/*
326 * compareName() is almost the same as expandName() except that it compares
327 * the currently expanded name to an input name.
328 * It returns the match/no match result as soon as possible.
329 */
330static UBool
331compareName(UCharNames *names,
332            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
333            const char *otherName) {
334    uint16_t *tokens=(uint16_t *)names+8;
335    uint16_t token, tokenCount=*tokens++;
336    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
337    uint8_t c;
338    const char *origOtherName = otherName;
339
340    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
341        /*
342         * skip the modern name if it is not requested _and_
343         * if the semicolon byte value is a character, not a token number
344         */
345        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
346            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
347            do {
348                while(nameLength>0) {
349                    --nameLength;
350                    if(*name++==';') {
351                        break;
352                    }
353                }
354            } while(--fieldIndex>0);
355        } else {
356            /*
357             * the semicolon byte value is a token number, therefore
358             * only modern names are stored in unames.dat and there is no
359             * such requested alternate name here
360             */
361            nameLength=0;
362        }
363    }
364
365    /* compare each letter directly, and compare a token word per token */
366    while(nameLength>0) {
367        --nameLength;
368        c=*name++;
369
370        if(c>=tokenCount) {
371            if(c!=';') {
372                /* implicit letter */
373                if((char)c!=*otherName++) {
374                    return FALSE;
375                }
376            } else {
377                /* finished */
378                break;
379            }
380        } else {
381            token=tokens[c];
382            if(token==(uint16_t)(-2)) {
383                /* this is a lead byte for a double-byte token */
384                token=tokens[c<<8|*name++];
385                --nameLength;
386            }
387            if(token==(uint16_t)(-1)) {
388                if(c!=';') {
389                    /* explicit letter */
390                    if((char)c!=*otherName++) {
391                        return FALSE;
392                    }
393                } else {
394                    /* stop, but skip the semicolon if we are seeking
395                       extended names and there was no 2.0 name but there
396                       is a 1.0 name. */
397                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
398                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
399                            continue;
400                        }
401                    }
402                    /* finished */
403                    break;
404                }
405            } else {
406                /* write token word */
407                uint8_t *tokenString=tokenStrings+token;
408                while((c=*tokenString++)!=0) {
409                    if((char)c!=*otherName++) {
410                        return FALSE;
411                    }
412                }
413            }
414        }
415    }
416
417    /* complete match? */
418    return (UBool)(*otherName==0);
419}
420
421static uint8_t getCharCat(UChar32 cp) {
422    uint8_t cat;
423
424    if (U_IS_UNICODE_NONCHAR(cp)) {
425        return U_NONCHARACTER_CODE_POINT;
426    }
427
428    if ((cat = u_charType(cp)) == U_SURROGATE) {
429        cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
430    }
431
432    return cat;
433}
434
435static const char *getCharCatName(UChar32 cp) {
436    uint8_t cat = getCharCat(cp);
437
438    /* Return unknown if the table of names above is not up to
439       date. */
440
441    if (cat >= LENGTHOF(charCatNames)) {
442        return "unknown";
443    } else {
444        return charCatNames[cat];
445    }
446}
447
448static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449    const char *catname = getCharCatName(code);
450    uint16_t length = 0;
451
452    UChar32 cp;
453    int ndigits, i;
454
455    WRITE_CHAR(buffer, bufferLength, length, '<');
456    while (catname[length - 1]) {
457        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
458    }
459    WRITE_CHAR(buffer, bufferLength, length, '-');
460    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
461        ;
462    if (ndigits < 4)
463        ndigits = 4;
464    for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465        uint8_t v = (uint8_t)(cp & 0xf);
466        buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
467    }
468    buffer += ndigits;
469    length += ndigits;
470    WRITE_CHAR(buffer, bufferLength, length, '>');
471
472    return length;
473}
474
475/*
476 * getGroup() does a binary search for the group that contains the
477 * Unicode code point "code".
478 * The return value is always a valid Group* that may contain "code"
479 * or else is the highest group before "code".
480 * If the lowest group is after "code", then that one is returned.
481 */
482static const uint16_t *
483getGroup(UCharNames *names, uint32_t code) {
484    const uint16_t *groups=GET_GROUPS(names);
485    uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
486             start=0,
487             limit=*groups++,
488             number;
489
490    /* binary search for the group of names that contains the one for code */
491    while(start<limit-1) {
492        number=(uint16_t)((start+limit)/2);
493        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
494            limit=number;
495        } else {
496            start=number;
497        }
498    }
499
500    /* return this regardless of whether it is an exact match */
501    return groups+start*GROUP_LENGTH;
502}
503
504/*
505 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506 * expands them into offsets and lengths for each string.
507 * Lengths are stored with a variable-width encoding in consecutive nibbles:
508 * If a nibble<0xc, then it is the length itself (0=empty string).
509 * If a nibble>=0xc, then it forms a length value with the following nibble.
510 * Calculation see below.
511 * The offsets and lengths arrays must be at least 33 (one more) long because
512 * there is no check here at the end if the last nibble is still used.
513 */
514static const uint8_t *
515expandGroupLengths(const uint8_t *s,
516                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
517    /* read the lengths of the 32 strings in this group and get each string's offset */
518    uint16_t i=0, offset=0, length=0;
519    uint8_t lengthByte;
520
521    /* all 32 lengths must be read to get the offset of the first group string */
522    while(i<LINES_PER_GROUP) {
523        lengthByte=*s++;
524
525        /* read even nibble - MSBs of lengthByte */
526        if(length>=12) {
527            /* double-nibble length spread across two bytes */
528            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
529            lengthByte&=0xf;
530        } else if((lengthByte /* &0xf0 */)>=0xc0) {
531            /* double-nibble length spread across this one byte */
532            length=(uint16_t)((lengthByte&0x3f)+12);
533        } else {
534            /* single-nibble length in MSBs */
535            length=(uint16_t)(lengthByte>>4);
536            lengthByte&=0xf;
537        }
538
539        *offsets++=offset;
540        *lengths++=length;
541
542        offset+=length;
543        ++i;
544
545        /* read odd nibble - LSBs of lengthByte */
546        if((lengthByte&0xf0)==0) {
547            /* this nibble was not consumed for a double-nibble length above */
548            length=lengthByte;
549            if(length<12) {
550                /* single-nibble length in LSBs */
551                *offsets++=offset;
552                *lengths++=length;
553
554                offset+=length;
555                ++i;
556            }
557        } else {
558            length=0;   /* prevent double-nibble detection in the next iteration */
559        }
560    }
561
562    /* now, s is at the first group string */
563    return s;
564}
565
566static uint16_t
567expandGroupName(UCharNames *names, const uint16_t *group,
568                uint16_t lineNumber, UCharNameChoice nameChoice,
569                char *buffer, uint16_t bufferLength) {
570    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
572    s=expandGroupLengths(s, offsets, lengths);
573    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574                      buffer, bufferLength);
575}
576
577static uint16_t
578getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579        char *buffer, uint16_t bufferLength) {
580    const uint16_t *group=getGroup(names, code);
581    if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
582        return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
583                               buffer, bufferLength);
584    } else {
585        /* group not found */
586        /* zero-terminate */
587        if(bufferLength>0) {
588            *buffer=0;
589        }
590        return 0;
591    }
592}
593
594/*
595 * enumGroupNames() enumerates all the names in a 32-group
596 * and either calls the enumerator function or finds a given input name.
597 */
598static UBool
599enumGroupNames(UCharNames *names, const uint16_t *group,
600               UChar32 start, UChar32 end,
601               UEnumCharNamesFn *fn, void *context,
602               UCharNameChoice nameChoice) {
603    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
605
606    s=expandGroupLengths(s, offsets, lengths);
607    if(fn!=DO_FIND_NAME) {
608        char buffer[200];
609        uint16_t length;
610
611        while(start<=end) {
612            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
615            }
616            /* here, we assume that the buffer is large enough */
617            if(length>0) {
618                if(!fn(context, start, nameChoice, buffer, length)) {
619                    return FALSE;
620                }
621            }
622            ++start;
623        }
624    } else {
625        const char *otherName=((FindName *)context)->otherName;
626        while(start<=end) {
627            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628                ((FindName *)context)->code=start;
629                return FALSE;
630            }
631            ++start;
632        }
633    }
634    return TRUE;
635}
636
637/*
638 * enumExtNames enumerate extended names.
639 * It only needs to do it if it is called with a real function and not
640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641 * for extended names by itself.
642 */
643static UBool
644enumExtNames(UChar32 start, UChar32 end,
645             UEnumCharNamesFn *fn, void *context)
646{
647    if(fn!=DO_FIND_NAME) {
648        char buffer[200];
649        uint16_t length;
650
651        while(start<=end) {
652            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653            /* here, we assume that the buffer is large enough */
654            if(length>0) {
655                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
656                    return FALSE;
657                }
658            }
659            ++start;
660        }
661    }
662
663    return TRUE;
664}
665
666static UBool
667enumNames(UCharNames *names,
668          UChar32 start, UChar32 limit,
669          UEnumCharNamesFn *fn, void *context,
670          UCharNameChoice nameChoice) {
671    uint16_t startGroupMSB, endGroupMSB, groupCount;
672    const uint16_t *group, *groupLimit;
673
674    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
675    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
676
677    /* find the group that contains start, or the highest before it */
678    group=getGroup(names, start);
679
680    if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
681        /* enumerate synthetic names between start and the group start */
682        UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
683        if(extLimit>limit) {
684            extLimit=limit;
685        }
686        if(!enumExtNames(start, extLimit-1, fn, context)) {
687            return FALSE;
688        }
689        start=extLimit;
690    }
691
692    if(startGroupMSB==endGroupMSB) {
693        if(startGroupMSB==group[GROUP_MSB]) {
694            /* if start and limit-1 are in the same group, then enumerate only in that one */
695            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
696        }
697    } else {
698        const uint16_t *groups=GET_GROUPS(names);
699        groupCount=*groups++;
700        groupLimit=groups+groupCount*GROUP_LENGTH;
701
702        if(startGroupMSB==group[GROUP_MSB]) {
703            /* enumerate characters in the partial start group */
704            if((start&GROUP_MASK)!=0) {
705                if(!enumGroupNames(names, group,
706                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
707                                   fn, context, nameChoice)) {
708                    return FALSE;
709                }
710                group=NEXT_GROUP(group); /* continue with the next group */
711            }
712        } else if(startGroupMSB>group[GROUP_MSB]) {
713            /* make sure that we start enumerating with the first group after start */
714            const uint16_t *nextGroup=NEXT_GROUP(group);
715            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
716                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
717                if (end > limit) {
718                    end = limit;
719                }
720                if (!enumExtNames(start, end - 1, fn, context)) {
721                    return FALSE;
722                }
723            }
724            group=nextGroup;
725        }
726
727        /* enumerate entire groups between the start- and end-groups */
728        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
729            const uint16_t *nextGroup;
730            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
731            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
732                return FALSE;
733            }
734            nextGroup=NEXT_GROUP(group);
735            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
736                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
737                if (end > limit) {
738                    end = limit;
739                }
740                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
741                    return FALSE;
742                }
743            }
744            group=nextGroup;
745        }
746
747        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
748        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
749            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
750        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
751            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
752            if (next > start) {
753                start = next;
754            }
755        } else {
756            return TRUE;
757        }
758    }
759
760    /* we have not found a group, which means everything is made of
761       extended names. */
762    if (nameChoice == U_EXTENDED_CHAR_NAME) {
763        if (limit > UCHAR_MAX_VALUE + 1) {
764            limit = UCHAR_MAX_VALUE + 1;
765        }
766        return enumExtNames(start, limit - 1, fn, context);
767    }
768
769    return TRUE;
770}
771
772static uint16_t
773writeFactorSuffix(const uint16_t *factors, uint16_t count,
774                  const char *s, /* suffix elements */
775                  uint32_t code,
776                  uint16_t indexes[8], /* output fields from here */
777                  const char *elementBases[8], const char *elements[8],
778                  char *buffer, uint16_t bufferLength) {
779    uint16_t i, factor, bufferPos=0;
780    char c;
781
782    /* write elements according to the factors */
783
784    /*
785     * the factorized elements are determined by modulo arithmetic
786     * with the factors of this algorithm
787     *
788     * note that for fewer operations, count is decremented here
789     */
790    --count;
791    for(i=count; i>0; --i) {
792        factor=factors[i];
793        indexes[i]=(uint16_t)(code%factor);
794        code/=factor;
795    }
796    /*
797     * we don't need to calculate the last modulus because start<=code<=end
798     * guarantees here that code<=factors[0]
799     */
800    indexes[0]=(uint16_t)code;
801
802    /* write each element */
803    for(;;) {
804        if(elementBases!=NULL) {
805            *elementBases++=s;
806        }
807
808        /* skip indexes[i] strings */
809        factor=indexes[i];
810        while(factor>0) {
811            while(*s++!=0) {}
812            --factor;
813        }
814        if(elements!=NULL) {
815            *elements++=s;
816        }
817
818        /* write element */
819        while((c=*s++)!=0) {
820            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
821        }
822
823        /* we do not need to perform the rest of this loop for i==count - break here */
824        if(i>=count) {
825            break;
826        }
827
828        /* skip the rest of the strings for this factors[i] */
829        factor=(uint16_t)(factors[i]-indexes[i]-1);
830        while(factor>0) {
831            while(*s++!=0) {}
832            --factor;
833        }
834
835        ++i;
836    }
837
838    /* zero-terminate */
839    if(bufferLength>0) {
840        *buffer=0;
841    }
842
843    return bufferPos;
844}
845
846/*
847 * Important:
848 * Parts of findAlgName() are almost the same as some of getAlgName().
849 * Fixes must be applied to both.
850 */
851static uint16_t
852getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
853        char *buffer, uint16_t bufferLength) {
854    uint16_t bufferPos=0;
855
856    /* Only the normative character name can be algorithmic. */
857    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
858        /* zero-terminate */
859        if(bufferLength>0) {
860            *buffer=0;
861        }
862        return 0;
863    }
864
865    switch(range->type) {
866    case 0: {
867        /* name = prefix hex-digits */
868        const char *s=(const char *)(range+1);
869        char c;
870
871        uint16_t i, count;
872
873        /* copy prefix */
874        while((c=*s++)!=0) {
875            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
876        }
877
878        /* write hexadecimal code point value */
879        count=range->variant;
880
881        /* zero-terminate */
882        if(count<bufferLength) {
883            buffer[count]=0;
884        }
885
886        for(i=count; i>0;) {
887            if(--i<bufferLength) {
888                c=(char)(code&0xf);
889                if(c<10) {
890                    c+='0';
891                } else {
892                    c+='A'-10;
893                }
894                buffer[i]=c;
895            }
896            code>>=4;
897        }
898
899        bufferPos+=count;
900        break;
901    }
902    case 1: {
903        /* name = prefix factorized-elements */
904        uint16_t indexes[8];
905        const uint16_t *factors=(const uint16_t *)(range+1);
906        uint16_t count=range->variant;
907        const char *s=(const char *)(factors+count);
908        char c;
909
910        /* copy prefix */
911        while((c=*s++)!=0) {
912            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
913        }
914
915        bufferPos+=writeFactorSuffix(factors, count,
916                                     s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
917        break;
918    }
919    default:
920        /* undefined type */
921        /* zero-terminate */
922        if(bufferLength>0) {
923            *buffer=0;
924        }
925        break;
926    }
927
928    return bufferPos;
929}
930
931/*
932 * Important: enumAlgNames() and findAlgName() are almost the same.
933 * Any fix must be applied to both.
934 */
935static UBool
936enumAlgNames(AlgorithmicRange *range,
937             UChar32 start, UChar32 limit,
938             UEnumCharNamesFn *fn, void *context,
939             UCharNameChoice nameChoice) {
940    char buffer[200];
941    uint16_t length;
942
943    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
944        return TRUE;
945    }
946
947    switch(range->type) {
948    case 0: {
949        char *s, *end;
950        char c;
951
952        /* get the full name of the start character */
953        length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
954        if(length<=0) {
955            return TRUE;
956        }
957
958        /* call the enumerator function with this first character */
959        if(!fn(context, start, nameChoice, buffer, length)) {
960            return FALSE;
961        }
962
963        /* go to the end of the name; all these names have the same length */
964        end=buffer;
965        while(*end!=0) {
966            ++end;
967        }
968
969        /* enumerate the rest of the names */
970        while(++start<limit) {
971            /* increment the hexadecimal number on a character-basis */
972            s=end;
973            for (;;) {
974                c=*--s;
975                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
976                    *s=(char)(c+1);
977                    break;
978                } else if(c=='9') {
979                    *s='A';
980                    break;
981                } else if(c=='F') {
982                    *s='0';
983                }
984            }
985
986            if(!fn(context, start, nameChoice, buffer, length)) {
987                return FALSE;
988            }
989        }
990        break;
991    }
992    case 1: {
993        uint16_t indexes[8];
994        const char *elementBases[8], *elements[8];
995        const uint16_t *factors=(const uint16_t *)(range+1);
996        uint16_t count=range->variant;
997        const char *s=(const char *)(factors+count);
998        char *suffix, *t;
999        uint16_t prefixLength, i, idx;
1000
1001        char c;
1002
1003        /* name = prefix factorized-elements */
1004
1005        /* copy prefix */
1006        suffix=buffer;
1007        prefixLength=0;
1008        while((c=*s++)!=0) {
1009            *suffix++=c;
1010            ++prefixLength;
1011        }
1012
1013        /* append the suffix of the start character */
1014        length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1015                                              s, (uint32_t)start-range->start,
1016                                              indexes, elementBases, elements,
1017                                              suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1018
1019        /* call the enumerator function with this first character */
1020        if(!fn(context, start, nameChoice, buffer, length)) {
1021            return FALSE;
1022        }
1023
1024        /* enumerate the rest of the names */
1025        while(++start<limit) {
1026            /* increment the indexes in lexical order bound by the factors */
1027            i=count;
1028            for (;;) {
1029                idx=(uint16_t)(indexes[--i]+1);
1030                if(idx<factors[i]) {
1031                    /* skip one index and its element string */
1032                    indexes[i]=idx;
1033                    s=elements[i];
1034                    while(*s++!=0) {
1035                    }
1036                    elements[i]=s;
1037                    break;
1038                } else {
1039                    /* reset this index to 0 and its element string to the first one */
1040                    indexes[i]=0;
1041                    elements[i]=elementBases[i];
1042                }
1043            }
1044
1045            /* to make matters a little easier, just append all elements to the suffix */
1046            t=suffix;
1047            length=prefixLength;
1048            for(i=0; i<count; ++i) {
1049                s=elements[i];
1050                while((c=*s++)!=0) {
1051                    *t++=c;
1052                    ++length;
1053                }
1054            }
1055            /* zero-terminate */
1056            *t=0;
1057
1058            if(!fn(context, start, nameChoice, buffer, length)) {
1059                return FALSE;
1060            }
1061        }
1062        break;
1063    }
1064    default:
1065        /* undefined type */
1066        break;
1067    }
1068
1069    return TRUE;
1070}
1071
1072/*
1073 * findAlgName() is almost the same as enumAlgNames() except that it
1074 * returns the code point for a name if it fits into the range.
1075 * It returns 0xffff otherwise.
1076 */
1077static UChar32
1078findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1079    UChar32 code;
1080
1081    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1082        return 0xffff;
1083    }
1084
1085    switch(range->type) {
1086    case 0: {
1087        /* name = prefix hex-digits */
1088        const char *s=(const char *)(range+1);
1089        char c;
1090
1091        uint16_t i, count;
1092
1093        /* compare prefix */
1094        while((c=*s++)!=0) {
1095            if((char)c!=*otherName++) {
1096                return 0xffff;
1097            }
1098        }
1099
1100        /* read hexadecimal code point value */
1101        count=range->variant;
1102        code=0;
1103        for(i=0; i<count; ++i) {
1104            c=*otherName++;
1105            if('0'<=c && c<='9') {
1106                code=(code<<4)|(c-'0');
1107            } else if('A'<=c && c<='F') {
1108                code=(code<<4)|(c-'A'+10);
1109            } else {
1110                return 0xffff;
1111            }
1112        }
1113
1114        /* does it fit into the range? */
1115        if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1116            return code;
1117        }
1118        break;
1119    }
1120    case 1: {
1121        char buffer[64];
1122        uint16_t indexes[8];
1123        const char *elementBases[8], *elements[8];
1124        const uint16_t *factors=(const uint16_t *)(range+1);
1125        uint16_t count=range->variant;
1126        const char *s=(const char *)(factors+count), *t;
1127        UChar32 start, limit;
1128        uint16_t i, idx;
1129
1130        char c;
1131
1132        /* name = prefix factorized-elements */
1133
1134        /* compare prefix */
1135        while((c=*s++)!=0) {
1136            if((char)c!=*otherName++) {
1137                return 0xffff;
1138            }
1139        }
1140
1141        start=(UChar32)range->start;
1142        limit=(UChar32)(range->end+1);
1143
1144        /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1145        writeFactorSuffix(factors, count, s, 0,
1146                          indexes, elementBases, elements, buffer, sizeof(buffer));
1147
1148        /* compare the first suffix */
1149        if(0==uprv_strcmp(otherName, buffer)) {
1150            return start;
1151        }
1152
1153        /* enumerate and compare the rest of the suffixes */
1154        while(++start<limit) {
1155            /* increment the indexes in lexical order bound by the factors */
1156            i=count;
1157            for (;;) {
1158                idx=(uint16_t)(indexes[--i]+1);
1159                if(idx<factors[i]) {
1160                    /* skip one index and its element string */
1161                    indexes[i]=idx;
1162                    s=elements[i];
1163                    while(*s++!=0) {}
1164                    elements[i]=s;
1165                    break;
1166                } else {
1167                    /* reset this index to 0 and its element string to the first one */
1168                    indexes[i]=0;
1169                    elements[i]=elementBases[i];
1170                }
1171            }
1172
1173            /* to make matters a little easier, just compare all elements of the suffix */
1174            t=otherName;
1175            for(i=0; i<count; ++i) {
1176                s=elements[i];
1177                while((c=*s++)!=0) {
1178                    if(c!=*t++) {
1179                        s=""; /* does not match */
1180                        i=99;
1181                    }
1182                }
1183            }
1184            if(i<99 && *t==0) {
1185                return start;
1186            }
1187        }
1188        break;
1189    }
1190    default:
1191        /* undefined type */
1192        break;
1193    }
1194
1195    return 0xffff;
1196}
1197
1198/* sets of name characters, maximum name lengths ---------------------------- */
1199
/*
 * Bit-set helpers for a set of 256 byte values stored as uint32_t[8]:
 * bits 7..5 of the byte select the word, bits 4..0 select the bit in it.
 * The argument c is reduced to uint8_t first, so only its low 8 bits count.
 * c is fully parenthesized so that expression arguments are cast as a whole;
 * note that c is evaluated twice, so it must not have side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202
1203static int32_t
1204calcStringSetLength(uint32_t set[8], const char *s) {
1205    int32_t length=0;
1206    char c;
1207
1208    while((c=*s++)!=0) {
1209        SET_ADD(set, c);
1210        ++length;
1211    }
1212    return length;
1213}
1214
1215static int32_t
1216calcAlgNameSetsLengths(int32_t maxNameLength) {
1217    AlgorithmicRange *range;
1218    uint32_t *p;
1219    uint32_t rangeCount;
1220    int32_t length;
1221
1222    /* enumerate algorithmic ranges */
1223    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1224    rangeCount=*p;
1225    range=(AlgorithmicRange *)(p+1);
1226    while(rangeCount>0) {
1227        switch(range->type) {
1228        case 0:
1229            /* name = prefix + (range->variant times) hex-digits */
1230            /* prefix */
1231            length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1232            if(length>maxNameLength) {
1233                maxNameLength=length;
1234            }
1235            break;
1236        case 1: {
1237            /* name = prefix factorized-elements */
1238            const uint16_t *factors=(const uint16_t *)(range+1);
1239            const char *s;
1240            int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1241
1242            /* prefix length */
1243            s=(const char *)(factors+count);
1244            length=calcStringSetLength(gNameSet, s);
1245            s+=length+1; /* start of factor suffixes */
1246
1247            /* get the set and maximum factor suffix length for each factor */
1248            for(i=0; i<count; ++i) {
1249                maxFactorLength=0;
1250                for(factor=factors[i]; factor>0; --factor) {
1251                    factorLength=calcStringSetLength(gNameSet, s);
1252                    s+=factorLength+1;
1253                    if(factorLength>maxFactorLength) {
1254                        maxFactorLength=factorLength;
1255                    }
1256                }
1257                length+=maxFactorLength;
1258            }
1259
1260            if(length>maxNameLength) {
1261                maxNameLength=length;
1262            }
1263            break;
1264        }
1265        default:
1266            /* unknown type */
1267            break;
1268        }
1269
1270        range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1271        --rangeCount;
1272    }
1273    return maxNameLength;
1274}
1275
1276static int32_t
1277calcExtNameSetsLengths(int32_t maxNameLength) {
1278    int32_t i, length;
1279
1280    for(i=0; i<LENGTHOF(charCatNames); ++i) {
1281        /*
1282         * for each category, count the length of the category name
1283         * plus 9=
1284         * 2 for <>
1285         * 1 for -
1286         * 6 for most hex digits per code point
1287         */
1288        length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1289        if(length>maxNameLength) {
1290            maxNameLength=length;
1291        }
1292    }
1293    return maxNameLength;
1294}
1295
1296static int32_t
1297calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1298                  uint32_t set[8],
1299                  const uint8_t **pLine, const uint8_t *lineLimit) {
1300    const uint8_t *line=*pLine;
1301    int32_t length=0, tokenLength;
1302    uint16_t c, token;
1303
1304    while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1305        if(c>=tokenCount) {
1306            /* implicit letter */
1307            SET_ADD(set, c);
1308            ++length;
1309        } else {
1310            token=tokens[c];
1311            if(token==(uint16_t)(-2)) {
1312                /* this is a lead byte for a double-byte token */
1313                c=c<<8|*line++;
1314                token=tokens[c];
1315            }
1316            if(token==(uint16_t)(-1)) {
1317                /* explicit letter */
1318                SET_ADD(set, c);
1319                ++length;
1320            } else {
1321                /* count token word */
1322                if(tokenLengths!=NULL) {
1323                    /* use cached token length */
1324                    tokenLength=tokenLengths[c];
1325                    if(tokenLength==0) {
1326                        tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1327                        tokenLengths[c]=(int8_t)tokenLength;
1328                    }
1329                } else {
1330                    tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1331                }
1332                length+=tokenLength;
1333            }
1334        }
1335    }
1336
1337    *pLine=line;
1338    return length;
1339}
1340
1341static void
1342calcGroupNameSetsLengths(int32_t maxNameLength) {
1343    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1344
1345    uint16_t *tokens=(uint16_t *)uCharNames+8;
1346    uint16_t tokenCount=*tokens++;
1347    uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1348
1349    int8_t *tokenLengths;
1350
1351    const uint16_t *group;
1352    const uint8_t *s, *line, *lineLimit;
1353
1354    int32_t groupCount, lineNumber, length;
1355
1356    tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1357    if(tokenLengths!=NULL) {
1358        uprv_memset(tokenLengths, 0, tokenCount);
1359    }
1360
1361    group=GET_GROUPS(uCharNames);
1362    groupCount=*group++;
1363
1364    /* enumerate all groups */
1365    while(groupCount>0) {
1366        s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1367        s=expandGroupLengths(s, offsets, lengths);
1368
1369        /* enumerate all lines in each group */
1370        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371            line=s+offsets[lineNumber];
1372            length=lengths[lineNumber];
1373            if(length==0) {
1374                continue;
1375            }
1376
1377            lineLimit=line+length;
1378
1379            /* read regular name */
1380            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381            if(length>maxNameLength) {
1382                maxNameLength=length;
1383            }
1384            if(line==lineLimit) {
1385                continue;
1386            }
1387
1388            /* read Unicode 1.0 name */
1389            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390            if(length>maxNameLength) {
1391                maxNameLength=length;
1392            }
1393            if(line==lineLimit) {
1394                continue;
1395            }
1396
1397            /* read ISO comment */
1398            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399        }
1400
1401        group=NEXT_GROUP(group);
1402        --groupCount;
1403    }
1404
1405    if(tokenLengths!=NULL) {
1406        uprv_free(tokenLengths);
1407    }
1408
1409    /* set gMax... - name length last for threading */
1410    gMaxNameLength=maxNameLength;
1411}
1412
1413static UBool
1414calcNameSetsLengths(UErrorCode *pErrorCode) {
1415    static const char extChars[]="0123456789ABCDEF<>-";
1416    int32_t i, maxNameLength;
1417
1418    if(gMaxNameLength!=0) {
1419        return TRUE;
1420    }
1421
1422    if(!isDataLoaded(pErrorCode)) {
1423        return FALSE;
1424    }
1425
1426    /* set hex digits, used in various names, and <>-, used in extended names */
1427    for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1428        SET_ADD(gNameSet, extChars[i]);
1429    }
1430
1431    /* set sets and lengths from algorithmic names */
1432    maxNameLength=calcAlgNameSetsLengths(0);
1433
1434    /* set sets and lengths from extended names */
1435    maxNameLength=calcExtNameSetsLengths(maxNameLength);
1436
1437    /* set sets and lengths from group names, set global maximum values */
1438    calcGroupNameSetsLengths(maxNameLength);
1439
1440    return TRUE;
1441}
1442
1443U_NAMESPACE_END
1444
1445/* public API --------------------------------------------------------------- */
1446
1447U_NAMESPACE_USE
1448
1449U_CAPI int32_t U_EXPORT2
1450u_charName(UChar32 code, UCharNameChoice nameChoice,
1451           char *buffer, int32_t bufferLength,
1452           UErrorCode *pErrorCode) {
1453     AlgorithmicRange *algRange;
1454    uint32_t *p;
1455    uint32_t i;
1456    int32_t length;
1457
1458    /* check the argument values */
1459    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1460        return 0;
1461    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1462              bufferLength<0 || (bufferLength>0 && buffer==NULL)
1463    ) {
1464        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1465        return 0;
1466    }
1467
1468    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1469        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1470    }
1471
1472    length=0;
1473
1474    /* try algorithmic names first */
1475    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1476    i=*p;
1477    algRange=(AlgorithmicRange *)(p+1);
1478    while(i>0) {
1479        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1480            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1481            break;
1482        }
1483        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1484        --i;
1485    }
1486
1487    if(i==0) {
1488        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1489            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1490            if (!length) {
1491                /* extended character name */
1492                length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1493            }
1494        } else {
1495            /* normal character name */
1496            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1497        }
1498    }
1499
1500    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1501}
1502
1503U_CAPI int32_t U_EXPORT2
1504u_getISOComment(UChar32 /*c*/,
1505                char *dest, int32_t destCapacity,
1506                UErrorCode *pErrorCode) {
1507    /* check the argument values */
1508    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1509        return 0;
1510    } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1511        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1512        return 0;
1513    }
1514
1515    return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1516}
1517
1518U_CAPI UChar32 U_EXPORT2
1519u_charFromName(UCharNameChoice nameChoice,
1520               const char *name,
1521               UErrorCode *pErrorCode) {
1522    char upper[120], lower[120];
1523    FindName findName;
1524    AlgorithmicRange *algRange;
1525    uint32_t *p;
1526    uint32_t i;
1527    UChar32 cp = 0;
1528    char c0;
1529    UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1530
1531    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1532        return error;
1533    }
1534
1535    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1536        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1537        return error;
1538    }
1539
1540    if(!isDataLoaded(pErrorCode)) {
1541        return error;
1542    }
1543
1544    /* construct the uppercase and lowercase of the name first */
1545    for(i=0; i<sizeof(upper); ++i) {
1546        if((c0=*name++)!=0) {
1547            upper[i]=uprv_toupper(c0);
1548            lower[i]=uprv_tolower(c0);
1549        } else {
1550            upper[i]=lower[i]=0;
1551            break;
1552        }
1553    }
1554    if(i==sizeof(upper)) {
1555        /* name too long, there is no such character */
1556        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1557        return error;
1558    }
1559
1560    /* try extended names first */
1561    if (lower[0] == '<') {
1562        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1563            if (lower[--i] == '>') {
1564                for (--i; lower[i] && lower[i] != '-'; --i) {
1565                }
1566
1567                if (lower[i] == '-') { /* We've got a category. */
1568                    uint32_t cIdx;
1569
1570                    lower[i] = 0;
1571
1572                    for (++i; lower[i] != '>'; ++i) {
1573                        if (lower[i] >= '0' && lower[i] <= '9') {
1574                            cp = (cp << 4) + lower[i] - '0';
1575                        } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1576                            cp = (cp << 4) + lower[i] - 'a' + 10;
1577                        } else {
1578                            *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1579                            return error;
1580                        }
1581                    }
1582
1583                    /* Now validate the category name.
1584                       We could use a binary search, or a trie, if
1585                       we really wanted to. */
1586
1587                    for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1588
1589                        if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1590                            if (getCharCat(cp) == cIdx) {
1591                                return cp;
1592                            }
1593                            break;
1594                        }
1595                    }
1596                }
1597            }
1598        }
1599
1600        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1601        return error;
1602    }
1603
1604    /* try algorithmic names now */
1605    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1606    i=*p;
1607    algRange=(AlgorithmicRange *)(p+1);
1608    while(i>0) {
1609        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1610            return cp;
1611        }
1612        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1613        --i;
1614    }
1615
1616    /* normal character name */
1617    findName.otherName=upper;
1618    findName.code=error;
1619    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1620    if (findName.code == error) {
1621         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1622    }
1623    return findName.code;
1624}
1625
1626U_CAPI void U_EXPORT2
1627u_enumCharNames(UChar32 start, UChar32 limit,
1628                UEnumCharNamesFn *fn,
1629                void *context,
1630                UCharNameChoice nameChoice,
1631                UErrorCode *pErrorCode) {
1632    AlgorithmicRange *algRange;
1633    uint32_t *p;
1634    uint32_t i;
1635
1636    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1637        return;
1638    }
1639
1640    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1641        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1642        return;
1643    }
1644
1645    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1646        limit = UCHAR_MAX_VALUE + 1;
1647    }
1648    if((uint32_t)start>=(uint32_t)limit) {
1649        return;
1650    }
1651
1652    if(!isDataLoaded(pErrorCode)) {
1653        return;
1654    }
1655
1656    /* interleave the data-driven ones with the algorithmic ones */
1657    /* iterate over all algorithmic ranges; assume that they are in ascending order */
1658    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1659    i=*p;
1660    algRange=(AlgorithmicRange *)(p+1);
1661    while(i>0) {
1662        /* enumerate the character names before the current algorithmic range */
1663        /* here: start<limit */
1664        if((uint32_t)start<algRange->start) {
1665            if((uint32_t)limit<=algRange->start) {
1666                enumNames(uCharNames, start, limit, fn, context, nameChoice);
1667                return;
1668            }
1669            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1670                return;
1671            }
1672            start=(UChar32)algRange->start;
1673        }
1674        /* enumerate the character names in the current algorithmic range */
1675        /* here: algRange->start<=start<limit */
1676        if((uint32_t)start<=algRange->end) {
1677            if((uint32_t)limit<=(algRange->end+1)) {
1678                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1679                return;
1680            }
1681            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1682                return;
1683            }
1684            start=(UChar32)algRange->end+1;
1685        }
1686        /* continue to the next algorithmic range (here: start<limit) */
1687        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1688        --i;
1689    }
1690    /* enumerate the character names after the last algorithmic range */
1691    enumNames(uCharNames, start, limit, fn, context, nameChoice);
1692}
1693
1694U_CAPI int32_t U_EXPORT2
1695uprv_getMaxCharNameLength() {
1696    UErrorCode errorCode=U_ZERO_ERROR;
1697    if(calcNameSetsLengths(&errorCode)) {
1698        return gMaxNameLength;
1699    } else {
1700        return 0;
1701    }
1702}
1703
1704/**
1705 * Converts the char set cset into a Unicode set uset.
1706 * @param cset Set of 256 bit flags corresponding to a set of chars.
1707 * @param uset USet to receive characters. Existing contents are deleted.
1708 */
1709static void
1710charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1711    UChar us[256];
1712    char cs[256];
1713
1714    int32_t i, length;
1715    UErrorCode errorCode;
1716
1717    errorCode=U_ZERO_ERROR;
1718
1719    if(!calcNameSetsLengths(&errorCode)) {
1720        return;
1721    }
1722
1723    /* build a char string with all chars that are used in character names */
1724    length=0;
1725    for(i=0; i<256; ++i) {
1726        if(SET_CONTAINS(cset, i)) {
1727            cs[length++]=(char)i;
1728        }
1729    }
1730
1731    /* convert the char string to a UChar string */
1732    u_charsToUChars(cs, us, length);
1733
1734    /* add each UChar to the USet */
1735    for(i=0; i<length; ++i) {
1736        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1737            sa->add(sa->set, us[i]);
1738        }
1739    }
1740}
1741
1742/**
1743 * Fills set with characters that are used in Unicode character names.
1744 * @param set USet to receive characters.
1745 */
1746U_CAPI void U_EXPORT2
1747uprv_getCharNameCharacters(const USetAdder *sa) {
1748    charSetToUSet(gNameSet, sa);
1749}
1750
1751/* data swapping ------------------------------------------------------------ */
1752
1753/*
1754 * The token table contains non-negative entries for token bytes,
1755 * and -1 for bytes that represent themselves in the data file's charset.
1756 * -2 entries are used for lead bytes.
1757 *
1758 * Direct bytes (-1 entries) must be translated from the input charset family
1759 * to the output charset family.
1760 * makeTokenMap() writes a permutation mapping for this.
1761 * Use it once for single-/lead-byte tokens and once more for all trail byte
1762 * tokens. (';' is an unused trail byte marked with -1.)
1763 */
1764static void
1765makeTokenMap(const UDataSwapper *ds,
1766             int16_t tokens[], uint16_t tokenCount,
1767             uint8_t map[256],
1768             UErrorCode *pErrorCode) {
1769    UBool usedOutChar[256];
1770    uint16_t i, j;
1771    uint8_t c1, c2;
1772
1773    if(U_FAILURE(*pErrorCode)) {
1774        return;
1775    }
1776
1777    if(ds->inCharset==ds->outCharset) {
1778        /* Same charset family: identity permutation */
1779        for(i=0; i<256; ++i) {
1780            map[i]=(uint8_t)i;
1781        }
1782    } else {
1783        uprv_memset(map, 0, 256);
1784        uprv_memset(usedOutChar, 0, 256);
1785
1786        if(tokenCount>256) {
1787            tokenCount=256;
1788        }
1789
1790        /* set the direct bytes (byte 0 always maps to itself) */
1791        for(i=1; i<tokenCount; ++i) {
1792            if(tokens[i]==-1) {
1793                /* convert the direct byte character */
1794                c1=(uint8_t)i;
1795                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1796                if(U_FAILURE(*pErrorCode)) {
1797                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1798                                     i, ds->inCharset);
1799                    return;
1800                }
1801
1802                /* enter the converted character into the map and mark it used */
1803                map[c1]=c2;
1804                usedOutChar[c2]=TRUE;
1805            }
1806        }
1807
1808        /* set the mappings for the rest of the permutation */
1809        for(i=j=1; i<tokenCount; ++i) {
1810            /* set mappings that were not set for direct bytes */
1811            if(map[i]==0) {
1812                /* set an output byte value that was not used as an output byte above */
1813                while(usedOutChar[j]) {
1814                    ++j;
1815                }
1816                map[i]=(uint8_t)j++;
1817            }
1818        }
1819
1820        /*
1821         * leave mappings at tokenCount and above unset if tokenCount<256
1822         * because they won't be used
1823         */
1824    }
1825}
1826
1827U_CAPI int32_t U_EXPORT2
1828uchar_swapNames(const UDataSwapper *ds,
1829                const void *inData, int32_t length, void *outData,
1830                UErrorCode *pErrorCode) {
1831    const UDataInfo *pInfo;
1832    int32_t headerSize;
1833
1834    const uint8_t *inBytes;
1835    uint8_t *outBytes;
1836
1837    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1838             offset, i, count, stringsCount;
1839
1840    const AlgorithmicRange *inRange;
1841    AlgorithmicRange *outRange;
1842
1843    /* udata_swapDataHeader checks the arguments */
1844    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1845    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1846        return 0;
1847    }
1848
1849    /* check data format and format version */
1850    pInfo=(const UDataInfo *)((const char *)inData+4);
1851    if(!(
1852        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
1853        pInfo->dataFormat[1]==0x6e &&
1854        pInfo->dataFormat[2]==0x61 &&
1855        pInfo->dataFormat[3]==0x6d &&
1856        pInfo->formatVersion[0]==1
1857    )) {
1858        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1859                         pInfo->dataFormat[0], pInfo->dataFormat[1],
1860                         pInfo->dataFormat[2], pInfo->dataFormat[3],
1861                         pInfo->formatVersion[0]);
1862        *pErrorCode=U_UNSUPPORTED_ERROR;
1863        return 0;
1864    }
1865
1866    inBytes=(const uint8_t *)inData+headerSize;
1867    outBytes=(uint8_t *)outData+headerSize;
1868    if(length<0) {
1869        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1870    } else {
1871        length-=headerSize;
1872        if( length<20 ||
1873            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1874        ) {
1875            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1876                             length);
1877            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1878            return 0;
1879        }
1880    }
1881
1882    if(length<0) {
1883        /* preflighting: iterate through algorithmic ranges */
1884        offset=algNamesOffset;
1885        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1886        offset+=4;
1887
1888        for(i=0; i<count; ++i) {
1889            inRange=(const AlgorithmicRange *)(inBytes+offset);
1890            offset+=ds->readUInt16(inRange->size);
1891        }
1892    } else {
1893        /* swap data */
1894        const uint16_t *p;
1895        uint16_t *q, *temp;
1896
1897        int16_t tokens[512];
1898        uint16_t tokenCount;
1899
1900        uint8_t map[256], trailMap[256];
1901
1902        /* copy the data for inaccessible bytes */
1903        if(inBytes!=outBytes) {
1904            uprv_memcpy(outBytes, inBytes, length);
1905        }
1906
1907        /* the initial 4 offsets first */
1908        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1909        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1910        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1911        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1912
1913        /*
1914         * now the tokens table
1915         * it needs to be permutated along with the compressed name strings
1916         */
1917        p=(const uint16_t *)(inBytes+16);
1918        q=(uint16_t *)(outBytes+16);
1919
1920        /* read and swap the tokenCount */
1921        tokenCount=ds->readUInt16(*p);
1922        ds->swapArray16(ds, p, 2, q, pErrorCode);
1923        ++p;
1924        ++q;
1925
1926        /* read the first 512 tokens and make the token maps */
1927        if(tokenCount<=512) {
1928            count=tokenCount;
1929        } else {
1930            count=512;
1931        }
1932        for(i=0; i<count; ++i) {
1933            tokens[i]=udata_readInt16(ds, p[i]);
1934        }
1935        for(; i<512; ++i) {
1936            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1937        }
1938        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1939        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1940        if(U_FAILURE(*pErrorCode)) {
1941            return 0;
1942        }
1943
1944        /*
1945         * swap and permutate the tokens
1946         * go through a temporary array to support in-place swapping
1947         */
1948        temp=(uint16_t *)uprv_malloc(tokenCount*2);
1949        if(temp==NULL) {
1950            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1951                             tokenCount);
1952            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1953            return 0;
1954        }
1955
1956        /* swap and permutate single-/lead-byte tokens */
1957        for(i=0; i<tokenCount && i<256; ++i) {
1958            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1959        }
1960
1961        /* swap and permutate trail-byte tokens */
1962        for(; i<tokenCount; ++i) {
1963            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1964        }
1965
1966        /* copy the result into the output and free the temporary array */
1967        uprv_memcpy(q, temp, tokenCount*2);
1968        uprv_free(temp);
1969
1970        /*
1971         * swap the token strings but not a possible padding byte after
1972         * the terminating NUL of the last string
1973         */
1974        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1975                                    outBytes+tokenStringOffset, pErrorCode);
1976        if(U_FAILURE(*pErrorCode)) {
1977            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1978            return 0;
1979        }
1980
1981        /* swap the group table */
1982        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1983        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1984                           outBytes+groupsOffset, pErrorCode);
1985
1986        /*
1987         * swap the group strings
1988         * swap the string bytes but not the nibble-encoded string lengths
1989         */
1990        if(ds->inCharset!=ds->outCharset) {
1991            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1992
1993            const uint8_t *inStrings, *nextInStrings;
1994            uint8_t *outStrings;
1995
1996            uint8_t c;
1997
1998            inStrings=inBytes+groupStringOffset;
1999            outStrings=outBytes+groupStringOffset;
2000
2001            stringsCount=algNamesOffset-groupStringOffset;
2002
2003            /* iterate through string groups until only a few padding bytes are left */
2004            while(stringsCount>32) {
2005                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2006
2007                /* move past the length bytes */
2008                stringsCount-=(uint32_t)(nextInStrings-inStrings);
2009                outStrings+=nextInStrings-inStrings;
2010                inStrings=nextInStrings;
2011
2012                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2013                stringsCount-=count;
2014
2015                /* swap the string bytes using map[] and trailMap[] */
2016                while(count>0) {
2017                    c=*inStrings++;
2018                    *outStrings++=map[c];
2019                    if(tokens[c]!=-2) {
2020                        --count;
2021                    } else {
2022                        /* token lead byte: swap the trail byte, too */
2023                        *outStrings++=trailMap[*inStrings++];
2024                        count-=2;
2025                    }
2026                }
2027            }
2028        }
2029
2030        /* swap the algorithmic ranges */
2031        offset=algNamesOffset;
2032        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2033        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2034        offset+=4;
2035
2036        for(i=0; i<count; ++i) {
2037            if(offset>(uint32_t)length) {
2038                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2039                                 length, i);
2040                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2041                return 0;
2042            }
2043
2044            inRange=(const AlgorithmicRange *)(inBytes+offset);
2045            outRange=(AlgorithmicRange *)(outBytes+offset);
2046            offset+=ds->readUInt16(inRange->size);
2047
2048            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2049            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2050            switch(inRange->type) {
2051            case 0:
2052                /* swap prefix string */
2053                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2054                                    outRange+1, pErrorCode);
2055                if(U_FAILURE(*pErrorCode)) {
2056                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2057                                     i);
2058                    return 0;
2059                }
2060                break;
2061            case 1:
2062                {
2063                    /* swap factors and the prefix and factor strings */
2064                    uint32_t factorsCount;
2065
2066                    factorsCount=inRange->variant;
2067                    p=(const uint16_t *)(inRange+1);
2068                    q=(uint16_t *)(outRange+1);
2069                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2070
2071                    /* swap the strings, up to the last terminating NUL */
2072                    p+=factorsCount;
2073                    q+=factorsCount;
2074                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2075                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2076                        --stringsCount;
2077                    }
2078                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2079                }
2080                break;
2081            default:
2082                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2083                                 inRange->type, i);
2084                *pErrorCode=U_UNSUPPORTED_ERROR;
2085                return 0;
2086            }
2087        }
2088    }
2089
2090    return headerSize+(int32_t)offset;
2091}
2092
2093/*
2094 * Hey, Emacs, please set the following:
2095 *
2096 * Local Variables:
2097 * indent-tabs-mode: nil
2098 * End:
2099 *
2100 */
2101