1/*
2*******************************************************************************
3*   Copyright (C) 2004-2010, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucol_sit.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11* Modification history
12* Date        Name      Comments
13* 03/12/2004  weiv      Creation
14*/
15
16#include "unicode/ustring.h"
17#include "unicode/udata.h"
18
19#include "utracimp.h"
20#include "ucol_imp.h"
21#include "ucol_tok.h"
22#include "cmemory.h"
23#include "cstring.h"
24#include "uresimp.h"
25
26#if !UCONFIG_NO_COLLATION
27
/*
 * Kinds of items that may appear in a collator short definition string.
 * UCOL_SIT_ITEMS_COUNT is the number of item kinds; this file uses it to
 * size CollatorSpec::entries and the short string options table.
 */
enum OptionsList {
    UCOL_SIT_LANGUAGE = 0,
    UCOL_SIT_SCRIPT,
    UCOL_SIT_REGION,
    UCOL_SIT_VARIANT,
    UCOL_SIT_KEYWORD,
    UCOL_SIT_BCP47,
    UCOL_SIT_STRENGTH,
    UCOL_SIT_CASE_LEVEL,
    UCOL_SIT_CASE_FIRST,
    UCOL_SIT_NUMERIC_COLLATION,
    UCOL_SIT_ALTERNATE_HANDLING,
    UCOL_SIT_NORMALIZATION_MODE,
    UCOL_SIT_FRENCH_COLLATION,
    UCOL_SIT_HIRAGANA_QUATERNARY,
    UCOL_SIT_VARIABLE_TOP,
    UCOL_SIT_VARIABLE_TOP_VALUE,
    UCOL_SIT_ITEMS_COUNT  /* number of item kinds, not an item itself */
};
47
/* Option starter characters: the single letter that introduces each item
 * of a short definition string. They are listed here in alphabetical order.
 */
static const char alternateHArg     = 'A';
static const char variableTopValArg = 'B';
static const char caseFirstArg      = 'C';
static const char numericCollArg    = 'D';
static const char caseLevelArg      = 'E';
static const char frenchCollArg     = 'F';
static const char hiraganaQArg      = 'H';
static const char keywordArg        = 'K';
static const char languageArg       = 'L';
static const char normArg           = 'N';
static const char regionArg         = 'R';
static const char strengthArg       = 'S';
static const char variableTopArg    = 'T';
static const char variantArg        = 'V';
static const char RFC3066Arg        = 'X';
static const char scriptArg         = 'Z';

/* Text placed between the locale ID and the keyword value when the whole
 * locale is assembled from its elements.
 */
static const char collationKeyword[]  = "@collation=";

/* number of locale element buffers in CollatorSpec::locElements */
static const int32_t locElementCount = 5;
/* size of one locale element buffer and of CollatorSpec::variableTopString */
static const int32_t locElementCapacity = 32;
/* size of CollatorSpec::locale */
static const int32_t loc3066Capacity = 256;
/* size of the scratch buffers that functions in this file keep on the stack */
static const int32_t internalBufferSize = 512;
72
/* structure containing specification of a collator. Initialized
 * from a short string. Also used to construct a short string from a
 * collator instance
 */
struct CollatorSpec {
    /* Locale elements; the options table in this file maps
     * 0 = language, 1 = script, 2 = region, 3 = variant, 4 = collation keyword.
     */
    char locElements[locElementCount][locElementCapacity];
    /* Whole locale ID: either copied from the 'X' item or assembled from
     * locElements by ucol_sit_calculateWholeLocale().
     */
    char locale[loc3066Capacity];
    /* Attribute values indexed by UColAttribute; UCOL_DEFAULT marks an
     * attribute that the short string did not specify.
     */
    UColAttributeValue options[UCOL_ATTRIBUTE_COUNT];
    /* variable top given as a single hex value ('B' item) */
    uint32_t variableTopValue;
    /* variable top given as a string of hex code units ('T' item) */
    UChar variableTopString[locElementCapacity];
    /* number of code units stored in variableTopString */
    int32_t variableTopStringLen;
    /* TRUE once a variable top item has been parsed without an error */
    UBool variableTopSet;
    /* For every row of the options table: where the matching item starts in
     * the parsed definition string (NULL if absent) and how long it is,
     * option letter included. Used to write the normalized string.
     */
    struct {
        const char *start;
        int32_t len;
    } entries[UCOL_SIT_ITEMS_COUNT];
};
90
91
/* structure for converting between character attribute
 * representation and real collation attribute value.
 */
struct AttributeConversion {
    char letter;               /* letter used in the short string */
    UColAttributeValue value;  /* attribute value the letter stands for */
};

/* Letter <-> value table. It is searched linearly in both directions by
 * ucol_sit_attributeValueToLetter() and ucol_sit_letterToAttributeValue().
 */
static const AttributeConversion conversions[12] = {
    { '1', UCOL_PRIMARY },
    { '2', UCOL_SECONDARY },
    { '3', UCOL_TERTIARY },
    { '4', UCOL_QUATERNARY },
    { 'D', UCOL_DEFAULT },
    { 'I', UCOL_IDENTICAL },
    { 'L', UCOL_LOWER_FIRST },
    { 'N', UCOL_NON_IGNORABLE },
    { 'O', UCOL_ON },
    { 'S', UCOL_SHIFTED },
    { 'U', UCOL_UPPER_FIRST },
    { 'X', UCOL_OFF }
};
114
115
116static char
117ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) {
118    uint32_t i = 0;
119    for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
120        if(conversions[i].value == value) {
121            return conversions[i].letter;
122        }
123    }
124    *status = U_ILLEGAL_ARGUMENT_ERROR;
125    return 0;
126}
127
128static UColAttributeValue
129ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) {
130    uint32_t i = 0;
131    for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
132        if(conversions[i].letter == letter) {
133            return conversions[i].value;
134        }
135    }
136    *status = U_ILLEGAL_ARGUMENT_ERROR;
137    return UCOL_DEFAULT;
138}
139
/* Function prototype for the functions used to parse a short string.
 * spec   - specification that receives the parsed data
 * value1 - the 'attr' field of the options table row that selected the function
 * string - text immediately after the option letter
 * status - error code, set when the text cannot be parsed
 * Returns a pointer into string; the parsing loop continues from there.
 */
U_CDECL_BEGIN
typedef const char* U_CALLCONV
ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string,
               UErrorCode *status);
U_CDECL_END
146
147U_CDECL_BEGIN
148static const char* U_CALLCONV
149_processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string,
150                      UErrorCode *status)
151{
152    int32_t len = 0;
153    do {
154        if(value == 0 || value == 4) {
155            spec->locElements[value][len++] = uprv_tolower(*string);
156        } else {
157            spec->locElements[value][len++] = *string;
158        }
159    } while(*(++string) != '_' && *string && len < locElementCapacity);
160    if(len >= locElementCapacity) {
161        *status = U_BUFFER_OVERFLOW_ERROR;
162        return string;
163    }
164    // don't skip the underscore at the end
165    return string;
166}
167U_CDECL_END
168
169U_CDECL_BEGIN
170static const char* U_CALLCONV
171_processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string,
172                      UErrorCode *status)
173{
174    char terminator = *string;
175    string++;
176    const char *end = uprv_strchr(string+1, terminator);
177    if(end == NULL || end - string >= loc3066Capacity) {
178        *status = U_BUFFER_OVERFLOW_ERROR;
179        return string;
180    } else {
181        uprv_strncpy(spec->locale, string, end-string);
182        return end+1;
183    }
184}
185
186U_CDECL_END
187
188U_CDECL_BEGIN
189static const char* U_CALLCONV
190_processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string,
191                       UErrorCode *status)
192{
193    spec->options[option] = ucol_sit_letterToAttributeValue(*string, status);
194    if((*(++string) != '_' && *string) || U_FAILURE(*status)) {
195        *status = U_ILLEGAL_ARGUMENT_ERROR;
196    }
197    return string;
198}
199U_CDECL_END
200
201
202static UChar
203readHexCodeUnit(const char **string, UErrorCode *status)
204{
205    UChar result = 0;
206    int32_t value = 0;
207    char c;
208    int32_t noDigits = 0;
209    while((c = **string) != 0 && noDigits < 4) {
210        if( c >= '0' && c <= '9') {
211            value = c - '0';
212        } else if ( c >= 'a' && c <= 'f') {
213            value = c - 'a' + 10;
214        } else if ( c >= 'A' && c <= 'F') {
215            value = c - 'A' + 10;
216        } else {
217            *status = U_ILLEGAL_ARGUMENT_ERROR;
218            return 0;
219        }
220        result = (result << 4) | (UChar)value;
221        noDigits++;
222        (*string)++;
223    }
224    // if the string was terminated before we read 4 digits, set an error
225    if(noDigits < 4) {
226        *status = U_ILLEGAL_ARGUMENT_ERROR;
227    }
228    return result;
229}
230
231U_CDECL_BEGIN
232static const char* U_CALLCONV
233_processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status)
234{
235    // get four digits
236    int32_t i = 0;
237    if(!value1) {
238        while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') {
239            spec->variableTopString[i++] = readHexCodeUnit(&string, status);
240        }
241        spec->variableTopStringLen = i;
242        if(i == locElementCapacity && (*string != 0 || *string != '_')) {
243            *status = U_BUFFER_OVERFLOW_ERROR;
244        }
245    } else {
246        spec->variableTopValue = readHexCodeUnit(&string, status);
247    }
248    if(U_SUCCESS(*status)) {
249        spec->variableTopSet = TRUE;
250    }
251    return string;
252}
253U_CDECL_END
254
255
/* Table for parsing short strings */
struct ShortStringOptions {
    char optionStart;        /* letter that introduces the item */
    ActionFunction *action;  /* function that parses the text after the letter */
    uint32_t attr;           /* passed to action as its second argument */
};

/* One row per option letter, in alphabetical order of the letters.
 * The meaning of attr depends on the action:
 *   _processCollatorOption - the UColAttribute to set
 *   _processLocaleElement  - the index into CollatorSpec::locElements
 *   _processVariableTop    - 0 for a code unit string, 1 for a single value
 *   _processRFC3066Locale  - not used
 * ucol_sit_readOption() uses the row index as the index into
 * CollatorSpec::entries as well.
 */
static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] =
{
/* 10 ALTERNATE_HANDLING */   {alternateHArg,     _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate  N, S, D
/* 15 VARIABLE_TOP_VALUE */   {variableTopValArg, _processVariableTop,    1 },
/* 08 CASE_FIRST */           {caseFirstArg,      _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D
/* 09 NUMERIC_COLLATION */    {numericCollArg,    _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan      O, X, D
/* 07 CASE_LEVEL */           {caseLevelArg,      _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D
/* 12 FRENCH_COLLATION */     {frenchCollArg,     _processCollatorOption, UCOL_FRENCH_COLLATION }, // french     O, X, D
/* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg,      _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana   O, X, D
/* 04 KEYWORD */              {keywordArg,        _processLocaleElement,  4 }, // keyword
/* 00 LANGUAGE */             {languageArg,       _processLocaleElement,  0 }, // language
/* 11 NORMALIZATION_MODE */   {normArg,           _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm       O, X, D
/* 02 REGION */               {regionArg,         _processLocaleElement,  2 }, // region
/* 06 STRENGTH */             {strengthArg,       _processCollatorOption, UCOL_STRENGTH }, // strength   1, 2, 3, 4, I, D
/* 14 VARIABLE_TOP */         {variableTopArg,    _processVariableTop,    0 },
/* 03 VARIANT */              {variantArg,        _processLocaleElement,  3 }, // variant
/* 05 RFC3066BIS */           {RFC3066Arg,        _processRFC3066Locale,  0 }, // rfc3066bis locale name
/* 01 SCRIPT */               {scriptArg,         _processLocaleElement,  1 }  // script
};
282
283
284static
285const char* ucol_sit_readOption(const char *start, CollatorSpec *spec,
286                            UErrorCode *status)
287{
288  int32_t i = 0;
289
290  for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
291      if(*start == options[i].optionStart) {
292          spec->entries[i].start = start;
293          const char* end = options[i].action(spec, options[i].attr, start+1, status);
294          spec->entries[i].len = (int32_t)(end - start);
295          return end;
296      }
297  }
298  *status = U_ILLEGAL_ARGUMENT_ERROR;
299  return start;
300}
301
302static
303void ucol_sit_initCollatorSpecs(CollatorSpec *spec)
304{
305    // reset everything
306    uprv_memset(spec, 0, sizeof(CollatorSpec));
307    // set collation options to default
308    int32_t i = 0;
309    for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
310        spec->options[i] = UCOL_DEFAULT;
311    }
312}
313
314static const char*
315ucol_sit_readSpecs(CollatorSpec *s, const char *string,
316                        UParseError *parseError, UErrorCode *status)
317{
318    const char *definition = string;
319    while(U_SUCCESS(*status) && *string) {
320        string = ucol_sit_readOption(string, s, status);
321        // advance over '_'
322        while(*string && *string == '_') {
323            string++;
324        }
325    }
326    if(U_FAILURE(*status)) {
327        parseError->offset = (int32_t)(string - definition);
328    }
329    return string;
330}
331
332static
333int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status)
334{
335    int32_t i = 0, j = 0;
336    int32_t len = 0;
337    char optName;
338    if(U_SUCCESS(*status)) {
339        for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
340            if(s->entries[i].start) {
341                if(len) {
342                    if(len < capacity) {
343                        uprv_strcat(destination, "_");
344                    }
345                    len++;
346                }
347                optName = *(s->entries[i].start);
348                if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) {
349                    for(j = 0; j < s->entries[i].len; j++) {
350                        if(len + j < capacity) {
351                            destination[len+j] = uprv_toupper(*(s->entries[i].start+j));
352                        }
353                    }
354                    len += s->entries[i].len;
355                } else {
356                    len += s->entries[i].len;
357                    if(len < capacity) {
358                        uprv_strncat(destination,s->entries[i].start, s->entries[i].len);
359                    }
360                }
361            }
362        }
363        return len;
364    } else {
365        return 0;
366    }
367}
368
369static void
370ucol_sit_calculateWholeLocale(CollatorSpec *s) {
371    // put the locale together, unless we have a done
372    // locale
373    if(s->locale[0] == 0) {
374        // first the language
375        uprv_strcat(s->locale, s->locElements[0]);
376        // then the script, if present
377        if(*(s->locElements[1])) {
378            uprv_strcat(s->locale, "_");
379            uprv_strcat(s->locale, s->locElements[1]);
380        }
381        // then the region, if present
382        if(*(s->locElements[2])) {
383            uprv_strcat(s->locale, "_");
384            uprv_strcat(s->locale, s->locElements[2]);
385        } else if(*(s->locElements[3])) { // if there is a variant, we need an underscore
386            uprv_strcat(s->locale, "_");
387        }
388        // add variant, if there
389        if(*(s->locElements[3])) {
390            uprv_strcat(s->locale, "_");
391            uprv_strcat(s->locale, s->locElements[3]);
392        }
393
394        // if there is a collation keyword, add that too
395        if(*(s->locElements[4])) {
396            uprv_strcat(s->locale, collationKeyword);
397            uprv_strcat(s->locale, s->locElements[4]);
398        }
399    }
400}
401
402
403U_CAPI void U_EXPORT2
404ucol_prepareShortStringOpen( const char *definition,
405                          UBool,
406                          UParseError *parseError,
407                          UErrorCode *status)
408{
409    if(U_FAILURE(*status)) return;
410
411    UParseError internalParseError;
412
413    if(!parseError) {
414        parseError = &internalParseError;
415    }
416    parseError->line = 0;
417    parseError->offset = 0;
418    parseError->preContext[0] = 0;
419    parseError->postContext[0] = 0;
420
421
422    // first we want to pick stuff out of short string.
423    // we'll end up with an UCA version, locale and a bunch of
424    // settings
425
426    // analyse the string in order to get everything we need.
427    CollatorSpec s;
428    ucol_sit_initCollatorSpecs(&s);
429    ucol_sit_readSpecs(&s, definition, parseError, status);
430    ucol_sit_calculateWholeLocale(&s);
431
432    char buffer[internalBufferSize];
433    uprv_memset(buffer, 0, internalBufferSize);
434    uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
435
436    UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status);
437    /* we try to find stuff from keyword */
438    UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status);
439    UResourceBundle *collElem = NULL;
440    char keyBuffer[256];
441    // if there is a keyword, we pick it up and try to get elements
442    if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) {
443      // no keyword. we try to find the default setting, which will give us the keyword value
444      UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status);
445      if(U_SUCCESS(*status)) {
446        int32_t defaultKeyLen = 0;
447        const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status);
448        u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen);
449        keyBuffer[defaultKeyLen] = 0;
450      } else {
451        *status = U_INTERNAL_PROGRAM_ERROR;
452        return;
453      }
454      ures_close(defaultColl);
455    }
456    collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status);
457    ures_close(collElem);
458    ures_close(collations);
459    ures_close(b);
460}
461
462
463U_CAPI UCollator* U_EXPORT2
464ucol_openFromShortString( const char *definition,
465                          UBool forceDefaults,
466                          UParseError *parseError,
467                          UErrorCode *status)
468{
469    UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING);
470    UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition);
471
472    if(U_FAILURE(*status)) return 0;
473
474    UParseError internalParseError;
475
476    if(!parseError) {
477        parseError = &internalParseError;
478    }
479    parseError->line = 0;
480    parseError->offset = 0;
481    parseError->preContext[0] = 0;
482    parseError->postContext[0] = 0;
483
484
485    // first we want to pick stuff out of short string.
486    // we'll end up with an UCA version, locale and a bunch of
487    // settings
488
489    // analyse the string in order to get everything we need.
490    const char *string = definition;
491    CollatorSpec s;
492    ucol_sit_initCollatorSpecs(&s);
493    string = ucol_sit_readSpecs(&s, definition, parseError, status);
494    ucol_sit_calculateWholeLocale(&s);
495
496    char buffer[internalBufferSize];
497    uprv_memset(buffer, 0, internalBufferSize);
498    uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
499
500    UCollator *result = ucol_open(buffer, status);
501    int32_t i = 0;
502
503    for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
504        if(s.options[i] != UCOL_DEFAULT) {
505            if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) {
506                ucol_setAttribute(result, (UColAttribute)i, s.options[i], status);
507            }
508
509            if(U_FAILURE(*status)) {
510                parseError->offset = (int32_t)(string - definition);
511                ucol_close(result);
512                return NULL;
513            }
514
515        }
516    }
517    if(s.variableTopSet) {
518        if(s.variableTopString[0]) {
519            ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status);
520        } else { // we set by value, using 'B'
521            ucol_restoreVariableTop(result, s.variableTopValue, status);
522        }
523    }
524
525
526    if(U_FAILURE(*status)) { // here it can only be a bogus value
527        ucol_close(result);
528        result = NULL;
529    }
530
531    UTRACE_EXIT_PTR_STATUS(result, *status);
532    return result;
533}
534
535
536static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg)
537{
538    if(len) {
539        if(*resultSize) {
540            if(*resultSize < capacity) {
541                uprv_strcat(result, "_");
542            }
543            (*resultSize)++;
544        }
545        *resultSize += len + 1;
546        if(*resultSize < capacity) {
547            uprv_strncat(result, &arg, 1);
548            uprv_strncat(result, src, len);
549        }
550    }
551}
552
553U_CAPI int32_t U_EXPORT2
554ucol_getShortDefinitionString(const UCollator *coll,
555                              const char *locale,
556                              char *dst,
557                              int32_t capacity,
558                              UErrorCode *status)
559{
560    if(U_FAILURE(*status)) return 0;
561    char buffer[internalBufferSize];
562    uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
563    int32_t resultSize = 0;
564    char tempbuff[internalBufferSize];
565    char locBuff[internalBufferSize];
566    uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
567    int32_t elementSize = 0;
568    UBool isAvailable = 0;
569    CollatorSpec s;
570    ucol_sit_initCollatorSpecs(&s);
571
572    if(!locale) {
573        locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, status);
574    }
575    elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status);
576
577    if(elementSize) {
578        // we should probably canonicalize here...
579        elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status);
580        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg);
581        elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status);
582        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg);
583        elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status);
584        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg);
585        elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status);
586        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg);
587        elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status);
588        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg);
589    }
590
591    int32_t i = 0;
592    UColAttributeValue attribute = UCOL_DEFAULT;
593    for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
594        if(options[i].action == _processCollatorOption) {
595            attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status);
596            if(attribute != UCOL_DEFAULT) {
597                char letter = ucol_sit_attributeValueToLetter(attribute, status);
598                appendShortStringElement(&letter, 1,
599                    buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart);
600            }
601        }
602    }
603    if(coll->variableTopValueisDefault == FALSE) {
604        //s.variableTopValue = ucol_getVariableTop(coll, status);
605        elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16);
606        appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg);
607    }
608
609    UParseError parseError;
610    return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status);
611}
612
613U_CAPI int32_t U_EXPORT2
614ucol_normalizeShortDefinitionString(const char *definition,
615                                    char *destination,
616                                    int32_t capacity,
617                                    UParseError *parseError,
618                                    UErrorCode *status)
619{
620
621    if(U_FAILURE(*status)) {
622        return 0;
623    }
624
625    if(destination) {
626        uprv_memset(destination, 0, capacity*sizeof(char));
627    }
628
629    UParseError pe;
630    if(!parseError) {
631        parseError = &pe;
632    }
633
634    // validate
635    CollatorSpec s;
636    ucol_sit_initCollatorSpecs(&s);
637    ucol_sit_readSpecs(&s, definition, parseError, status);
638    return ucol_sit_dumpSpecs(&s, destination, capacity, status);
639}
640
641U_CAPI UColAttributeValue  U_EXPORT2
642ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status)
643{
644    if(U_FAILURE(*status) || coll == NULL) {
645      return UCOL_DEFAULT;
646    }
647    switch(attr) {
648    case UCOL_NUMERIC_COLLATION:
649        return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation;
650    case UCOL_HIRAGANA_QUATERNARY_MODE:
651        return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ;
652    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
653        return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation;
654    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
655        return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling;
656    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
657        return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst;
658    case UCOL_CASE_LEVEL: /* do we have an extra case level */
659        return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel;
660    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
661        return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode;
662    case UCOL_STRENGTH:         /* attribute for strength */
663        return coll->strengthisDefault?UCOL_DEFAULT:coll->strength;
664    case UCOL_ATTRIBUTE_COUNT:
665    default:
666        *status = U_ILLEGAL_ARGUMENT_ERROR;
667        break;
668    }
669    return UCOL_DEFAULT;
670}
671
672
/* Context handed to the trie enumeration callback (_processSpecials) and to
 * addSpecial() while contractions and expansions are being collected.
 */
struct contContext {
    const UCollator *coll;                /* collator whose tables are being read */
    USet            *conts;               /* receives contraction strings; may be NULL */
    USet            *expansions;          /* receives expansions; may be NULL */
    USet            *removedContractions; /* code points to skip; may be NULL */
    UBool           addPrefixes;          /* also process SPEC_PROC_TAG (prefix) entries */
    UErrorCode      *status;              /* error code shared with the callback */
};
681
682
683
/* Walks the contraction table entry that CE refers to in context->coll and
 * adds the strings that are found to context->conts and, for entries whose
 * CE carries EXPANSION_TAG, to context->expansions. Nested contraction and
 * prefix entries are followed recursively.
 *
 * context    - collator and the sets to fill
 * buffer     - work buffer; the string built so far is
 *              buffer[leftIndex .. rightIndex-1]
 * bufLen     - number of code units in buffer
 * CE         - collation element that refers to the table entry
 * leftIndex  - index of the first code unit of the current string; prefix
 *              (SPEC_PROC_TAG) processing extends the string to the left
 * rightIndex - index after the last code unit of the current string;
 *              contraction processing extends the string to the right
 * status     - set to U_INTERNAL_PROGRAM_ERROR if the string cannot be
 *              extended any further in the needed direction
 */
static void
addSpecial(contContext *context, UChar *buffer, int32_t bufLen,
               uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status)
{
  const UCollator *coll = context->coll;
  USet *contractions = context->conts;
  USet *expansions = context->expansions;
  UBool addPrefixes = context->addPrefixes;

    // locate the table entry and read the CE stored in its first slot
    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
    uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
    // we might have a contraction that ends from previous level
    if(newCE != UCOL_NOT_FOUND) {
      if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) {
        addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
      }
      // a single code unit is not a contraction, hence the length test
      if(contractions && rightIndex-leftIndex > 1) {
            uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
            // NOTE(review): this tests CE, while the two loops below test
            // newCE for EXPANSION_TAG - confirm which one is intended here.
            if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) {
              uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
            }
      }
    }

    // step past the first slot; the following code units are scanned
    // until a 0xFFFF code unit is met
    UCharOffset++;
    // check whether we're doing contraction or prefix
    if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) {
      // prefix: the string grows to the left, so there must be room there
      if(leftIndex == 0) {
          *status = U_INTERNAL_PROGRAM_ERROR;
          return;
      }
      --leftIndex;
      while(*UCharOffset != 0xFFFF) {
          newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
          buffer[leftIndex] = *UCharOffset;
          if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
              // the extended string refers to a further table entry
              addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
          } else {
            if(contractions) {
                uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
            }
            if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
              uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
            }
          }
          UCharOffset++;
      }
    } else if(getCETag(CE) == CONTRACTION_TAG) {
      // contraction: the string grows to the right, so there must be room there
      if(rightIndex == bufLen-1) {
          *status = U_INTERNAL_PROGRAM_ERROR;
          return;
      }
      while(*UCharOffset != 0xFFFF) {
          newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
          buffer[rightIndex] = *UCharOffset;
          if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
              // the extended string refers to a further table entry
              addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status);
          } else {
            if(contractions) {
              uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex);
            }
            if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
              uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex);
            }
          }
          UCharOffset++;
      }
    }

}
754
755U_CDECL_BEGIN
756static UBool U_CALLCONV
757_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
758{
759    UErrorCode *status = ((contContext *)context)->status;
760    USet *expansions = ((contContext *)context)->expansions;
761    USet *removed = ((contContext *)context)->removedContractions;
762    UBool addPrefixes = ((contContext *)context)->addPrefixes;
763    UChar contraction[internalBufferSize];
764    if(isSpecial(CE)) {
765      if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
766        while(start < limit && U_SUCCESS(*status)) {
767            // if there are suppressed contractions, we don't
768            // want to add them.
769            if(removed && uset_contains(removed, start)) {
770                start++;
771                continue;
772            }
773            // we start our contraction from middle, since we don't know if it
774            // will grow toward right or left
775            contraction[internalBufferSize/2] = (UChar)start;
776            addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
777            start++;
778        }
779      } else if(expansions && getCETag(CE) == EXPANSION_TAG) {
780        while(start < limit && U_SUCCESS(*status)) {
781          uset_add(expansions, start++);
782        }
783      }
784    }
785    if(U_FAILURE(*status)) {
786        return FALSE;
787    } else {
788        return TRUE;
789    }
790}
791
792U_CDECL_END
793
794
795
796/**
797 * Get a set containing the contractions defined by the collator. The set includes
798 * both the UCA contractions and the contractions defined by the collator
799 * @param coll collator
800 * @param conts the set to hold the result
801 * @param status to hold the error code
802 * @return the size of the contraction set
803 */
804U_CAPI int32_t U_EXPORT2
805ucol_getContractions( const UCollator *coll,
806                  USet *contractions,
807                  UErrorCode *status)
808{
809  ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
810  return uset_getItemCount(contractions);
811}
812
813/**
814 * Get a set containing the expansions defined by the collator. The set includes
815 * both the UCA expansions and the expansions defined by the tailoring
816 * @param coll collator
817 * @param conts the set to hold the result
818 * @param addPrefixes add the prefix contextual elements to contractions
819 * @param status to hold the error code
820 *
821 * @draft ICU 3.4
822 */
823U_CAPI void U_EXPORT2
824ucol_getContractionsAndExpansions( const UCollator *coll,
825                  USet *contractions,
826                  USet *expansions,
827                  UBool addPrefixes,
828                  UErrorCode *status)
829{
830    if(U_FAILURE(*status)) {
831        return;
832    }
833    if(coll == NULL) {
834        *status = U_ILLEGAL_ARGUMENT_ERROR;
835        return;
836    }
837
838    if(contractions) {
839      uset_clear(contractions);
840    }
841    if(expansions) {
842      uset_clear(expansions);
843    }
844    int32_t rulesLen = 0;
845    const UChar* rules = ucol_getRules(coll, &rulesLen);
846    UColTokenParser src;
847    ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA,
848                           ucol_tok_getRulesFromBundle, NULL, status);
849
850    contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status };
851
852    // Add the UCA contractions
853    c.coll = coll->UCA;
854    utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c);
855
856    // This is collator specific. Add contractions from a collator
857    c.coll = coll;
858    c.removedContractions =  NULL;
859    utrie_enum(&coll->mapping, NULL, _processSpecials, &c);
860    ucol_tok_closeTokenList(&src);
861}
862
863U_CAPI int32_t U_EXPORT2
864ucol_getUnsafeSet( const UCollator *coll,
865                  USet *unsafe,
866                  UErrorCode *status)
867{
868    UChar buffer[internalBufferSize];
869    int32_t len = 0;
870
871    uset_clear(unsafe);
872
873    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
874    static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
875                                    0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
876
877    // add chars that fail the fcd check
878    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
879
880    // add Thai/Lao prevowels
881    uset_addRange(unsafe, 0xe40, 0xe44);
882    uset_addRange(unsafe, 0xec0, 0xec4);
883    // add lead/trail surrogates
884    uset_addRange(unsafe, 0xd800, 0xdfff);
885
886    USet *contractions = uset_open(0,0);
887
888    int32_t i = 0, j = 0;
889    int32_t contsSize = ucol_getContractions(coll, contractions, status);
890    UChar32 c = 0;
891    // Contraction set consists only of strings
892    // to get unsafe code points, we need to
893    // break the strings apart and add them to the unsafe set
894    for(i = 0; i < contsSize; i++) {
895        len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
896        if(len > 0) {
897            j = 0;
898            while(j < len) {
899                U16_NEXT(buffer, j, len, c);
900                if(j < len) {
901                    uset_add(unsafe, c);
902                }
903            }
904        }
905    }
906
907    uset_close(contractions);
908
909    return uset_size(unsafe);
910}
911#endif
912