1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2008, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucol_tok.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created 02/22/2001
14*   created by: Vladimir Weinstein
15*
16* This module reads a tailoring rule string and produces a list of
17* tokens that will be turned into collation elements
18*
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "unicode/ustring.h"
26#include "unicode/uchar.h"
27#include "unicode/uniset.h"
28
29#include "ucol_tok.h"
30#include "ucol_bld.h"
31#include "cmemory.h"
32#include "../common/util.h"
33
34U_CDECL_BEGIN
35static int32_t U_CALLCONV
36uhash_hashTokens(const UHashTok k)
37{
38    int32_t hash = 0;
39    //uint32_t key = (uint32_t)k.integer;
40    UColToken *key = (UColToken *)k.pointer;
41    if (key != 0) {
42        //int32_t len = (key & 0xFF000000)>>24;
43        int32_t len = (key->source & 0xFF000000)>>24;
44        int32_t inc = ((len - 32) / 32) + 1;
45
46        //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
47        const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
48        const UChar *limit = p + len;
49
50        while (p<limit) {
51            hash = (hash * 37) + *p;
52            p += inc;
53        }
54    }
55    return hash;
56}
57
58static UBool U_CALLCONV
59uhash_compareTokens(const UHashTok key1, const UHashTok key2)
60{
61    //uint32_t p1 = (uint32_t) key1.integer;
62    //uint32_t p2 = (uint32_t) key2.integer;
63    UColToken *p1 = (UColToken *)key1.pointer;
64    UColToken *p2 = (UColToken *)key2.pointer;
65    const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
66    const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
67    uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
68    uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
69    const UChar *end = s1+s1L-1;
70
71    if (p1 == p2) {
72        return TRUE;
73    }
74    if (p1->source == 0 || p2->source == 0) {
75        return FALSE;
76    }
77    if(s1L != s2L) {
78        return FALSE;
79    }
80    if(p1->source == p2->source) {
81        return TRUE;
82    }
83    while((s1 < end) && *s1 == *s2) {
84        ++s1;
85        ++s2;
86    }
87    if(*s1 == *s2) {
88        return TRUE;
89    } else {
90        return FALSE;
91    }
92}
93U_CDECL_END
94
95/*static inline void U_CALLCONV
96uhash_freeBlockWrapper(void *obj) {
97    uhash_freeBlock(obj);
98}*/
99
100
101typedef struct {
102    uint32_t startCE;
103    uint32_t startContCE;
104    uint32_t limitCE;
105    uint32_t limitContCE;
106} indirectBoundaries;
107
108/* these values are used for finding CE values for indirect positioning. */
109/* Indirect positioning is a mechanism for allowing resets on symbolic   */
110/* values. It only works for resets and you cannot tailor indirect names */
111/* An indirect name can define either an anchor point or a range. An     */
112/* anchor point behaves in exactly the same way as a code point in reset */
113/* would, except that it cannot be tailored. A range (we currently only  */
114/* know for the [top] range will explicitly set the upper bound for      */
115/* generated CEs, thus allowing for better control over how many CEs can */
116/* be squeezed between in the range without performance penalty.         */
117/* In that respect, we use [top] for tailoring of locales that use CJK   */
118/* characters. Other indirect values are currently a pure convenience,   */
119/* they can be used to assure that the CEs will be always positioned in  */
120/* the same place relative to a point with known properties (e.g. first  */
121/* primary ignorable). */
122static indirectBoundaries ucolIndirectBoundaries[15];
123/*
124static indirectBoundaries ucolIndirectBoundaries[11] = {
125{ UCOL_RESET_TOP_VALUE,               0,
126UCOL_NEXT_TOP_VALUE,                0 },
127{ UCOL_FIRST_PRIMARY_IGNORABLE,       0,
1280,                                  0 },
129{ UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
1300,                                  0 },
131{ UCOL_FIRST_SECONDARY_IGNORABLE,     0,
1320,                                  0 },
133{ UCOL_LAST_SECONDARY_IGNORABLE,      0,
1340,                                  0 },
135{ UCOL_FIRST_TERTIARY_IGNORABLE,      0,
1360,                                  0 },
137{ UCOL_LAST_TERTIARY_IGNORABLE,       0,
1380,                                  0 },
139{ UCOL_FIRST_VARIABLE,                0,
1400,                                  0 },
141{ UCOL_LAST_VARIABLE,                 0,
1420,                                  0 },
143{ UCOL_FIRST_NON_VARIABLE,            0,
1440,                                  0 },
145{ UCOL_LAST_NON_VARIABLE,             0,
1460,                                  0 },
147};
148*/
149
150static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
151
152    // Set values for the top - TODO: once we have values for all the indirects, we are going
153    // to initalize here.
154    ucolIndirectBoundaries[indexR].startCE = start[0];
155    ucolIndirectBoundaries[indexR].startContCE = start[1];
156    if(end) {
157        ucolIndirectBoundaries[indexR].limitCE = end[0];
158        ucolIndirectBoundaries[indexR].limitContCE = end[1];
159    } else {
160        ucolIndirectBoundaries[indexR].limitCE = 0;
161        ucolIndirectBoundaries[indexR].limitContCE = 0;
162    }
163}
164
165
166static inline
167void syntaxError(const UChar* rules,
168                 int32_t pos,
169                 int32_t rulesLen,
170                 UParseError* parseError)
171{
172    parseError->offset = pos;
173    parseError->line = 0 ; /* we are not using line numbers */
174
175    // for pre-context
176    int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
177    int32_t stop  = pos;
178
179    u_memcpy(parseError->preContext,rules+start,stop-start);
180    //null terminate the buffer
181    parseError->preContext[stop-start] = 0;
182
183    //for post-context
184    start = pos+1;
185    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
186    rulesLen;
187
188    if(start < stop) {
189        u_memcpy(parseError->postContext,rules+start,stop-start);
190        //null terminate the buffer
191        parseError->postContext[stop-start]= 0;
192    } else {
193        parseError->postContext[0] = 0;
194    }
195}
196
197static
198void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
199    switch(attrib) {
200    case UCOL_HIRAGANA_QUATERNARY_MODE:
201        opts->hiraganaQ = value;
202        break;
203    case UCOL_FRENCH_COLLATION:
204        opts->frenchCollation = value;
205        break;
206    case UCOL_ALTERNATE_HANDLING:
207        opts->alternateHandling = value;
208        break;
209    case UCOL_CASE_FIRST:
210        opts->caseFirst = value;
211        break;
212    case UCOL_CASE_LEVEL:
213        opts->caseLevel = value;
214        break;
215    case UCOL_NORMALIZATION_MODE:
216        opts->normalizationMode = value;
217        break;
218    case UCOL_STRENGTH:
219        opts->strength = value;
220        break;
221    case UCOL_NUMERIC_COLLATION:
222        opts->numericCollation = value;
223        break;
224    case UCOL_ATTRIBUTE_COUNT:
225    default:
226        break;
227    }
228}
229
230#define UTOK_OPTION_COUNT 20
231
232static UBool didInit = FALSE;
233/* we can be strict, or we can be lenient */
234/* I'd surely be lenient with the option arguments */
235/* maybe even with options */
236U_STRING_DECL(suboption_00, "non-ignorable", 13);
237U_STRING_DECL(suboption_01, "shifted",        7);
238
239U_STRING_DECL(suboption_02, "lower",          5);
240U_STRING_DECL(suboption_03, "upper",          5);
241U_STRING_DECL(suboption_04, "off",            3);
242U_STRING_DECL(suboption_05, "on",             2);
243U_STRING_DECL(suboption_06, "1",              1);
244U_STRING_DECL(suboption_07, "2",              1);
245U_STRING_DECL(suboption_08, "3",              1);
246U_STRING_DECL(suboption_09, "4",              1);
247U_STRING_DECL(suboption_10, "I",              1);
248
249U_STRING_DECL(suboption_11, "primary",        7);
250U_STRING_DECL(suboption_12, "secondary",      9);
251U_STRING_DECL(suboption_13, "tertiary",       8);
252U_STRING_DECL(suboption_14, "variable",       8);
253U_STRING_DECL(suboption_15, "regular",        7);
254U_STRING_DECL(suboption_16, "implicit",       8);
255U_STRING_DECL(suboption_17, "trailing",       8);
256
257
258U_STRING_DECL(option_00,    "undefined",      9);
259U_STRING_DECL(option_01,    "rearrange",      9);
260U_STRING_DECL(option_02,    "alternate",      9);
261U_STRING_DECL(option_03,    "backwards",      9);
262U_STRING_DECL(option_04,    "variable top",  12);
263U_STRING_DECL(option_05,    "top",            3);
264U_STRING_DECL(option_06,    "normalization", 13);
265U_STRING_DECL(option_07,    "caseLevel",      9);
266U_STRING_DECL(option_08,    "caseFirst",      9);
267U_STRING_DECL(option_09,    "scriptOrder",   11);
268U_STRING_DECL(option_10,    "charsetname",   11);
269U_STRING_DECL(option_11,    "charset",        7);
270U_STRING_DECL(option_12,    "before",         6);
271U_STRING_DECL(option_13,    "hiraganaQ",      9);
272U_STRING_DECL(option_14,    "strength",       8);
273U_STRING_DECL(option_15,    "first",          5);
274U_STRING_DECL(option_16,    "last",           4);
275U_STRING_DECL(option_17,    "optimize",       8);
276U_STRING_DECL(option_18,    "suppressContractions",         20);
277U_STRING_DECL(option_19,    "numericOrdering",              15);
278
279
280/*
281[last variable] last variable value
282[last primary ignorable] largest CE for primary ignorable
283[last secondary ignorable] largest CE for secondary ignorable
284[last tertiary ignorable] largest CE for tertiary ignorable
285[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
286*/
287
288
289static const ucolTokSuboption alternateSub[2] = {
290    {suboption_00, 13, UCOL_NON_IGNORABLE},
291    {suboption_01,  7, UCOL_SHIFTED}
292};
293
294static const ucolTokSuboption caseFirstSub[3] = {
295    {suboption_02, 5, UCOL_LOWER_FIRST},
296    {suboption_03,  5, UCOL_UPPER_FIRST},
297    {suboption_04,  3, UCOL_OFF},
298};
299
300static const ucolTokSuboption onOffSub[2] = {
301    {suboption_04, 3, UCOL_OFF},
302    {suboption_05, 2, UCOL_ON}
303};
304
305static const ucolTokSuboption frenchSub[1] = {
306    {suboption_07, 1, UCOL_ON}
307};
308
309static const ucolTokSuboption beforeSub[3] = {
310    {suboption_06, 1, UCOL_PRIMARY},
311    {suboption_07, 1, UCOL_SECONDARY},
312    {suboption_08, 1, UCOL_TERTIARY}
313};
314
315static const ucolTokSuboption strengthSub[5] = {
316    {suboption_06, 1, UCOL_PRIMARY},
317    {suboption_07, 1, UCOL_SECONDARY},
318    {suboption_08, 1, UCOL_TERTIARY},
319    {suboption_09, 1, UCOL_QUATERNARY},
320    {suboption_10, 1, UCOL_IDENTICAL},
321};
322
323static const ucolTokSuboption firstLastSub[7] = {
324    {suboption_11, 7, UCOL_PRIMARY},
325    {suboption_12, 9, UCOL_PRIMARY},
326    {suboption_13, 8, UCOL_PRIMARY},
327    {suboption_14, 8, UCOL_PRIMARY},
328    {suboption_15, 7, UCOL_PRIMARY},
329    {suboption_16, 8, UCOL_PRIMARY},
330    {suboption_17, 8, UCOL_PRIMARY},
331};
332
333enum OptionNumber {
334    OPTION_ALTERNATE_HANDLING = 0,
335    OPTION_FRENCH_COLLATION,
336    OPTION_CASE_LEVEL,
337    OPTION_CASE_FIRST,
338    OPTION_NORMALIZATION_MODE,
339    OPTION_HIRAGANA_QUATERNARY,
340    OPTION_STRENGTH,
341    OPTION_NUMERIC_COLLATION,
342    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
343    OPTION_VARIABLE_TOP,
344    OPTION_REARRANGE,
345    OPTION_BEFORE,
346    OPTION_TOP,
347    OPTION_FIRST,
348    OPTION_LAST,
349    OPTION_OPTIMIZE,
350    OPTION_SUPPRESS_CONTRACTIONS,
351    OPTION_UNDEFINED,
352    OPTION_SCRIPT_ORDER,
353    OPTION_CHARSET_NAME,
354    OPTION_CHARSET
355} ;
356
357static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
358    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
359    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
360    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
361    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
362    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
363    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
364    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
365    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
366    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
367    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
368    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
369    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
370    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
371    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
372    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
373    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
374    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
375    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
376    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
377    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
378};
379
380static
381int32_t u_strncmpNoCase(const UChar     *s1,
382                        const UChar     *s2,
383                        int32_t     n)
384{
385    if(n > 0) {
386        int32_t rc;
387        for(;;) {
388            rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
389            if(rc != 0 || *s1 == 0 || --n == 0) {
390                return rc;
391            }
392            ++s1;
393            ++s2;
394        }
395    }
396    return 0;
397}
398
399static
400void ucol_uprv_tok_initData() {
401    if(!didInit) {
402        U_STRING_INIT(suboption_00, "non-ignorable", 13);
403        U_STRING_INIT(suboption_01, "shifted",        7);
404
405        U_STRING_INIT(suboption_02, "lower",          5);
406        U_STRING_INIT(suboption_03, "upper",          5);
407        U_STRING_INIT(suboption_04, "off",            3);
408        U_STRING_INIT(suboption_05, "on",             2);
409
410        U_STRING_INIT(suboption_06, "1",              1);
411        U_STRING_INIT(suboption_07, "2",              1);
412        U_STRING_INIT(suboption_08, "3",              1);
413        U_STRING_INIT(suboption_09, "4",              1);
414        U_STRING_INIT(suboption_10, "I",              1);
415
416        U_STRING_INIT(suboption_11, "primary",        7);
417        U_STRING_INIT(suboption_12, "secondary",      9);
418        U_STRING_INIT(suboption_13, "tertiary",       8);
419        U_STRING_INIT(suboption_14, "variable",       8);
420        U_STRING_INIT(suboption_15, "regular",        7);
421        U_STRING_INIT(suboption_16, "implicit",       8);
422        U_STRING_INIT(suboption_17, "trailing",       8);
423
424
425        U_STRING_INIT(option_00, "undefined",      9);
426        U_STRING_INIT(option_01, "rearrange",      9);
427        U_STRING_INIT(option_02, "alternate",      9);
428        U_STRING_INIT(option_03, "backwards",      9);
429        U_STRING_INIT(option_04, "variable top",  12);
430        U_STRING_INIT(option_05, "top",            3);
431        U_STRING_INIT(option_06, "normalization", 13);
432        U_STRING_INIT(option_07, "caseLevel",      9);
433        U_STRING_INIT(option_08, "caseFirst",      9);
434        U_STRING_INIT(option_09, "scriptOrder",   11);
435        U_STRING_INIT(option_10, "charsetname",   11);
436        U_STRING_INIT(option_11, "charset",        7);
437        U_STRING_INIT(option_12, "before",         6);
438        U_STRING_INIT(option_13, "hiraganaQ",      9);
439        U_STRING_INIT(option_14, "strength",       8);
440        U_STRING_INIT(option_15, "first",          5);
441        U_STRING_INIT(option_16, "last",           4);
442        U_STRING_INIT(option_17, "optimize",       8);
443        U_STRING_INIT(option_18, "suppressContractions",         20);
444        U_STRING_INIT(option_19, "numericOrdering",      15);
445        didInit = TRUE;
446    }
447}
448
449
450// This function reads basic options to set in the runtime collator
451// used by data driven tests. Should not support build time options
452U_CAPI const UChar * U_EXPORT2
453ucol_tok_getNextArgument(const UChar *start, const UChar *end,
454                         UColAttribute *attrib, UColAttributeValue *value,
455                         UErrorCode *status)
456{
457    uint32_t i = 0;
458    int32_t j=0;
459    UBool foundOption = FALSE;
460    const UChar *optionArg = NULL;
461
462    ucol_uprv_tok_initData();
463
464    while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
465        start++;
466    }
467    if(start >= end) {
468        return NULL;
469    }
470    /* skip opening '[' */
471    if(*start == 0x005b) {
472        start++;
473    } else {
474        *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
475        return NULL;
476    }
477
478    while(i < UTOK_OPTION_COUNT) {
479        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
480            foundOption = TRUE;
481            if(end - start > rulesOptions[i].optionLen) {
482                optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
483                while(u_isWhitespace(*optionArg)) { /* eat whitespace */
484                    optionArg++;
485                }
486            }
487            break;
488        }
489        i++;
490    }
491
492    if(!foundOption) {
493        *status = U_ILLEGAL_ARGUMENT_ERROR;
494        return NULL;
495    }
496
497    if(optionArg) {
498        for(j = 0; j<rulesOptions[i].subSize; j++) {
499            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
500                //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
501                *attrib = rulesOptions[i].attr;
502                *value = rulesOptions[i].subopts[j].attrVal;
503                optionArg += rulesOptions[i].subopts[j].subLen;
504                while(u_isWhitespace(*optionArg)) { /* eat whitespace */
505                    optionArg++;
506                }
507                if(*optionArg == 0x005d) {
508                    optionArg++;
509                    return optionArg;
510                } else {
511                    *status = U_ILLEGAL_ARGUMENT_ERROR;
512                    return NULL;
513                }
514            }
515        }
516    }
517    *status = U_ILLEGAL_ARGUMENT_ERROR;
518    return NULL;
519}
520
521static
522USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
523    while(*start != 0x005b) { /* advance while we find the first '[' */
524        start++;
525    }
526    // now we need to get a balanced set of '[]'. The problem is that a set can have
527    // many, and *end point to the first closing '['
528    int32_t noOpenBraces = 1;
529    int32_t current = 1; // skip the opening brace
530    while(start+current < end && noOpenBraces != 0) {
531        if(start[current] == 0x005b) {
532            noOpenBraces++;
533        } else if(start[current] == 0x005D) { // closing brace
534            noOpenBraces--;
535        }
536        current++;
537    }
538
539    if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
540        *status = U_ILLEGAL_ARGUMENT_ERROR;
541        return NULL;
542    }
543    return uset_openPattern(start, current, status);
544}
545
546static
547int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
548    int32_t i = 0;
549    ucol_uprv_tok_initData();
550
551    while(u_isWhitespace(*start)) { /* eat whitespace */
552        start++;
553    }
554    while(i < UTOK_OPTION_COUNT) {
555        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
556            if(end - start > rulesOptions[i].optionLen) {
557                *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
558                while(u_isWhitespace(**optionArg)) { /* eat whitespace */
559                    (*optionArg)++;
560                }
561            }
562            break;
563        }
564        i++;
565    }
566    if(i == UTOK_OPTION_COUNT) {
567        i = -1; // didn't find an option
568    }
569    return i;
570}
571
572
573// reads and conforms to various options in rules
574// end is the position of the first closing ']'
575// However, some of the options take an UnicodeSet definition
576// which needs to duplicate the closing ']'
577// for example: '[copy [\uAC00-\uD7FF]]'
578// These options will move end to the second ']' and the
579// caller will set the current to it.
580static
581uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
582    const UChar* start = src->current;
583    int32_t i = 0;
584    int32_t j=0;
585    const UChar *optionArg = NULL;
586
587    uint8_t result = 0;
588
589    start++; /*skip opening '['*/
590    i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
591    if(optionArg) {
592        src->current = optionArg;
593    }
594
595    if(i < 0) {
596        *status = U_ILLEGAL_ARGUMENT_ERROR;
597    } else {
598        int32_t noOpenBraces = 1;
599        switch(i) {
600    case OPTION_ALTERNATE_HANDLING:
601    case OPTION_FRENCH_COLLATION:
602    case OPTION_CASE_LEVEL:
603    case OPTION_CASE_FIRST:
604    case OPTION_NORMALIZATION_MODE:
605    case OPTION_HIRAGANA_QUATERNARY:
606    case OPTION_STRENGTH:
607    case OPTION_NUMERIC_COLLATION:
608        if(optionArg) {
609            for(j = 0; j<rulesOptions[i].subSize; j++) {
610                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
611                    ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
612                    result =  UCOL_TOK_SUCCESS;
613                }
614            }
615        }
616        if(result == 0) {
617            *status = U_ILLEGAL_ARGUMENT_ERROR;
618        }
619        break;
620    case OPTION_VARIABLE_TOP:
621        result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
622        break;
623    case OPTION_REARRANGE:
624        result = UCOL_TOK_SUCCESS;
625        break;
626    case OPTION_BEFORE:
627        if(optionArg) {
628            for(j = 0; j<rulesOptions[i].subSize; j++) {
629                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
630                    result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
631                }
632            }
633        }
634        if(result == 0) {
635            *status = U_ILLEGAL_ARGUMENT_ERROR;
636        }
637        break;
638    case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
639        /* index to this array will be src->parsedToken.indirectIndex*/
640        src->parsedToken.indirectIndex = 0;
641        result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
642        break;
643    case OPTION_FIRST:
644    case OPTION_LAST: /* first, last */
645        for(j = 0; j<rulesOptions[i].subSize; j++) {
646            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
647                // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
648                // element of indirect boundaries is reserved for top.
649                src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
650                result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
651            }
652        }
653        if(result == 0) {
654            *status = U_ILLEGAL_ARGUMENT_ERROR;
655        }
656        break;
657    case OPTION_OPTIMIZE:
658    case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
659        // we need to move end here
660        src->current++; // skip opening brace
661        while(src->current < src->end && noOpenBraces != 0) {
662            if(*src->current == 0x005b) {
663                noOpenBraces++;
664            } else if(*src->current == 0x005D) { // closing brace
665                noOpenBraces--;
666            }
667            src->current++;
668        }
669        result = UCOL_TOK_SUCCESS;
670        break;
671    default:
672        *status = U_UNSUPPORTED_ERROR;
673        break;
674        }
675    }
676    src->current = u_memchr(src->current, 0x005d, src->end-src->current);
677    return result;
678}
679
680
681inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
682    if(src->extraCurrent+len >= src->extraEnd) {
683        /* reallocate */
684        UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
685        if(newSrc != NULL) {
686            src->current = newSrc + (src->current - src->source);
687            src->extraCurrent = newSrc + (src->extraCurrent - src->source);
688            src->end = newSrc + (src->end - src->source);
689            src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
690            src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
691            src->source = newSrc;
692        } else {
693            *status = U_MEMORY_ALLOCATION_ERROR;
694        }
695    }
696    if(len == 1) {
697        *src->extraCurrent++ = *stuff;
698    } else {
699        uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
700        src->extraCurrent += len;
701    }
702
703
704}
705
706inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
707    /*
708    top = TRUE;
709    */
710    UChar buff[5];
711    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
712    buff[0] = 0xFFFE;
713    buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
714    buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
715    if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
716        src->parsedToken.charsLen = 3;
717        ucol_tok_addToExtraCurrent(src, buff, 3, status);
718    } else {
719        buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
720        buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
721        src->parsedToken.charsLen = 5;
722        ucol_tok_addToExtraCurrent(src, buff, 5, status);
723    }
724    return TRUE;
725}
726
727static UBool isCharNewLine(UChar c){
728    switch(c){
729    case 0x000A: /* LF  */
730    case 0x000D: /* CR  */
731    case 0x000C: /* FF  */
732    case 0x0085: /* NEL */
733    case 0x2028: /* LS  */
734    case 0x2029: /* PS  */
735        return TRUE;
736    default:
737        return FALSE;
738    }
739}
740
741U_CAPI const UChar* U_EXPORT2
742ucol_tok_parseNextToken(UColTokenParser *src,
743                        UBool startOfRules,
744                        UParseError *parseError,
745                        UErrorCode *status)
746{
747    /* parsing part */
748    UBool variableTop = FALSE;
749    UBool top = FALSE;
750    UBool inChars = TRUE;
751    UBool inQuote = FALSE;
752    UBool wasInQuote = FALSE;
753    uint8_t before = 0;
754    UBool isEscaped = FALSE;
755    // TODO: replace these variables with src->parsedToken counterparts
756    // no need to use them anymore since we have src->parsedToken.
757    // Ideally, token parser would be a nice class... Once, when I have
758    // more time (around 2020 probably).
759    uint32_t newExtensionLen = 0;
760    uint32_t extensionOffset = 0;
761    uint32_t newStrength = UCOL_TOK_UNSET;
762    UChar buff[10];
763
764    src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
765    src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
766    src->parsedToken.indirectIndex = 0;
767
768    while (src->current < src->end) {
769        UChar ch = *(src->current);
770
771        if (inQuote) {
772            if (ch == 0x0027/*'\''*/) {
773                inQuote = FALSE;
774            } else {
775                if ((src->parsedToken.charsLen == 0) || inChars) {
776                    if(src->parsedToken.charsLen == 0) {
777                        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
778                    }
779                    src->parsedToken.charsLen++;
780                } else {
781                    if(newExtensionLen == 0) {
782                        extensionOffset = (uint32_t)(src->extraCurrent - src->source);
783                    }
784                    newExtensionLen++;
785                }
786            }
787        }else if(isEscaped){
788            isEscaped =FALSE;
789            if (newStrength == UCOL_TOK_UNSET) {
790                *status = U_INVALID_FORMAT_ERROR;
791                syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
792                return NULL;
793                // enabling rules to start with non-tokens a < b
794                // newStrength = UCOL_TOK_RESET;
795            }
796            if(ch != 0x0000  && src->current != src->end) {
797                if (inChars) {
798                    if(src->parsedToken.charsLen == 0) {
799                        src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
800                    }
801                    src->parsedToken.charsLen++;
802                } else {
803                    if(newExtensionLen == 0) {
804                        extensionOffset = (uint32_t)(src->current - src->source);
805                    }
806                    newExtensionLen++;
807                }
808            }
809        }else {
810            if(!uprv_isRuleWhiteSpace(ch)) {
811                /* Sets the strength for this entry */
812                switch (ch) {
813                case 0x003D/*'='*/ :
814                    if (newStrength != UCOL_TOK_UNSET) {
815                        goto EndOfLoop;
816                    }
817
818                    /* if we start with strength, we'll reset to top */
819                    if(startOfRules == TRUE) {
820                        src->parsedToken.indirectIndex = 5;
821                        top = ucol_tok_doSetTop(src, status);
822                        newStrength = UCOL_TOK_RESET;
823                        goto EndOfLoop;
824                    }
825                    newStrength = UCOL_IDENTICAL;
826                    break;
827
828                case 0x002C/*','*/:
829                    if (newStrength != UCOL_TOK_UNSET) {
830                        goto EndOfLoop;
831                    }
832
833                    /* if we start with strength, we'll reset to top */
834                    if(startOfRules == TRUE) {
835                        src->parsedToken.indirectIndex = 5;
836                        top = ucol_tok_doSetTop(src, status);
837                        newStrength = UCOL_TOK_RESET;
838                        goto EndOfLoop;
839                    }
840                    newStrength = UCOL_TERTIARY;
841                    break;
842
843                case  0x003B/*';'*/:
844                    if (newStrength != UCOL_TOK_UNSET) {
845                        goto EndOfLoop;
846                    }
847
848                    /* if we start with strength, we'll reset to top */
849                    if(startOfRules == TRUE) {
850                        src->parsedToken.indirectIndex = 5;
851                        top = ucol_tok_doSetTop(src, status);
852                        newStrength = UCOL_TOK_RESET;
853                        goto EndOfLoop;
854                    }
855                    newStrength = UCOL_SECONDARY;
856                    break;
857
858                case 0x003C/*'<'*/:
859                    if (newStrength != UCOL_TOK_UNSET) {
860                        goto EndOfLoop;
861                    }
862
863                    /* if we start with strength, we'll reset to top */
864                    if(startOfRules == TRUE) {
865                        src->parsedToken.indirectIndex = 5;
866                        top = ucol_tok_doSetTop(src, status);
867                        newStrength = UCOL_TOK_RESET;
868                        goto EndOfLoop;
869                    }
870                    /* before this, do a scan to verify whether this is */
871                    /* another strength */
872                    if(*(src->current+1) == 0x003C) {
873                        src->current++;
874                        if(*(src->current+1) == 0x003C) {
875                            src->current++; /* three in a row! */
876                            newStrength = UCOL_TERTIARY;
877                        } else { /* two in a row */
878                            newStrength = UCOL_SECONDARY;
879                        }
880                    } else { /* just one */
881                        newStrength = UCOL_PRIMARY;
882                    }
883                    break;
884
885                case 0x0026/*'&'*/:
886                    if (newStrength != UCOL_TOK_UNSET) {
887                        /**/
888                        goto EndOfLoop;
889                    }
890
891                    newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
892                    break;
893
894                case 0x005b/*'['*/:
895                    /* options - read an option, analyze it */
896                    if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
897                        uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
898                        if(U_SUCCESS(*status)) {
899                            if(result & UCOL_TOK_TOP) {
900                                if(newStrength == UCOL_TOK_RESET) {
901                                    top = ucol_tok_doSetTop(src, status);
902                                    if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
903                                        src->parsedToken.charsLen+=2;
904                                        buff[0] = 0x002d;
905                                        buff[1] = before;
906                                        ucol_tok_addToExtraCurrent(src, buff, 2, status);
907                                    }
908
909                                    src->current++;
910                                    goto EndOfLoop;
911                                } else {
912                                    *status = U_INVALID_FORMAT_ERROR;
913                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
914                                }
915                            } else if(result & UCOL_TOK_VARIABLE_TOP) {
916                                if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
917                                    variableTop = TRUE;
918                                    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
919                                    src->parsedToken.charsLen = 1;
920                                    buff[0] = 0xFFFF;
921                                    ucol_tok_addToExtraCurrent(src, buff, 1, status);
922                                    src->current++;
923                                    goto EndOfLoop;
924                                } else {
925                                    *status = U_INVALID_FORMAT_ERROR;
926                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
927                                }
928                            } else if (result & UCOL_TOK_BEFORE){
929                                if(newStrength == UCOL_TOK_RESET) {
930                                    before = result & UCOL_TOK_BEFORE;
931                                } else {
932                                    *status = U_INVALID_FORMAT_ERROR;
933                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
934
935                                }
936                            }
937                        } else {
938                            *status = U_INVALID_FORMAT_ERROR;
939                            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
940                            return NULL;
941                        }
942                    }
943                    break;
944                case 0x0021/*! skip java thai modifier reordering*/:
945                    break;
946                case 0x002F/*'/'*/:
947                    wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
948                    inChars = FALSE; /* we're now processing expansion */
949                    break;
950                case 0x005C /* back slash for escaped chars */:
951                    isEscaped = TRUE;
952                    break;
953                    /* found a quote, we're gonna start copying */
954                case 0x0027/*'\''*/:
955                    if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
956                        *status = U_INVALID_FORMAT_ERROR;
957                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
958                        return NULL;
959                        // enabling rules to start with a non-token character a < b
960                        // newStrength = UCOL_TOK_RESET;
961                    }
962
963                    inQuote = TRUE;
964
965                    if(inChars) { /* we're doing characters */
966                        if(wasInQuote == FALSE) {
967                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
968                        }
969                        if (src->parsedToken.charsLen != 0) {
970                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
971                        }
972                        src->parsedToken.charsLen++;
973                    } else { /* we're doing an expansion */
974                        if(wasInQuote == FALSE) {
975                            extensionOffset = (uint32_t)(src->extraCurrent - src->source);
976                        }
977                        if (newExtensionLen != 0) {
978                            ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
979                        }
980                        newExtensionLen++;
981                    }
982
983                    wasInQuote = TRUE;
984
985                    ch = *(++(src->current));
986                    if(ch == 0x0027) { /* copy the double quote */
987                        ucol_tok_addToExtraCurrent(src, &ch, 1, status);
988                        inQuote = FALSE;
989                    }
990                    break;
991
992                    /* '@' is french only if the strength is not currently set */
993                    /* if it is, it's just a regular character in collation rules */
994                case 0x0040/*'@'*/:
995                    if (newStrength == UCOL_TOK_UNSET) {
996                        src->opts->frenchCollation = UCOL_ON;
997                        break;
998                    }
999
1000                case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1001                    // we want to store read characters to the prefix part and continue reading
1002                    // the characters (proper way would be to restart reading the chars, but in
1003                    // that case we would have to complicate the token hasher, which I do not
1004                    // intend to play with. Instead, we will do prefixes when prefixes are due
1005                    // (before adding the elements).
1006                    src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1007                    src->parsedToken.prefixLen = src->parsedToken.charsLen;
1008
1009                    if(inChars) { /* we're doing characters */
1010                        if(wasInQuote == FALSE) {
1011                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1012                        }
1013                        if (src->parsedToken.charsLen != 0) {
1014                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1015                        }
1016                        src->parsedToken.charsLen++;
1017                    }
1018
1019                    wasInQuote = TRUE;
1020
1021                    do {
1022                        ch = *(++(src->current));
1023                        // skip whitespace between '|' and the character
1024                    } while (uprv_isRuleWhiteSpace(ch));
1025                    break;
1026
1027                    //charsOffset = 0;
1028                    //newCharsLen = 0;
1029                    //break; // We want to store the whole prefix/character sequence. If we break
1030                    // the '|' is going to get lost.
1031                case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1032                    do {
1033                        ch = *(++(src->current));
1034                    } while (!isCharNewLine(ch));
1035
1036                    break;
1037                default:
1038                    if (newStrength == UCOL_TOK_UNSET) {
1039                        *status = U_INVALID_FORMAT_ERROR;
1040                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1041                        return NULL;
1042                    }
1043
1044                    if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1045                        *status = U_INVALID_FORMAT_ERROR;
1046                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1047                        return NULL;
1048                    }
1049
1050                    if(ch == 0x0000 && src->current+1 == src->end) {
1051                        break;
1052                    }
1053
1054                    if (inChars) {
1055                        if(src->parsedToken.charsLen == 0) {
1056                            src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1057                        }
1058                        src->parsedToken.charsLen++;
1059                    } else {
1060                        if(newExtensionLen == 0) {
1061                            extensionOffset = (uint32_t)(src->current - src->source);
1062                        }
1063                        newExtensionLen++;
1064                    }
1065
1066                    break;
1067                }
1068            }
1069        }
1070
1071        if(wasInQuote) {
1072            if(ch != 0x27) {
1073                if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1074                    ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1075                }
1076            }
1077        }
1078
1079        src->current++;
1080    }
1081
1082EndOfLoop:
1083    wasInQuote = FALSE;
1084    if (newStrength == UCOL_TOK_UNSET) {
1085        return NULL;
1086    }
1087
1088    if (src->parsedToken.charsLen == 0 && top == FALSE) {
1089        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1090        *status = U_INVALID_FORMAT_ERROR;
1091        return NULL;
1092    }
1093
1094    src->parsedToken.strength = newStrength;
1095    src->parsedToken.extensionOffset = extensionOffset;
1096    src->parsedToken.extensionLen = newExtensionLen;
1097    src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1098
1099    return src->current;
1100}
1101
1102/*
1103Processing Description
11041 Build a ListList. Each list has a header, which contains two lists (positive
1105and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1106reset may be null.
11072 As you process, you keep a LAST pointer that points to the last token you
1108handled.
1109*/
1110
1111static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
1112                                      UParseError *parseError, UErrorCode *status)
1113{
1114    if(src->resultLen == src->listCapacity) {
1115        // Unfortunately, this won't work, as we store addresses of lhs in token
1116        src->listCapacity *= 2;
1117        src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1118        if(src->lh == NULL) {
1119            *status = U_MEMORY_ALLOCATION_ERROR;
1120            return NULL;
1121        }
1122    }
1123    /* do the reset thing */
1124    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1125    /* test for NULL */
1126    if (sourceToken == NULL) {
1127        *status = U_MEMORY_ALLOCATION_ERROR;
1128        return NULL;
1129    }
1130    sourceToken->rulesToParse = src->source;
1131    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1132    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1133
1134    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1135    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1136
1137    // keep the flags around so that we know about before
1138    sourceToken->flags = src->parsedToken.flags;
1139
1140    if(src->parsedToken.prefixOffset != 0) {
1141        // this is a syntax error
1142        *status = U_INVALID_FORMAT_ERROR;
1143        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1144        uprv_free(sourceToken);
1145        return 0;
1146    } else {
1147        sourceToken->prefix = 0;
1148    }
1149
1150    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1151    sourceToken->strength = UCOL_TOK_RESET;
1152    sourceToken->next = NULL;
1153    sourceToken->previous = NULL;
1154    sourceToken->noOfCEs = 0;
1155    sourceToken->noOfExpCEs = 0;
1156    sourceToken->listHeader = &src->lh[src->resultLen];
1157
1158    src->lh[src->resultLen].first = NULL;
1159    src->lh[src->resultLen].last = NULL;
1160    src->lh[src->resultLen].first = NULL;
1161    src->lh[src->resultLen].last = NULL;
1162
1163    src->lh[src->resultLen].reset = sourceToken;
1164
1165    /*
1166    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1167    First convert all expansions into normal form. Examples:
1168    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1169    d * ... into &x * c/y * d * ...
1170    Note: reset values can never have expansions, although they can cause the
1171    very next item to have one. They may be contractions, if they are found
1172    earlier in the list.
1173    */
1174    *expandNext = 0;
1175    if(expand != NULL) {
1176        /* check to see if there is an expansion */
1177        if(src->parsedToken.charsLen > 1) {
1178            uint32_t resetCharsOffset;
1179            resetCharsOffset = (uint32_t)(expand - src->source);
1180            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1181            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1182        }
1183    }
1184
1185    src->resultLen++;
1186
1187    uhash_put(src->tailored, sourceToken, sourceToken, status);
1188
1189    return sourceToken;
1190}
1191
1192static
1193inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1194    if(U_FAILURE(*status)) {
1195        return NULL;
1196    }
1197    /* this is a virgin before - we need to fish the anchor from the UCA */
1198    collIterate s;
1199    uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1200    uint32_t CE, SecondCE;
1201    uint32_t invPos;
1202    if(sourceToken != NULL) {
1203        uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
1204    } else {
1205        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
1206    }
1207
1208    baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1209    baseContCE = ucol_getNextCE(src->UCA, &s, status);
1210    if(baseContCE == UCOL_NO_MORE_CES) {
1211        baseContCE = 0;
1212    }
1213
1214
1215    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1216    uint32_t ch = 0;
1217    uint32_t expandNext = 0;
1218    UColToken key;
1219
1220    if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1221        uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1222        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1223        ch = uprv_uca_getCodePointFromRaw(raw-1);
1224        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1225        CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1226        SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1227
1228        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1229        *src->extraCurrent++ = 0xFFFE;
1230        *src->extraCurrent++ = (UChar)ch;
1231        src->parsedToken.charsLen++;
1232
1233        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1234        key.rulesToParse = src->source;
1235
1236        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1237        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1238
1239        if(sourceToken == NULL) {
1240            src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1241            if(isContinuation(SecondCE)) {
1242                src->lh[src->resultLen].baseContCE = SecondCE;
1243            } else {
1244                src->lh[src->resultLen].baseContCE = 0;
1245            }
1246            src->lh[src->resultLen].nextCE = 0;
1247            src->lh[src->resultLen].nextContCE = 0;
1248            src->lh[src->resultLen].previousCE = 0;
1249            src->lh[src->resultLen].previousContCE = 0;
1250
1251            src->lh[src->resultLen].indirect = FALSE;
1252
1253            sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1254        }
1255
1256    } else {
1257        invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1258
1259        // we got the previous CE. Now we need to see if the difference between
1260        // the two CEs is really of the requested strength.
1261        // if it's a bigger difference (we asked for secondary and got primary), we
1262        // need to modify the CE.
1263        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1264            // adjust the strength
1265            // now we are in the situation where our baseCE should actually be modified in
1266            // order to get the CE in the right position.
1267            if(strength == UCOL_SECONDARY) {
1268                CE = baseCE - 0x0200;
1269            } else { // strength == UCOL_TERTIARY
1270                CE = baseCE - 0x02;
1271            }
1272            if(baseContCE) {
1273                if(strength == UCOL_SECONDARY) {
1274                    SecondCE = baseContCE - 0x0200;
1275                } else { // strength == UCOL_TERTIARY
1276                    SecondCE = baseContCE - 0x02;
1277                }
1278            }
1279        }
1280
1281#if 0
1282        // the code below relies on getting a code point from the inverse table, in order to be
1283        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1284        // 1. There are many code points that have the same CE
1285        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1286        // Also, in case when there is no equivalent strength before an element, we have to actually
1287        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1288        // before a is a primary difference.
1289
1290        //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1291
1292
1293        ch = CETable[3*invPos+2];
1294
1295        if((ch &  UCOL_INV_SIZEMASK) != 0) {
1296            uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1297            uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1298            ch = conts[offset];
1299        }
1300
1301        *src->extraCurrent++ = (UChar)ch;
1302        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1303        src->parsedToken.charsLen = 1;
1304
1305        // We got an UCA before. However, this might have been tailored.
1306        // example:
1307        // &\u30ca = \u306a
1308        // &[before 3]\u306a<<<\u306a|\u309d
1309
1310
1311        // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1312        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1313        key.rulesToParse = src->source;
1314
1315        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1316        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1317#endif
1318
1319        // here is how it should be. The situation such as &[before 1]a < x, should be
1320        // resolved exactly as if we wrote &a > x.
1321        // therefore, I don't really care if the UCA value before a has been changed.
1322        // However, I do care if the strength between my element and the previous element
1323        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1324        // have to construct the base CE.
1325
1326
1327
1328        // if we found a tailored thing, we have to use the UCA value and construct
1329        // a new reset token with constructed name
1330        //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1331        // character to which we want to anchor is already tailored.
1332        // We need to construct a new token which will be the anchor
1333        // point
1334        //*(src->extraCurrent-1) = 0xFFFE;
1335        //*src->extraCurrent++ = (UChar)ch;
1336        // grab before
1337        src->parsedToken.charsOffset -= 10;
1338        src->parsedToken.charsLen += 10;
1339        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1340        if(isContinuation(SecondCE)) {
1341            src->lh[src->resultLen].baseContCE = SecondCE;
1342        } else {
1343            src->lh[src->resultLen].baseContCE = 0;
1344        }
1345        src->lh[src->resultLen].nextCE = 0;
1346        src->lh[src->resultLen].nextContCE = 0;
1347        src->lh[src->resultLen].previousCE = 0;
1348        src->lh[src->resultLen].previousContCE = 0;
1349
1350        src->lh[src->resultLen].indirect = FALSE;
1351
1352        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1353        //}
1354    }
1355
1356    return sourceToken;
1357
1358}
1359
1360uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1361    UColToken *lastToken = NULL;
1362    const UChar *parseEnd = NULL;
1363    uint32_t expandNext = 0;
1364    UBool variableTop = FALSE;
1365    UBool top = FALSE;
1366    uint16_t specs = 0;
1367    UColTokListHeader *ListList = NULL;
1368
1369    src->parsedToken.strength = UCOL_TOK_UNSET;
1370
1371    ListList = src->lh;
1372
1373    if(U_FAILURE(*status)) {
1374        return 0;
1375    }
1376
1377    while(src->current < src->end) {
1378        src->parsedToken.prefixOffset = 0;
1379
1380        parseEnd = ucol_tok_parseNextToken(src,
1381            (UBool)(lastToken == NULL),
1382            parseError,
1383            status);
1384
1385        specs = src->parsedToken.flags;
1386
1387
1388        variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1389        top = ((specs & UCOL_TOK_TOP) != 0);
1390
1391        if(U_SUCCESS(*status) && parseEnd != NULL) {
1392            UColToken *sourceToken = NULL;
1393            //uint32_t key = 0;
1394            uint32_t lastStrength = UCOL_TOK_UNSET;
1395
1396            if(lastToken != NULL ) {
1397                lastStrength = lastToken->strength;
1398            }
1399
1400            //key = newCharsLen << 24 | charsOffset;
1401            UColToken key;
1402            key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1403            key.rulesToParse = src->source;
1404
1405            /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1406            sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1407
1408            if(src->parsedToken.strength != UCOL_TOK_RESET) {
1409                if(lastToken == NULL) { /* this means that rules haven't started properly */
1410                    *status = U_INVALID_FORMAT_ERROR;
1411                    syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1412                    return 0;
1413                }
1414                /*  6 Otherwise (when relation != reset) */
1415                if(sourceToken == NULL) {
1416                    /* If sourceToken is null, create new one, */
1417                    sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1418                    /* test for NULL */
1419                    if (sourceToken == NULL) {
1420                        *status = U_MEMORY_ALLOCATION_ERROR;
1421                        return 0;
1422                    }
1423                    sourceToken->rulesToParse = src->source;
1424                    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1425
1426                    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1427
1428                    sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1429                    sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1430
1431                    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1432                    sourceToken->next = NULL;
1433                    sourceToken->previous = NULL;
1434                    sourceToken->noOfCEs = 0;
1435                    sourceToken->noOfExpCEs = 0;
1436                    // keep the flags around so that we know about before
1437                    sourceToken->flags = src->parsedToken.flags;
1438                    uhash_put(src->tailored, sourceToken, sourceToken, status);
1439                    if(U_FAILURE(*status)) {
1440                        return 0;
1441                    }
1442                } else {
1443                    /* we could have fished out a reset here */
1444                    if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1445                        /* otherwise remove sourceToken from where it was. */
1446                        if(sourceToken->next != NULL) {
1447                            if(sourceToken->next->strength > sourceToken->strength) {
1448                                sourceToken->next->strength = sourceToken->strength;
1449                            }
1450                            sourceToken->next->previous = sourceToken->previous;
1451                        } else {
1452                            sourceToken->listHeader->last = sourceToken->previous;
1453                        }
1454
1455                        if(sourceToken->previous != NULL) {
1456                            sourceToken->previous->next = sourceToken->next;
1457                        } else {
1458                            sourceToken->listHeader->first = sourceToken->next;
1459                        }
1460                        sourceToken->next = NULL;
1461                        sourceToken->previous = NULL;
1462                    }
1463                }
1464
1465                sourceToken->strength = src->parsedToken.strength;
1466                sourceToken->listHeader = lastToken->listHeader;
1467
1468                /*
1469                1.  Find the strongest strength in each list, and set strongestP and strongestN
1470                accordingly in the headers.
1471                */
1472                if(lastStrength == UCOL_TOK_RESET
1473                    || sourceToken->listHeader->first == 0) {
1474                        /* If LAST is a reset
1475                        insert sourceToken in the list. */
1476                        if(sourceToken->listHeader->first == 0) {
1477                            sourceToken->listHeader->first = sourceToken;
1478                            sourceToken->listHeader->last = sourceToken;
1479                        } else { /* we need to find a place for us */
1480                            /* and we'll get in front of the same strength */
1481                            if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1482                                sourceToken->next = sourceToken->listHeader->first;
1483                                sourceToken->next->previous = sourceToken;
1484                                sourceToken->listHeader->first = sourceToken;
1485                                sourceToken->previous = NULL;
1486                            } else {
1487                                lastToken = sourceToken->listHeader->first;
1488                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1489                                    lastToken = lastToken->next;
1490                                }
1491                                if(lastToken->next != NULL) {
1492                                    lastToken->next->previous = sourceToken;
1493                                } else {
1494                                    sourceToken->listHeader->last = sourceToken;
1495                                }
1496                                sourceToken->previous = lastToken;
1497                                sourceToken->next = lastToken->next;
1498                                lastToken->next = sourceToken;
1499                            }
1500                        }
1501                    } else {
1502                        /* Otherwise (when LAST is not a reset)
1503                        if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1504                        otherwise insert before.
1505                        when inserting after or before, search to the next position with the same
1506                        strength in that direction. (This is called postpone insertion).         */
1507                        if(sourceToken != lastToken) {
1508                            if(lastToken->polarity == sourceToken->polarity) {
1509                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1510                                    lastToken = lastToken->next;
1511                                }
1512                                sourceToken->previous = lastToken;
1513                                if(lastToken->next != NULL) {
1514                                    lastToken->next->previous = sourceToken;
1515                                } else {
1516                                    sourceToken->listHeader->last = sourceToken;
1517                                }
1518
1519                                sourceToken->next = lastToken->next;
1520                                lastToken->next = sourceToken;
1521                            } else {
1522                                while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1523                                    lastToken = lastToken->previous;
1524                                }
1525                                sourceToken->next = lastToken;
1526                                if(lastToken->previous != NULL) {
1527                                    lastToken->previous->next = sourceToken;
1528                                } else {
1529                                    sourceToken->listHeader->first = sourceToken;
1530                                }
1531                                sourceToken->previous = lastToken->previous;
1532                                lastToken->previous = sourceToken;
1533                            }
1534                        } else { /* repeated one thing twice in rules, stay with the stronger strength */
1535                            if(lastStrength < sourceToken->strength) {
1536                                sourceToken->strength = lastStrength;
1537                            }
1538                        }
1539                    }
1540
1541                    /* if the token was a variable top, we're gonna put it in */
1542                    if(variableTop == TRUE && src->varTop == NULL) {
1543                        variableTop = FALSE;
1544                        src->varTop = sourceToken;
1545                    }
1546
1547                    // Treat the expansions.
1548                    // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1549                    // (&abc * d * e <=> &ab * d / c * e / c)
1550                    // if both of them are in effect for a token, they are combined.
1551
1552                    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1553
1554                    if(expandNext != 0) {
1555                        if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1556                            expandNext = 0;
1557                        } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1558                            sourceToken->expansion = expandNext;
1559                        } else { /* there is both explicit and implicit expansion. We need to make a combination */
1560                            uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1561                            uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1562                            sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1563                            src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1564                        }
1565                    }
1566
1567                    // This is just for debugging purposes
1568                    if(sourceToken->expansion != 0) {
1569                        sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1570                    } else {
1571                        sourceToken->debugExpansion = 0;
1572                    }
1573                    // if the previous token was a reset before, the strength of this
1574                    // token must match the strength of before. Otherwise we have an
1575                    // undefined situation.
1576                    // In other words, we currently have a cludge which we use to
1577                    // represent &a >> x. This is written as &[before 2]a << x.
1578                    if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1579                        uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1580                        if(beforeStrength != sourceToken->strength) {
1581                            *status = U_INVALID_FORMAT_ERROR;
1582                            syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1583                            return 0;
1584                        }
1585                    }
1586            } else {
1587                if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1588                    /* if the previous token was also a reset, */
1589                    /*this means that we have two consecutive resets */
1590                    /* and we want to remove the previous one if empty*/
1591                    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1592                        src->resultLen--;
1593                    }
1594                }
1595
1596                if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1597                    uint32_t searchCharsLen = src->parsedToken.charsLen;
1598                    while(searchCharsLen > 1 && sourceToken == NULL) {
1599                        searchCharsLen--;
1600                        //key = searchCharsLen << 24 | charsOffset;
1601                        UColToken key;
1602                        key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1603                        key.rulesToParse = src->source;
1604                        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1605                    }
1606                    if(sourceToken != NULL) {
1607                        expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1608                    }
1609                }
1610
1611                if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1612                    if(top == FALSE) { /* there is no indirection */
1613                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1614                        if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1615                            /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1616                            while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1617                                sourceToken = sourceToken->previous;
1618                            }
1619                            /* here, either we hit the strength or NULL */
1620                            if(sourceToken->strength == strength) {
1621                                if(sourceToken->previous != NULL) {
1622                                    sourceToken = sourceToken->previous;
1623                                } else { /* start of list */
1624                                    sourceToken = sourceToken->listHeader->reset;
1625                                }
1626                            } else { /* we hit NULL */
1627                                /* we should be doing the else part */
1628                                sourceToken = sourceToken->listHeader->reset;
1629                                sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1630                            }
1631                        } else {
1632                            sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1633                        }
1634                    } else { /* this is both before and indirection */
1635                        top = FALSE;
1636                        ListList[src->resultLen].previousCE = 0;
1637                        ListList[src->resultLen].previousContCE = 0;
1638                        ListList[src->resultLen].indirect = TRUE;
1639                        /* we need to do slightly more work. we need to get the baseCE using the */
1640                        /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1641                        /* in ucol_bld */
1642                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1643                        uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1644                        uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1645                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1646
1647                        UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1648                        if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1649                            uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1650                            uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1651                            uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1652                            CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1653                            SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1654                        } else {
1655                            /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1656                            ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1657                        }
1658
1659                        ListList[src->resultLen].baseCE = CE;
1660                        ListList[src->resultLen].baseContCE = SecondCE;
1661                        ListList[src->resultLen].nextCE = 0;
1662                        ListList[src->resultLen].nextContCE = 0;
1663
1664                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1665                    }
1666                }
1667
1668
1669                /*  5 If the relation is a reset:
1670                If sourceToken is null
1671                Create new list, create new sourceToken, make the baseCE from source, put
1672                the sourceToken in ListHeader of the new list */
1673                if(sourceToken == NULL) {
1674                    /*
1675                    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1676                    First convert all expansions into normal form. Examples:
1677                    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1678                    d * ... into &x * c/y * d * ...
1679                    Note: reset values can never have expansions, although they can cause the
1680                    very next item to have one. They may be contractions, if they are found
1681                    earlier in the list.
1682                    */
1683                    if(top == FALSE) {
1684                        collIterate s;
1685                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1686
1687                        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
1688
1689                        CE = ucol_getNextCE(src->UCA, &s, status);
1690                        UChar *expand = s.pos;
1691                        SecondCE = ucol_getNextCE(src->UCA, &s, status);
1692
1693                        ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1694                        if(isContinuation(SecondCE)) {
1695                            ListList[src->resultLen].baseContCE = SecondCE;
1696                        } else {
1697                            ListList[src->resultLen].baseContCE = 0;
1698                        }
1699                        ListList[src->resultLen].nextCE = 0;
1700                        ListList[src->resultLen].nextContCE = 0;
1701                        ListList[src->resultLen].previousCE = 0;
1702                        ListList[src->resultLen].previousContCE = 0;
1703                        ListList[src->resultLen].indirect = FALSE;
1704                        sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1705                    } else { /* top == TRUE */
1706                        /* just use the supplied values */
1707                        top = FALSE;
1708                        ListList[src->resultLen].previousCE = 0;
1709                        ListList[src->resultLen].previousContCE = 0;
1710                        ListList[src->resultLen].indirect = TRUE;
1711                        ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1712                        ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1713                        ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1714                        ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1715
1716                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1717
1718                    }
1719                } else { /* reset to something already in rules */
1720                    top = FALSE;
1721                }
1722            }
1723            /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
1724            lastToken = sourceToken;
1725        } else {
1726            if(U_FAILURE(*status)) {
1727                return 0;
1728            }
1729        }
1730    }
1731
1732    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1733        src->resultLen--;
1734    }
1735    return src->resultLen;
1736}
1737
1738void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
1739    U_NAMESPACE_USE
1740
1741    uint32_t nSize = 0;
1742    uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1743    if(U_FAILURE(*status)) {
1744        return;
1745    }
1746
1747    // set everything to zero, so that we can clean up gracefully
1748    uprv_memset(src, 0, sizeof(UColTokenParser));
1749
1750    // first we need to find options that don't like to be normalized,
1751    // like copy and remove...
1752    //const UChar *openBrace = rules;
1753    int32_t optionNumber = -1;
1754    const UChar *setStart = NULL;
1755    uint32_t i = 0;
1756    while(i < rulesLength) {
1757        if(rules[i] == 0x005B) {
1758            // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1759            //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1760            optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1761            if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1762                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1763                if(U_SUCCESS(*status)) {
1764                    if(src->copySet == NULL) {
1765                        src->copySet = newSet;
1766                    } else {
1767                        uset_addAll(src->copySet, newSet);
1768                        uset_close(newSet);
1769                    }
1770                } else {
1771                    return;
1772                }
1773            } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1774                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1775                if(U_SUCCESS(*status)) {
1776                    if(src->removeSet == NULL) {
1777                        src->removeSet = newSet;
1778                    } else {
1779                        uset_addAll(src->removeSet, newSet);
1780                        uset_close(newSet);
1781                    }
1782                } else {
1783                    return;
1784                }
1785            }
1786        }
1787        //openBrace++;
1788        i++;
1789    }
1790
1791    src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1792    /* test for NULL */
1793    if (src->source == NULL) {
1794        *status = U_MEMORY_ALLOCATION_ERROR;
1795        return;
1796    }
1797    uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
1798    nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1799    if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1800        *status = U_ZERO_ERROR;
1801        src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1802        /* test for NULL */
1803        if (src->source == NULL) {
1804            *status = U_MEMORY_ALLOCATION_ERROR;
1805            return;
1806        }
1807        nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1808    }
1809    src->current = src->source;
1810    src->end = src->source+nSize;
1811    src->sourceCurrent = src->source;
1812    src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
1813    src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1814    src->varTop = NULL;
1815    src->UCA = UCA;
1816    src->invUCA = ucol_initInverseUCA(status);
1817    src->parsedToken.charsLen = 0;
1818    src->parsedToken.charsOffset = 0;
1819    src->parsedToken.extensionLen = 0;
1820    src->parsedToken.extensionOffset = 0;
1821    src->parsedToken.prefixLen = 0;
1822    src->parsedToken.prefixOffset = 0;
1823    src->parsedToken.flags = 0;
1824    src->parsedToken.strength = UCOL_TOK_UNSET;
1825    src->buildCCTabFlag = FALSE;
1826
1827    if(U_FAILURE(*status)) {
1828        return;
1829    }
1830    src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
1831    if(U_FAILURE(*status)) {
1832        return;
1833    }
1834    uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1835
1836    src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
1837    /* test for NULL */
1838    if (src->opts == NULL) {
1839        *status = U_MEMORY_ALLOCATION_ERROR;
1840        return;
1841    }
1842
1843    uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1844
1845    // rulesToParse = src->source;
1846    src->lh = 0;
1847    src->listCapacity = 1024;
1848    src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1849    //Test for NULL
1850    if (src->lh == NULL) {
1851        *status = U_MEMORY_ALLOCATION_ERROR;
1852        return;
1853    }
1854    uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
1855    src->resultLen = 0;
1856
1857    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1858
1859    // UCOL_RESET_TOP_VALUE
1860    setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1861    // UCOL_FIRST_PRIMARY_IGNORABLE
1862    setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1863    // UCOL_LAST_PRIMARY_IGNORABLE
1864    setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1865    // UCOL_FIRST_SECONDARY_IGNORABLE
1866    setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1867    // UCOL_LAST_SECONDARY_IGNORABLE
1868    setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1869    // UCOL_FIRST_TERTIARY_IGNORABLE
1870    setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1871    // UCOL_LAST_TERTIARY_IGNORABLE
1872    setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1873    // UCOL_FIRST_VARIABLE
1874    setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1875    // UCOL_LAST_VARIABLE
1876    setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1877    // UCOL_FIRST_NON_VARIABLE
1878    setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1879    // UCOL_LAST_NON_VARIABLE
1880    setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1881    // UCOL_FIRST_IMPLICIT
1882    setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1883    // UCOL_LAST_IMPLICIT
1884    setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1885    // UCOL_FIRST_TRAILING
1886    setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1887    // UCOL_LAST_TRAILING
1888    setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1889    ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
1890}
1891
1892
1893void ucol_tok_closeTokenList(UColTokenParser *src) {
1894    if(src->copySet != NULL) {
1895        uset_close(src->copySet);
1896    }
1897    if(src->removeSet != NULL) {
1898        uset_close(src->removeSet);
1899    }
1900    if(src->tailored != NULL) {
1901        uhash_close(src->tailored);
1902    }
1903    if(src->lh != NULL) {
1904        uprv_free(src->lh);
1905    }
1906    if(src->source != NULL) {
1907        uprv_free(src->source);
1908    }
1909    if(src->opts != NULL) {
1910        uprv_free(src->opts);
1911    }
1912}
1913
1914#endif /* #if !UCONFIG_NO_COLLATION */
1915
1916