1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucol_tok.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created 02/22/2001
14*   created by: Vladimir Weinstein
15*
16* This module reads a tailoring rule string and produces a list of
17* tokens that will be turned into collation elements
18*
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "unicode/uscript.h"
26#include "unicode/ustring.h"
27#include "unicode/uchar.h"
28#include "unicode/uniset.h"
29
30#include "cmemory.h"
31#include "cstring.h"
32#include "patternprops.h"
33#include "ucol_bld.h"
34#include "ucol_tok.h"
35#include "ulocimp.h"
36#include "uresimp.h"
37
38// Define this only for debugging.
39// #define DEBUG_FOR_COLL_RULES 1
40
41#ifdef DEBUG_FOR_COLL_RULES
42#include <iostream>
43#endif
44
45U_NAMESPACE_USE
46
47U_CDECL_BEGIN
48static int32_t U_CALLCONV
49uhash_hashTokens(const UHashTok k)
50{
51    int32_t hash = 0;
52    //uint32_t key = (uint32_t)k.integer;
53    UColToken *key = (UColToken *)k.pointer;
54    if (key != 0) {
55        int32_t len = (key->source & 0xFF000000)>>24;
56        int32_t inc = ((len - 32) / 32) + 1;
57
58        const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
59        const UChar *limit = p + len;
60
61        while (p<limit) {
62            hash = (hash * 37) + *p;
63            p += inc;
64        }
65    }
66    return hash;
67}
68
69static UBool U_CALLCONV
70uhash_compareTokens(const UHashTok key1, const UHashTok key2)
71{
72    //uint32_t p1 = (uint32_t) key1.integer;
73    //uint32_t p2 = (uint32_t) key2.integer;
74    UColToken *p1 = (UColToken *)key1.pointer;
75    UColToken *p2 = (UColToken *)key2.pointer;
76    const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
77    const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
78    uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
79    uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
80    const UChar *end = s1+s1L-1;
81
82    if (p1 == p2) {
83        return TRUE;
84    }
85    if (p1->source == 0 || p2->source == 0) {
86        return FALSE;
87    }
88    if(s1L != s2L) {
89        return FALSE;
90    }
91    if(p1->source == p2->source) {
92        return TRUE;
93    }
94    while((s1 < end) && *s1 == *s2) {
95        ++s1;
96        ++s2;
97    }
98    if(*s1 == *s2) {
99        return TRUE;
100    } else {
101        return FALSE;
102    }
103}
104U_CDECL_END
105
/*
 * Debug messages used to pinpoint where a format error occurred.
 * A better way is to include context-sensitive information in the syntaxError() function.
 *
 * To turn this debugging on, either uncomment the following line, or pass
 * -DDEBUG_FOR_FORMAT_ERROR on the compile line.
 */
/* #define DEBUG_FOR_FORMAT_ERROR 1 */

#ifdef DEBUG_FOR_FORMAT_ERROR
/* Prints the source line number of the place where the macro is expanded. */
/* The expansion is a braced block, so no trailing semicolon is required.  */
#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
#else
/* Expands to nothing when format-error debugging is disabled. */
#define DBG_FORMAT_ERROR
#endif
120
121
/*
 * Controls debug messages so that the output can be compared before and after a
 * big change.  Prints the information of every code point that comes out of the
 * collation parser and its strength into a file.  When a big change in format
 * happens, the files before and after the change should be identical.
 *
 * To turn this debugging on, either uncomment the following line, or pass
 * -DDEBUG_FOR_CODE_POINTS on the compile line.
 */
// #define DEBUG_FOR_CODE_POINTS 1

#ifdef DEBUG_FOR_CODE_POINTS
    // File handle for the code-point debug output described above; starts out NULL.
    // NOTE(review): where it is opened and closed is not visible in this part of the file.
    FILE* dfcp_fp = NULL;
#endif
136
137
138/*static inline void U_CALLCONV
139uhash_freeBlockWrapper(void *obj) {
140    uhash_freeBlock(obj);
141}*/
142
143
/* One entry of the indirect-positioning table (see ucolIndirectBoundaries).   */
/* setIndirectBoundaries() fills the start pair from start[0]/start[1] and the */
/* limit pair from end[0]/end[1], or with zeros when no end is supplied.       */
typedef struct {
    uint32_t startCE;       /* start[0] as passed to setIndirectBoundaries() */
    uint32_t startContCE;   /* start[1]; ucol_tok_doSetTop() treats 0 as "no second value" */
    uint32_t limitCE;       /* end[0], or 0 when there is no end */
    uint32_t limitContCE;   /* end[1], or 0 when there is no end */
} indirectBoundaries;
150
/* These values are used for finding CE values for indirect positioning.   */
/* Indirect positioning is a mechanism for allowing resets on symbolic     */
/* values. It only works for resets, and you cannot tailor indirect names. */
/* An indirect name can define either an anchor point or a range. An       */
/* anchor point behaves in exactly the same way as a code point in a reset */
/* would, except that it cannot be tailored. A range (currently only the   */
/* [top] range is known) will explicitly set the upper bound for           */
/* generated CEs, thus allowing for better control over how many CEs can   */
/* be squeezed into the range without a performance penalty.               */
/* In that respect, we use [top] for tailoring of locales that use CJK     */
/* characters. Other indirect values are currently a pure convenience;     */
/* they can be used to ensure that the CEs will always be positioned in    */
/* the same place relative to a point with known properties (e.g. first    */
/* primary ignorable).                                                     */
/*                                                                         */
/* Slot 0 is used for [top]; the [first ...] and [last ...] options index  */
/* the table with (option - OPTION_FIRST + 1 + 2 * suboptionIndex), which  */
/* for 7 sub-options gives indexes 1..14, hence 15 entries.                */
static indirectBoundaries ucolIndirectBoundaries[15];
166/*
167static indirectBoundaries ucolIndirectBoundaries[11] = {
168{ UCOL_RESET_TOP_VALUE,               0,
169UCOL_NEXT_TOP_VALUE,                0 },
170{ UCOL_FIRST_PRIMARY_IGNORABLE,       0,
1710,                                  0 },
172{ UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
1730,                                  0 },
174{ UCOL_FIRST_SECONDARY_IGNORABLE,     0,
1750,                                  0 },
176{ UCOL_LAST_SECONDARY_IGNORABLE,      0,
1770,                                  0 },
178{ UCOL_FIRST_TERTIARY_IGNORABLE,      0,
1790,                                  0 },
180{ UCOL_LAST_TERTIARY_IGNORABLE,       0,
1810,                                  0 },
182{ UCOL_FIRST_VARIABLE,                0,
1830,                                  0 },
184{ UCOL_LAST_VARIABLE,                 0,
1850,                                  0 },
186{ UCOL_FIRST_NON_VARIABLE,            0,
1870,                                  0 },
188{ UCOL_LAST_NON_VARIABLE,             0,
1890,                                  0 },
190};
191*/
192
193static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
194
195    // Set values for the top - TODO: once we have values for all the indirects, we are going
196    // to initalize here.
197    ucolIndirectBoundaries[indexR].startCE = start[0];
198    ucolIndirectBoundaries[indexR].startContCE = start[1];
199    if(end) {
200        ucolIndirectBoundaries[indexR].limitCE = end[0];
201        ucolIndirectBoundaries[indexR].limitContCE = end[1];
202    } else {
203        ucolIndirectBoundaries[indexR].limitCE = 0;
204        ucolIndirectBoundaries[indexR].limitContCE = 0;
205    }
206}
207
208
209static inline
210void syntaxError(const UChar* rules,
211                 int32_t pos,
212                 int32_t rulesLen,
213                 UParseError* parseError)
214{
215    parseError->offset = pos;
216    parseError->line = 0 ; /* we are not using line numbers */
217
218    // for pre-context
219    int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
220    int32_t stop  = pos;
221
222    u_memcpy(parseError->preContext,rules+start,stop-start);
223    //null terminate the buffer
224    parseError->preContext[stop-start] = 0;
225
226    //for post-context
227    start = pos+1;
228    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
229    rulesLen;
230
231    if(start < stop) {
232        u_memcpy(parseError->postContext,rules+start,stop-start);
233        //null terminate the buffer
234        parseError->postContext[stop-start]= 0;
235    } else {
236        parseError->postContext[0] = 0;
237    }
238}
239
240static
241void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
242    switch(attrib) {
243    case UCOL_HIRAGANA_QUATERNARY_MODE:
244        opts->hiraganaQ = value;
245        break;
246    case UCOL_FRENCH_COLLATION:
247        opts->frenchCollation = value;
248        break;
249    case UCOL_ALTERNATE_HANDLING:
250        opts->alternateHandling = value;
251        break;
252    case UCOL_CASE_FIRST:
253        opts->caseFirst = value;
254        break;
255    case UCOL_CASE_LEVEL:
256        opts->caseLevel = value;
257        break;
258    case UCOL_NORMALIZATION_MODE:
259        opts->normalizationMode = value;
260        break;
261    case UCOL_STRENGTH:
262        opts->strength = value;
263        break;
264    case UCOL_NUMERIC_COLLATION:
265        opts->numericCollation = value;
266        break;
267    case UCOL_ATTRIBUTE_COUNT:
268    default:
269        break;
270    }
271}
272
/* Number of entries in rulesOptions[]; one per option_NN string declared below. */
#define UTOK_OPTION_COUNT 22

/* Set to TRUE by ucol_uprv_tok_initData() once the strings below have been initialized. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */

/* Sub-option (argument) names. Each declaration gives the name and its length; */
/* ucol_uprv_tok_initData() must initialize each one with the same literal.     */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted",        7);

U_STRING_DECL(suboption_02, "lower",          5);
U_STRING_DECL(suboption_03, "upper",          5);
U_STRING_DECL(suboption_04, "off",            3);
U_STRING_DECL(suboption_05, "on",             2);
U_STRING_DECL(suboption_06, "1",              1);
U_STRING_DECL(suboption_07, "2",              1);
U_STRING_DECL(suboption_08, "3",              1);
U_STRING_DECL(suboption_09, "4",              1);
U_STRING_DECL(suboption_10, "I",              1);

U_STRING_DECL(suboption_11, "primary",        7);
U_STRING_DECL(suboption_12, "secondary",      9);
U_STRING_DECL(suboption_13, "tertiary",       8);
U_STRING_DECL(suboption_14, "variable",       8);
U_STRING_DECL(suboption_15, "regular",        7);
U_STRING_DECL(suboption_16, "implicit",       8);
U_STRING_DECL(suboption_17, "trailing",       8);


/* Option names. The numbering here is declaration order only; the order in   */
/* which options are matched is the order of the rulesOptions[] table.        */
U_STRING_DECL(option_00,    "undefined",      9);
U_STRING_DECL(option_01,    "rearrange",      9);
U_STRING_DECL(option_02,    "alternate",      9);
U_STRING_DECL(option_03,    "backwards",      9);
U_STRING_DECL(option_04,    "variable top",  12);
U_STRING_DECL(option_05,    "top",            3);
U_STRING_DECL(option_06,    "normalization", 13);
U_STRING_DECL(option_07,    "caseLevel",      9);
U_STRING_DECL(option_08,    "caseFirst",      9);
U_STRING_DECL(option_09,    "scriptOrder",   11);
U_STRING_DECL(option_10,    "charsetname",   11);
U_STRING_DECL(option_11,    "charset",        7);
U_STRING_DECL(option_12,    "before",         6);
U_STRING_DECL(option_13,    "hiraganaQ",      9);
U_STRING_DECL(option_14,    "strength",       8);
U_STRING_DECL(option_15,    "first",          5);
U_STRING_DECL(option_16,    "last",           4);
U_STRING_DECL(option_17,    "optimize",       8);
U_STRING_DECL(option_18,    "suppressContractions",         20);
U_STRING_DECL(option_19,    "numericOrdering",              15);
U_STRING_DECL(option_20,    "import",         6);
U_STRING_DECL(option_21,    "reorder",         7);
323
324/*
325[last variable] last variable value
326[last primary ignorable] largest CE for primary ignorable
327[last secondary ignorable] largest CE for secondary ignorable
328[last tertiary ignorable] largest CE for tertiary ignorable
329[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
330*/
331
332
/* Sub-option tables. Each entry is {name, name length, attribute value}.     */
/* rulesOptions[] refers to these tables together with their entry counts.    */

/* Arguments of "alternate". */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01,  7, UCOL_SHIFTED}
};

/* Arguments of "caseFirst". */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03,  5, UCOL_UPPER_FIRST},
    {suboption_04,  3, UCOL_OFF},
};

/* Arguments shared by the on/off options (caseLevel, normalization, hiraganaQ, numericOrdering). */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* The only argument of "backwards": "2", which maps to UCOL_ON. */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* Arguments of "before": "1", "2", "3". */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* Arguments of "strength": "1" to "4" and "I". */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* Arguments of "first" and "last". Every entry carries UCOL_PRIMARY;         */
/* ucol_uprv_tok_readAndSetOption() uses the index of the matched entry, not  */
/* its attribute value, to compute the indirect index.                        */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
376
/* Symbolic names for the indexes into rulesOptions[]; the enumerators must   */
/* stay in the same order as the rows of that table. The index returned by    */
/* ucol_uprv_tok_readOption() is switched on using these names.               */
enum OptionNumber {
    /* Options that map directly to a collation attribute. */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* Alias for the last attribute-backed option (same value as OPTION_NUMERIC_COLLATION). */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    /* Options whose rulesOptions[] row carries UCOL_ATTRIBUTE_COUNT as its attribute. */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
} ;
402
/* Table of recognized options, matched in this order by a case-insensitive   */
/* prefix comparison. Each row is {name, name length, sub-option table,       */
/* sub-option count, attribute}. Rows without a collation attribute use       */
/* UCOL_ATTRIBUTE_COUNT. Because matching is by prefix, "charsetname" has to  */
/* come before "charset". Row order must match enum OptionNumber.             */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
    /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
    /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
};
427
428static
429int32_t u_strncmpNoCase(const UChar     *s1,
430                        const UChar     *s2,
431                        int32_t     n)
432{
433    if(n > 0) {
434        int32_t rc;
435        for(;;) {
436            rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
437            if(rc != 0 || *s1 == 0 || --n == 0) {
438                return rc;
439            }
440            ++s1;
441            ++s2;
442        }
443    }
444    return 0;
445}
446
447static
448void ucol_uprv_tok_initData() {
449    if(!didInit) {
450        U_STRING_INIT(suboption_00, "non-ignorable", 13);
451        U_STRING_INIT(suboption_01, "shifted",        7);
452
453        U_STRING_INIT(suboption_02, "lower",          5);
454        U_STRING_INIT(suboption_03, "upper",          5);
455        U_STRING_INIT(suboption_04, "off",            3);
456        U_STRING_INIT(suboption_05, "on",             2);
457
458        U_STRING_INIT(suboption_06, "1",              1);
459        U_STRING_INIT(suboption_07, "2",              1);
460        U_STRING_INIT(suboption_08, "3",              1);
461        U_STRING_INIT(suboption_09, "4",              1);
462        U_STRING_INIT(suboption_10, "I",              1);
463
464        U_STRING_INIT(suboption_11, "primary",        7);
465        U_STRING_INIT(suboption_12, "secondary",      9);
466        U_STRING_INIT(suboption_13, "tertiary",       8);
467        U_STRING_INIT(suboption_14, "variable",       8);
468        U_STRING_INIT(suboption_15, "regular",        7);
469        U_STRING_INIT(suboption_16, "implicit",       8);
470        U_STRING_INIT(suboption_17, "trailing",       8);
471
472
473        U_STRING_INIT(option_00, "undefined",      9);
474        U_STRING_INIT(option_01, "rearrange",      9);
475        U_STRING_INIT(option_02, "alternate",      9);
476        U_STRING_INIT(option_03, "backwards",      9);
477        U_STRING_INIT(option_04, "variable top",  12);
478        U_STRING_INIT(option_05, "top",            3);
479        U_STRING_INIT(option_06, "normalization", 13);
480        U_STRING_INIT(option_07, "caseLevel",      9);
481        U_STRING_INIT(option_08, "caseFirst",      9);
482        U_STRING_INIT(option_09, "scriptOrder",   11);
483        U_STRING_INIT(option_10, "charsetname",   11);
484        U_STRING_INIT(option_11, "charset",        7);
485        U_STRING_INIT(option_12, "before",         6);
486        U_STRING_INIT(option_13, "hiraganaQ",      9);
487        U_STRING_INIT(option_14, "strength",       8);
488        U_STRING_INIT(option_15, "first",          5);
489        U_STRING_INIT(option_16, "last",           4);
490        U_STRING_INIT(option_17, "optimize",       8);
491        U_STRING_INIT(option_18, "suppressContractions",         20);
492        U_STRING_INIT(option_19, "numericOrdering",      15);
493        U_STRING_INIT(option_20, "import ",        6);
494        U_STRING_INIT(option_21, "reorder",        7);
495        didInit = TRUE;
496    }
497}
498
499
500// This function reads basic options to set in the runtime collator
501// used by data driven tests. Should not support build time options
502U_CAPI const UChar * U_EXPORT2
503ucol_tok_getNextArgument(const UChar *start, const UChar *end,
504                         UColAttribute *attrib, UColAttributeValue *value,
505                         UErrorCode *status)
506{
507    uint32_t i = 0;
508    int32_t j=0;
509    UBool foundOption = FALSE;
510    const UChar *optionArg = NULL;
511
512    ucol_uprv_tok_initData();
513
514    while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
515        start++;
516    }
517    if(start >= end) {
518        return NULL;
519    }
520    /* skip opening '[' */
521    if(*start == 0x005b) {
522        start++;
523    } else {
524        *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
525        return NULL;
526    }
527
528    while(i < UTOK_OPTION_COUNT) {
529        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
530            foundOption = TRUE;
531            if(end - start > rulesOptions[i].optionLen) {
532                optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
533                while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
534                    optionArg++;
535                }
536            }
537            break;
538        }
539        i++;
540    }
541
542    if(!foundOption) {
543        *status = U_ILLEGAL_ARGUMENT_ERROR;
544        return NULL;
545    }
546
547    if(optionArg) {
548        for(j = 0; j<rulesOptions[i].subSize; j++) {
549            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
550                //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
551                *attrib = rulesOptions[i].attr;
552                *value = rulesOptions[i].subopts[j].attrVal;
553                optionArg += rulesOptions[i].subopts[j].subLen;
554                while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
555                    optionArg++;
556                }
557                if(*optionArg == 0x005d) {
558                    optionArg++;
559                    return optionArg;
560                } else {
561                    *status = U_ILLEGAL_ARGUMENT_ERROR;
562                    return NULL;
563                }
564            }
565        }
566    }
567    *status = U_ILLEGAL_ARGUMENT_ERROR;
568    return NULL;
569}
570
571static
572USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
573    while(*start != 0x005b) { /* advance while we find the first '[' */
574        start++;
575    }
576    // now we need to get a balanced set of '[]'. The problem is that a set can have
577    // many, and *end point to the first closing '['
578    int32_t noOpenBraces = 1;
579    int32_t current = 1; // skip the opening brace
580    while(start+current < end && noOpenBraces != 0) {
581        if(start[current] == 0x005b) {
582            noOpenBraces++;
583        } else if(start[current] == 0x005D) { // closing brace
584            noOpenBraces--;
585        }
586        current++;
587    }
588
589    if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
590        *status = U_ILLEGAL_ARGUMENT_ERROR;
591        return NULL;
592    }
593    return uset_openPattern(start, current, status);
594}
595
596/**
597 * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
598 * @param start Pointer to the start UChar.
599 * @param end Pointer to the last valid pointer beyond which the option will not extend.
600 * @param optionArg Address of the pointer at which the options start (after the option name)
601 * @return The index of the option, or -1 if the option is not valid.
602 */
603static
604int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
605    int32_t i = 0;
606    ucol_uprv_tok_initData();
607
608    while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
609        start++;
610    }
611    while(i < UTOK_OPTION_COUNT) {
612        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
613            if(end - start > rulesOptions[i].optionLen) {
614                *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
615                while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
616                    (*optionArg)++;
617                }
618            }
619            break;
620        }
621        i++;
622    }
623    if(i == UTOK_OPTION_COUNT) {
624        i = -1; // didn't find an option
625    }
626    return i;
627}
628
629
630static
631void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
632    int32_t codeCount = 0;
633    int32_t codeIndex = 0;
634    char conversion[64];
635    int32_t tokenLength = 0;
636    const UChar* space;
637
638    const UChar* current = src->current;
639    const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
640
641    // eat leading whitespace
642    while(current < end && u_isWhitespace(*current)) {
643        current++;
644    }
645
646    while(current < end) {
647        space = u_memchr(current, 0x0020, end - current);
648        space = space == 0 ? end : space;
649        tokenLength = space - current;
650        if (tokenLength < 4) {
651            *status = U_INVALID_FORMAT_ERROR;
652            return;
653        }
654        codeCount++;
655        current += tokenLength;
656        while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
657            ++current;
658        }
659    }
660
661    if (codeCount == 0) {
662        *status = U_INVALID_FORMAT_ERROR;
663    }
664
665    src->reorderCodesLength = codeCount;
666    src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
667    current = src->current;
668
669    // eat leading whitespace
670    while(current < end && u_isWhitespace(*current)) {
671        current++;
672    }
673
674    while(current < end) {
675        space = u_memchr(current, 0x0020, end - current);
676        space = space == 0 ? end : space;
677        tokenLength = space - current;
678        if (tokenLength < 4) {
679            *status = U_ILLEGAL_ARGUMENT_ERROR;
680            return;
681        } else {
682            u_UCharsToChars(current, conversion, tokenLength);
683            conversion[tokenLength] = '\0';
684            src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
685            if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
686                src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
687            }
688            if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
689                *status = U_ILLEGAL_ARGUMENT_ERROR;
690            }
691        }
692        codeIndex++;
693        current += tokenLength;
694        while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
695            ++current;
696        }
697    }
698}
699
700// reads and conforms to various options in rules
701// end is the position of the first closing ']'
702// However, some of the options take an UnicodeSet definition
703// which needs to duplicate the closing ']'
704// for example: '[copy [\uAC00-\uD7FF]]'
705// These options will move end to the second ']' and the
706// caller will set the current to it.
707static
708uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
709    const UChar* start = src->current;
710    int32_t i = 0;
711    int32_t j=0;
712    const UChar *optionArg = NULL;
713
714    uint8_t result = 0;
715
716    start++; /*skip opening '['*/
717    i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
718    if(optionArg) {
719        src->current = optionArg;
720    }
721
722    if(i < 0) {
723        *status = U_ILLEGAL_ARGUMENT_ERROR;
724    } else {
725        int32_t noOpenBraces = 1;
726        switch(i) {
727    case OPTION_ALTERNATE_HANDLING:
728    case OPTION_FRENCH_COLLATION:
729    case OPTION_CASE_LEVEL:
730    case OPTION_CASE_FIRST:
731    case OPTION_NORMALIZATION_MODE:
732    case OPTION_HIRAGANA_QUATERNARY:
733    case OPTION_STRENGTH:
734    case OPTION_NUMERIC_COLLATION:
735        if(optionArg) {
736            for(j = 0; j<rulesOptions[i].subSize; j++) {
737                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
738                    ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
739                    result =  UCOL_TOK_SUCCESS;
740                }
741            }
742        }
743        if(result == 0) {
744            *status = U_ILLEGAL_ARGUMENT_ERROR;
745        }
746        break;
747    case OPTION_VARIABLE_TOP:
748        result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
749        break;
750    case OPTION_REARRANGE:
751        result = UCOL_TOK_SUCCESS;
752        break;
753    case OPTION_BEFORE:
754        if(optionArg) {
755            for(j = 0; j<rulesOptions[i].subSize; j++) {
756                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
757                    result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
758                }
759            }
760        }
761        if(result == 0) {
762            *status = U_ILLEGAL_ARGUMENT_ERROR;
763        }
764        break;
765    case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
766        /* index to this array will be src->parsedToken.indirectIndex*/
767        src->parsedToken.indirectIndex = 0;
768        result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
769        break;
770    case OPTION_FIRST:
771    case OPTION_LAST: /* first, last */
772        for(j = 0; j<rulesOptions[i].subSize; j++) {
773            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
774                // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
775                // element of indirect boundaries is reserved for top.
776                src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
777                result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
778            }
779        }
780        if(result == 0) {
781            *status = U_ILLEGAL_ARGUMENT_ERROR;
782        }
783        break;
784    case OPTION_OPTIMIZE:
785    case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
786        // we need to move end here
787        src->current++; // skip opening brace
788        while(src->current < src->end && noOpenBraces != 0) {
789            if(*src->current == 0x005b) {
790                noOpenBraces++;
791            } else if(*src->current == 0x005D) { // closing brace
792                noOpenBraces--;
793            }
794            src->current++;
795        }
796        result = UCOL_TOK_SUCCESS;
797        break;
798    case OPTION_SCRIPTREORDER:
799        ucol_tok_parseScriptReorder(src, status);
800        break;
801    default:
802        *status = U_UNSUPPORTED_ERROR;
803        break;
804        }
805    }
806    src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
807    return result;
808}
809
810
811inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
812    if (stuff == NULL || len <= 0) {
813        return;
814    }
815    UnicodeString tempStuff(FALSE, stuff, len);
816    if(src->extraCurrent+len >= src->extraEnd) {
817        /* reallocate */
818        if (stuff >= src->source && stuff <= src->end) {
819            // Copy the "stuff" contents into tempStuff's own buffer.
820            // UnicodeString is copy-on-write.
821            if (len > 0) {
822                tempStuff.setCharAt(0, tempStuff[0]);
823            } else {
824                tempStuff.remove();
825            }
826        }
827        UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
828        if(newSrc != NULL) {
829            src->current = newSrc + (src->current - src->source);
830            src->extraCurrent = newSrc + (src->extraCurrent - src->source);
831            src->end = newSrc + (src->end - src->source);
832            src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
833            src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
834            src->source = newSrc;
835        } else {
836            *status = U_MEMORY_ALLOCATION_ERROR;
837            return;
838        }
839    }
840    if(len == 1) {
841        *src->extraCurrent++ = tempStuff[0];
842    } else {
843        u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
844        src->extraCurrent += len;
845    }
846}
847
848inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
849    /*
850    top = TRUE;
851    */
852    UChar buff[5];
853    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
854    buff[0] = 0xFFFE;
855    buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
856    buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
857    if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
858        src->parsedToken.charsLen = 3;
859        ucol_tok_addToExtraCurrent(src, buff, 3, status);
860    } else {
861        buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
862        buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
863        src->parsedToken.charsLen = 5;
864        ucol_tok_addToExtraCurrent(src, buff, 5, status);
865    }
866    return TRUE;
867}
868
869static UBool isCharNewLine(UChar c){
870    switch(c){
871    case 0x000A: /* LF  */
872    case 0x000D: /* CR  */
873    case 0x000C: /* FF  */
874    case 0x0085: /* NEL */
875    case 0x2028: /* LS  */
876    case 0x2029: /* PS  */
877        return TRUE;
878    default:
879        return FALSE;
880    }
881}
882
883/*
884 * This function is called several times when a range is processed.  Each time, the next code point
885 * is processed.
886 * The following variables must be set before calling this function:
887 *   src->currentRangeCp:  The current code point to process.
888 *   src->lastRangeCp: The last code point in the range.
889 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
890 */
891static const UChar*
892ucol_tok_processNextCodePointInRange(UColTokenParser *src,
893                                     UErrorCode *status)
894{
895  // Append current code point to source
896  UChar buff[U16_MAX_LENGTH];
897  uint32_t i = 0;
898
899  uint32_t nChars = U16_LENGTH(src->currentRangeCp);
900  src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
901  src->parsedToken.charsLen = nChars;
902
903  U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
904  ucol_tok_addToExtraCurrent(src, buff, nChars, status);
905
906  ++src->currentRangeCp;
907  if (src->currentRangeCp > src->lastRangeCp) {
908    src->inRange = FALSE;
909
910    if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
911      src->isStarred = FALSE;
912    }
913  } else {
914    src->previousCp = src->currentRangeCp;
915  }
916  return src->current;
917}
918
919/*
920 * This function is called several times when a starred list is processed.  Each time, the next code point
921 * in the list is processed.
922 * The following variables must be set before calling this function:
923 *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
924 *   src->lastStarredCharIndex: Index to the last character in the list.
925 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
926 */
927static const UChar*
928ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
929{
930  // Extract the characters corresponding to the next code point.
931  UChar32 cp;
932  src->parsedToken.charsOffset = src->currentStarredCharIndex;
933  int32_t prev = src->currentStarredCharIndex;
934  U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
935  src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
936
937  // When we are done parsing the starred string, turn the flag off so that
938  // the normal processing is restored.
939  if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
940    src->isStarred = FALSE;
941  }
942  src->previousCp = cp;
943  return src->current;
944}
945
946/*
947 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
948 *
949 * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
950 *  # : Comment character
951 *  & : Reset operator
952 *  = : Equality
953 *  < : Primary collation
954 *  << : Secondary collation
955 *  <<< : Tertiary collation
956 *  ; : Secondary collation
957 *  , : Tertiary collation
958 *  / : Expansions
959 *  | : Prefix
960 *  - : Range
961
962 *  ! : Java Thai modifier, ignored
963 *  @ : French only
964
965 * [] : Options
966 * '' : Quotes
967 *
968 *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
969 *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
970 *  This function do not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as three tokens - "&a",
971 *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
972 *  character returned as cached so that the calling program can do further splitting.
973 */
974static const UChar*
975ucol_tok_parseNextTokenInternal(UColTokenParser *src,
976                                UBool startOfRules,
977                                UParseError *parseError,
978                                UErrorCode *status)
979{
980    UBool variableTop = FALSE;
981    UBool top = FALSE;
982    UBool inChars = TRUE;
983    UBool inQuote = FALSE;
984    UBool wasInQuote = FALSE;
985    uint8_t before = 0;
986    UBool isEscaped = FALSE;
987
988    // TODO: replace these variables with src->parsedToken counterparts
989    // no need to use them anymore since we have src->parsedToken.
990    // Ideally, token parser would be a nice class... Once, when I have
991    // more time (around 2020 probably).
992    uint32_t newExtensionLen = 0;
993    uint32_t extensionOffset = 0;
994    uint32_t newStrength = UCOL_TOK_UNSET;
995    UChar buff[10];
996
997    src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
998    src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
999    src->parsedToken.indirectIndex = 0;
1000
1001    while (src->current < src->end) {
1002        UChar ch = *(src->current);
1003
1004        if (inQuote) {
1005            if (ch == 0x0027/*'\''*/) {
1006                inQuote = FALSE;
1007            } else {
1008                if ((src->parsedToken.charsLen == 0) || inChars) {
1009                    if(src->parsedToken.charsLen == 0) {
1010                        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1011                    }
1012                    src->parsedToken.charsLen++;
1013                } else {
1014                    if(newExtensionLen == 0) {
1015                        extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1016                    }
1017                    newExtensionLen++;
1018                }
1019            }
1020        }else if(isEscaped){
1021            isEscaped =FALSE;
1022            if (newStrength == UCOL_TOK_UNSET) {
1023                *status = U_INVALID_FORMAT_ERROR;
1024                syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1025                DBG_FORMAT_ERROR
1026                return NULL;
1027                // enabling rules to start with non-tokens a < b
1028                // newStrength = UCOL_TOK_RESET;
1029            }
1030            if(ch != 0x0000  && src->current != src->end) {
1031                if (inChars) {
1032                    if(src->parsedToken.charsLen == 0) {
1033                        src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1034                    }
1035                    src->parsedToken.charsLen++;
1036                } else {
1037                    if(newExtensionLen == 0) {
1038                        extensionOffset = (uint32_t)(src->current - src->source);
1039                    }
1040                    newExtensionLen++;
1041                }
1042            }
1043        }else {
1044            if(!PatternProps::isWhiteSpace(ch)) {
1045                /* Sets the strength for this entry */
1046                switch (ch) {
1047                case 0x003D/*'='*/ :
1048                    if (newStrength != UCOL_TOK_UNSET) {
1049                        goto EndOfLoop;
1050                    }
1051
1052                    /* if we start with strength, we'll reset to top */
1053                    if(startOfRules == TRUE) {
1054                        src->parsedToken.indirectIndex = 5;
1055                        top = ucol_tok_doSetTop(src, status);
1056                        newStrength = UCOL_TOK_RESET;
1057                        goto EndOfLoop;
1058                    }
1059                    newStrength = UCOL_IDENTICAL;
1060                    if(*(src->current+1) == 0x002A) {/*'*'*/
1061                        src->current++;
1062                        src->isStarred = TRUE;
1063                    }
1064                    break;
1065
1066                case 0x002C/*','*/:
1067                    if (newStrength != UCOL_TOK_UNSET) {
1068                        goto EndOfLoop;
1069                    }
1070
1071                    /* if we start with strength, we'll reset to top */
1072                    if(startOfRules == TRUE) {
1073                        src->parsedToken.indirectIndex = 5;
1074                        top = ucol_tok_doSetTop(src, status);
1075                        newStrength = UCOL_TOK_RESET;
1076                        goto EndOfLoop;
1077                    }
1078                    newStrength = UCOL_TERTIARY;
1079                    break;
1080
1081                case  0x003B/*';'*/:
1082                    if (newStrength != UCOL_TOK_UNSET) {
1083                        goto EndOfLoop;
1084                    }
1085
1086                    /* if we start with strength, we'll reset to top */
1087                    if(startOfRules == TRUE) {
1088                        src->parsedToken.indirectIndex = 5;
1089                        top = ucol_tok_doSetTop(src, status);
1090                        newStrength = UCOL_TOK_RESET;
1091                        goto EndOfLoop;
1092                    }
1093                    newStrength = UCOL_SECONDARY;
1094                    break;
1095
1096                case 0x003C/*'<'*/:
1097                    if (newStrength != UCOL_TOK_UNSET) {
1098                        goto EndOfLoop;
1099                    }
1100
1101                    /* if we start with strength, we'll reset to top */
1102                    if(startOfRules == TRUE) {
1103                        src->parsedToken.indirectIndex = 5;
1104                        top = ucol_tok_doSetTop(src, status);
1105                        newStrength = UCOL_TOK_RESET;
1106                        goto EndOfLoop;
1107                    }
1108                    /* before this, do a scan to verify whether this is */
1109                    /* another strength */
1110                    if(*(src->current+1) == 0x003C) {
1111                        src->current++;
1112                        if(*(src->current+1) == 0x003C) {
1113                            src->current++; /* three in a row! */
1114                            newStrength = UCOL_TERTIARY;
1115                        } else { /* two in a row */
1116                            newStrength = UCOL_SECONDARY;
1117                        }
1118                    } else { /* just one */
1119                        newStrength = UCOL_PRIMARY;
1120                    }
1121                    if(*(src->current+1) == 0x002A) {/*'*'*/
1122                        src->current++;
1123                        src->isStarred = TRUE;
1124                    }
1125                    break;
1126
1127                case 0x0026/*'&'*/:
1128                    if (newStrength != UCOL_TOK_UNSET) {
1129                        /**/
1130                        goto EndOfLoop;
1131                    }
1132
1133                    newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
1134                    break;
1135
1136                case 0x005b/*'['*/:
1137                    /* options - read an option, analyze it */
1138                    if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
1139                        uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
1140                        if(U_SUCCESS(*status)) {
1141                            if(result & UCOL_TOK_TOP) {
1142                                if(newStrength == UCOL_TOK_RESET) {
1143                                    top = ucol_tok_doSetTop(src, status);
1144                                    if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1145                                        src->parsedToken.charsLen+=2;
1146                                        buff[0] = 0x002d;
1147                                        buff[1] = before;
1148                                        ucol_tok_addToExtraCurrent(src, buff, 2, status);
1149                                    }
1150
1151                                    src->current++;
1152                                    goto EndOfLoop;
1153                                } else {
1154                                    *status = U_INVALID_FORMAT_ERROR;
1155                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1156                                    DBG_FORMAT_ERROR
1157                                }
1158                            } else if(result & UCOL_TOK_VARIABLE_TOP) {
1159                                if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
1160                                    variableTop = TRUE;
1161                                    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1162                                    src->parsedToken.charsLen = 1;
1163                                    buff[0] = 0xFFFF;
1164                                    ucol_tok_addToExtraCurrent(src, buff, 1, status);
1165                                    src->current++;
1166                                    goto EndOfLoop;
1167                                } else {
1168                                    *status = U_INVALID_FORMAT_ERROR;
1169                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1170                                    DBG_FORMAT_ERROR
1171                                }
1172                            } else if (result & UCOL_TOK_BEFORE){
1173                                if(newStrength == UCOL_TOK_RESET) {
1174                                    before = result & UCOL_TOK_BEFORE;
1175                                } else {
1176                                    *status = U_INVALID_FORMAT_ERROR;
1177                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1178                                    DBG_FORMAT_ERROR
1179                                }
1180                            }
1181                        } else {
1182                            *status = U_INVALID_FORMAT_ERROR;
1183                            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1184                            DBG_FORMAT_ERROR
1185                            return NULL;
1186                        }
1187                    }
1188                    break;
1189                case 0x0021/*! skip java thai modifier reordering*/:
1190                    break;
1191                case 0x002F/*'/'*/:
1192                    wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
1193                    inChars = FALSE; /* we're now processing expansion */
1194                    break;
1195                case 0x005C /* back slash for escaped chars */:
1196                    isEscaped = TRUE;
1197                    break;
1198                    /* found a quote, we're gonna start copying */
1199                case 0x0027/*'\''*/:
1200                    if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
1201                      *status = U_INVALID_FORMAT_ERROR;
1202                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1203                      DBG_FORMAT_ERROR
1204                      return NULL;
1205                      // enabling rules to start with a non-token character a < b
1206                      // newStrength = UCOL_TOK_RESET;
1207                    }
1208
1209                    inQuote = TRUE;
1210
1211                    if(inChars) { /* we're doing characters */
1212                        if(wasInQuote == FALSE) {
1213                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1214                        }
1215                        if (src->parsedToken.charsLen != 0) {
1216                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1217                        }
1218                        src->parsedToken.charsLen++;
1219                    } else { /* we're doing an expansion */
1220                        if(wasInQuote == FALSE) {
1221                            extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1222                        }
1223                        if (newExtensionLen != 0) {
1224                            ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
1225                        }
1226                        newExtensionLen++;
1227                    }
1228
1229                    wasInQuote = TRUE;
1230
1231                    ch = *(++(src->current));
1232                    if(ch == 0x0027) { /* copy the double quote */
1233                        ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1234                        inQuote = FALSE;
1235                    }
1236                    break;
1237
1238                    /* '@' is french only if the strength is not currently set */
1239                    /* if it is, it's just a regular character in collation rules */
1240                case 0x0040/*'@'*/:
1241                    if (newStrength == UCOL_TOK_UNSET) {
1242                        src->opts->frenchCollation = UCOL_ON;
1243                        break;
1244                    }
1245
1246                case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1247                    // we want to store read characters to the prefix part and continue reading
1248                    // the characters (proper way would be to restart reading the chars, but in
1249                    // that case we would have to complicate the token hasher, which I do not
1250                    // intend to play with. Instead, we will do prefixes when prefixes are due
1251                    // (before adding the elements).
1252                    src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1253                    src->parsedToken.prefixLen = src->parsedToken.charsLen;
1254
1255                    if(inChars) { /* we're doing characters */
1256                        if(wasInQuote == FALSE) {
1257                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1258                        }
1259                        if (src->parsedToken.charsLen != 0) {
1260                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1261                        }
1262                        src->parsedToken.charsLen++;
1263                    }
1264
1265                    wasInQuote = TRUE;
1266
1267                    do {
1268                        ch = *(++(src->current));
1269                        // skip whitespace between '|' and the character
1270                    } while (PatternProps::isWhiteSpace(ch));
1271                    break;
1272
1273                    //charsOffset = 0;
1274                    //newCharsLen = 0;
1275                    //break; // We want to store the whole prefix/character sequence. If we break
1276                    // the '|' is going to get lost.
1277
1278                case 0x002D /*-*/: /* A range. */
1279                    if (newStrength != UCOL_TOK_UNSET) {
1280                      // While processing the pending token, the isStarred field
1281                      // is reset, so it needs to be saved for the next
1282                      // invocation.
1283                      src->savedIsStarred = src->isStarred;
1284                      goto EndOfLoop;
1285                   }
1286                   src->isStarred = src->savedIsStarred;
1287
1288                   // Ranges are valid only in starred tokens.
1289                   if (!src->isStarred) {
1290                     *status = U_INVALID_FORMAT_ERROR;
1291                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1292                     DBG_FORMAT_ERROR
1293                     return NULL;
1294                   }
1295                   newStrength = src->parsedToken.strength;
1296                   src->inRange = TRUE;
1297                   break;
1298
1299                case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1300                    do {
1301                        ch = *(++(src->current));
1302                    } while (!isCharNewLine(ch));
1303
1304                    break;
1305                default:
1306                    if (newStrength == UCOL_TOK_UNSET) {
1307                      *status = U_INVALID_FORMAT_ERROR;
1308                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1309                      DBG_FORMAT_ERROR
1310                      return NULL;
1311                    }
1312
1313                    if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1314                        *status = U_INVALID_FORMAT_ERROR;
1315                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1316                        DBG_FORMAT_ERROR
1317                        return NULL;
1318                    }
1319
1320                    if(ch == 0x0000 && src->current+1 == src->end) {
1321                        break;
1322                    }
1323
1324                    if (inChars) {
1325                        if(src->parsedToken.charsLen == 0) {
1326                            src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1327                        }
1328                        src->parsedToken.charsLen++;
1329                    } else {
1330                        if(newExtensionLen == 0) {
1331                            extensionOffset = (uint32_t)(src->current - src->source);
1332                        }
1333                        newExtensionLen++;
1334                    }
1335
1336                    break;
1337                }
1338            }
1339        }
1340
1341        if(wasInQuote) {
1342            if(ch != 0x27) {
1343                if(inQuote || !PatternProps::isWhiteSpace(ch)) {
1344                    ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1345                }
1346            }
1347        }
1348
1349        src->current++;
1350    }
1351
1352EndOfLoop:
1353    wasInQuote = FALSE;
1354    if (newStrength == UCOL_TOK_UNSET) {
1355        return NULL;
1356    }
1357
1358    if (src->parsedToken.charsLen == 0 && top == FALSE) {
1359        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1360        *status = U_INVALID_FORMAT_ERROR;
1361        DBG_FORMAT_ERROR
1362        return NULL;
1363    }
1364
1365    src->parsedToken.strength = newStrength;
1366    src->parsedToken.extensionOffset = extensionOffset;
1367    src->parsedToken.extensionLen = newExtensionLen;
1368    src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1369
1370    return src->current;
1371}
1372
1373/*
1374 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1375 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1376 *
1377 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1378 *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
1379 *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
1380 *     cached as member variables of the token parser.
1381 *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1382 *     starting character as a single list token (which is separated into individual characters here)
1383 *     and as another list token starting with the last character in the range.  Before expanding it
1384 *     as a list of tokens, this function expands the range by filling the intermediate characters and
1385 *     returns them one by one as separate tokens.
1386 * Necessary checks are done for invalid combinations.
1387 */
1388U_CAPI const UChar* U_EXPORT2
1389ucol_tok_parseNextToken(UColTokenParser *src,
1390                        UBool startOfRules,
1391                        UParseError *parseError,
1392                        UErrorCode *status)
1393{
1394  const UChar *nextToken;
1395
1396  if (src->inRange) {
1397    // We are not done processing a range.  Continue it.
1398    return ucol_tok_processNextCodePointInRange(src, status);
1399  } else if (src->isStarred) {
1400    // We are not done processing a starred token.  Continue it.
1401    return ucol_tok_processNextTokenInStarredList(src);
1402  }
1403
1404  // Get the next token.
1405  nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
1406
1407  if (nextToken == NULL) {
1408    return NULL;
1409  }
1410
1411  if (src->inRange) {
1412    // A new range has started.
1413    // Check whether it is a chain of ranges with more than one hyphen.
1414    if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
1415        *status = U_INVALID_FORMAT_ERROR;
1416        syntaxError(src->source,src->parsedToken.charsOffset-1,
1417                    src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
1418        DBG_FORMAT_ERROR
1419        return NULL;
1420    }
1421
1422    // The current token indicates the second code point of the range.
1423    // Process just that, and then proceed with the star.
1424    src->currentStarredCharIndex = src->parsedToken.charsOffset;
1425    U16_NEXT(src->source, src->currentStarredCharIndex,
1426             (uint32_t)(src->end - src->source), src->lastRangeCp);
1427    if (src->lastRangeCp <= src->previousCp) {
1428        *status = U_INVALID_FORMAT_ERROR;
1429        syntaxError(src->source,src->parsedToken.charsOffset-1,
1430                    src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1431        DBG_FORMAT_ERROR
1432        return NULL;
1433    }
1434
1435    // Set current range code point to process the range loop
1436    src->currentRangeCp = src->previousCp + 1;
1437
1438    src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1439
1440    return ucol_tok_processNextCodePointInRange(src, status);
1441 } else if (src->isStarred) {
1442    // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
1443    // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
1444    // separated into several tokens and returned.
1445    src->currentStarredCharIndex = src->parsedToken.charsOffset;
1446    src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1447
1448    return ucol_tok_processNextTokenInStarredList(src);
1449  } else {
1450    // Set previous codepoint
1451    U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
1452  }
1453  return nextToken;
1454}
1455
1456
1457/*
1458Processing Description
14591 Build a ListList. Each list has a header, which contains two lists (positive
1460and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1461reset may be null.
14622 As you process, you keep a LAST pointer that points to the last token you
1463handled.
1464
1465*/
1466
1467static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
1468                                      UParseError *parseError, UErrorCode *status)
1469{
1470    if(src->resultLen == src->listCapacity) {
1471        // Unfortunately, this won't work, as we store addresses of lhs in token
1472        src->listCapacity *= 2;
1473        src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1474        if(src->lh == NULL) {
1475            *status = U_MEMORY_ALLOCATION_ERROR;
1476            return NULL;
1477        }
1478    }
1479    /* do the reset thing */
1480    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1481    /* test for NULL */
1482    if (sourceToken == NULL) {
1483        *status = U_MEMORY_ALLOCATION_ERROR;
1484        return NULL;
1485    }
1486    sourceToken->rulesToParseHdl = &(src->source);
1487    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1488    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1489
1490    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1491    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1492
1493    // keep the flags around so that we know about before
1494    sourceToken->flags = src->parsedToken.flags;
1495
1496    if(src->parsedToken.prefixOffset != 0) {
1497        // this is a syntax error
1498        *status = U_INVALID_FORMAT_ERROR;
1499        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1500        DBG_FORMAT_ERROR
1501        uprv_free(sourceToken);
1502        return 0;
1503    } else {
1504        sourceToken->prefix = 0;
1505    }
1506
1507    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1508    sourceToken->strength = UCOL_TOK_RESET;
1509    sourceToken->next = NULL;
1510    sourceToken->previous = NULL;
1511    sourceToken->noOfCEs = 0;
1512    sourceToken->noOfExpCEs = 0;
1513    sourceToken->listHeader = &src->lh[src->resultLen];
1514
1515    src->lh[src->resultLen].first = NULL;
1516    src->lh[src->resultLen].last = NULL;
1517    src->lh[src->resultLen].first = NULL;
1518    src->lh[src->resultLen].last = NULL;
1519
1520    src->lh[src->resultLen].reset = sourceToken;
1521
1522    /*
1523    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1524    First convert all expansions into normal form. Examples:
1525    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1526    d * ... into &x * c/y * d * ...
1527    Note: reset values can never have expansions, although they can cause the
1528    very next item to have one. They may be contractions, if they are found
1529    earlier in the list.
1530    */
1531    *expandNext = 0;
1532    if(expand != NULL) {
1533        /* check to see if there is an expansion */
1534        if(src->parsedToken.charsLen > 1) {
1535            uint32_t resetCharsOffset;
1536            resetCharsOffset = (uint32_t)(expand - src->source);
1537            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1538            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1539        }
1540    }
1541
1542    src->resultLen++;
1543
1544    uhash_put(src->tailored, sourceToken, sourceToken, status);
1545
1546    return sourceToken;
1547}
1548
1549static
1550inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1551    if(U_FAILURE(*status)) {
1552        return NULL;
1553    }
1554    /* this is a virgin before - we need to fish the anchor from the UCA */
1555    collIterate s;
1556    uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1557    uint32_t CE, SecondCE;
1558    uint32_t invPos;
1559    if(sourceToken != NULL) {
1560        uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
1561    } else {
1562        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
1563    }
1564    if(U_FAILURE(*status)) {
1565        return NULL;
1566    }
1567
1568    baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1569    baseContCE = ucol_getNextCE(src->UCA, &s, status);
1570    if(baseContCE == UCOL_NO_MORE_CES) {
1571        baseContCE = 0;
1572    }
1573
1574
1575    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1576    uint32_t ch = 0;
1577    uint32_t expandNext = 0;
1578    UColToken key;
1579
1580    if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1581        uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
1582        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1583        ch = uprv_uca_getCodePointFromRaw(raw-1);
1584        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1585        CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
1586        SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
1587
1588        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1589        *src->extraCurrent++ = 0xFFFE;
1590        *src->extraCurrent++ = (UChar)ch;
1591        src->parsedToken.charsLen++;
1592
1593        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1594        key.rulesToParseHdl = &(src->source);
1595
1596        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1597        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1598
1599        if(sourceToken == NULL) {
1600            src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1601            if(isContinuation(SecondCE)) {
1602                src->lh[src->resultLen].baseContCE = SecondCE;
1603            } else {
1604                src->lh[src->resultLen].baseContCE = 0;
1605            }
1606            src->lh[src->resultLen].nextCE = 0;
1607            src->lh[src->resultLen].nextContCE = 0;
1608            src->lh[src->resultLen].previousCE = 0;
1609            src->lh[src->resultLen].previousContCE = 0;
1610
1611            src->lh[src->resultLen].indirect = FALSE;
1612
1613            sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1614        }
1615
1616    } else {
1617        invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1618
1619        // we got the previous CE. Now we need to see if the difference between
1620        // the two CEs is really of the requested strength.
1621        // if it's a bigger difference (we asked for secondary and got primary), we
1622        // need to modify the CE.
1623        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1624            // adjust the strength
1625            // now we are in the situation where our baseCE should actually be modified in
1626            // order to get the CE in the right position.
1627            if(strength == UCOL_SECONDARY) {
1628                CE = baseCE - 0x0200;
1629            } else { // strength == UCOL_TERTIARY
1630                CE = baseCE - 0x02;
1631            }
1632            if(baseContCE) {
1633                if(strength == UCOL_SECONDARY) {
1634                    SecondCE = baseContCE - 0x0200;
1635                } else { // strength == UCOL_TERTIARY
1636                    SecondCE = baseContCE - 0x02;
1637                }
1638            }
1639        }
1640
1641#if 0
1642        // the code below relies on getting a code point from the inverse table, in order to be
1643        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1644        // 1. There are many code points that have the same CE
1645        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1646        // Also, in case when there is no equivalent strength before an element, we have to actually
1647        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1648        // before a is a primary difference.
1649
1650        //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1651
1652
1653        ch = CETable[3*invPos+2];
1654
1655        if((ch &  UCOL_INV_SIZEMASK) != 0) {
1656            uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1657            uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1658            ch = conts[offset];
1659        }
1660
1661        *src->extraCurrent++ = (UChar)ch;
1662        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1663        src->parsedToken.charsLen = 1;
1664
1665        // We got an UCA before. However, this might have been tailored.
1666        // example:
1667        // &\u30ca = \u306a
1668        // &[before 3]\u306a<<<\u306a|\u309d
1669
1670
1671        // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1672        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1673        key.rulesToParseHdl = &(src->source);
1674
1675        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1676        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1677#endif
1678
1679        // here is how it should be. The situation such as &[before 1]a < x, should be
1680        // resolved exactly as if we wrote &a > x.
1681        // therefore, I don't really care if the UCA value before a has been changed.
1682        // However, I do care if the strength between my element and the previous element
1683        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1684        // have to construct the base CE.
1685
1686
1687
1688        // if we found a tailored thing, we have to use the UCA value and construct
1689        // a new reset token with constructed name
1690        //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1691        // character to which we want to anchor is already tailored.
1692        // We need to construct a new token which will be the anchor
1693        // point
1694        //*(src->extraCurrent-1) = 0xFFFE;
1695        //*src->extraCurrent++ = (UChar)ch;
1696        // grab before
1697        src->parsedToken.charsOffset -= 10;
1698        src->parsedToken.charsLen += 10;
1699        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1700        if(isContinuation(SecondCE)) {
1701            src->lh[src->resultLen].baseContCE = SecondCE;
1702        } else {
1703            src->lh[src->resultLen].baseContCE = 0;
1704        }
1705        src->lh[src->resultLen].nextCE = 0;
1706        src->lh[src->resultLen].nextContCE = 0;
1707        src->lh[src->resultLen].previousCE = 0;
1708        src->lh[src->resultLen].previousContCE = 0;
1709
1710        src->lh[src->resultLen].indirect = FALSE;
1711
1712        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1713        //}
1714    }
1715
1716    return sourceToken;
1717
1718}
1719
1720uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1721    UColToken *lastToken = NULL;
1722    const UChar *parseEnd = NULL;
1723    uint32_t expandNext = 0;
1724    UBool variableTop = FALSE;
1725    UBool top = FALSE;
1726    uint16_t specs = 0;
1727    UColTokListHeader *ListList = NULL;
1728
1729    src->parsedToken.strength = UCOL_TOK_UNSET;
1730
1731    ListList = src->lh;
1732
1733    if(U_FAILURE(*status)) {
1734        return 0;
1735    }
1736#ifdef DEBUG_FOR_CODE_POINTS
1737    char filename[35];
1738    sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
1739    dfcp_fp = fopen(filename, "a");
1740    fprintf(stdout, "Output is in the file %s.\n", filename);
1741#endif
1742
1743#ifdef DEBUG_FOR_COLL_RULES
1744    std::string s3;
1745    UnicodeString(src->source).toUTF8String(s3);
1746    std::cout << "src->source = " << s3 << std::endl;
1747#endif
1748
1749    while(src->current < src->end || src->isStarred) {
1750        src->parsedToken.prefixOffset = 0;
1751
1752        parseEnd = ucol_tok_parseNextToken(src,
1753            (UBool)(lastToken == NULL),
1754            parseError,
1755            status);
1756
1757        specs = src->parsedToken.flags;
1758
1759
1760        variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1761        top = ((specs & UCOL_TOK_TOP) != 0);
1762
1763        if(U_SUCCESS(*status) && parseEnd != NULL) {
1764            UColToken *sourceToken = NULL;
1765            //uint32_t key = 0;
1766            uint32_t lastStrength = UCOL_TOK_UNSET;
1767
1768            if(lastToken != NULL ) {
1769                lastStrength = lastToken->strength;
1770            }
1771
1772#ifdef DEBUG_FOR_CODE_POINTS
1773            UChar32 cp;
1774            U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
1775            fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
1776#endif
1777            //key = newCharsLen << 24 | charsOffset;
1778            UColToken key;
1779            key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1780            key.rulesToParseHdl = &(src->source);
1781
1782            /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1783            sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1784
1785            if(src->parsedToken.strength != UCOL_TOK_RESET) {
1786                if(lastToken == NULL) { /* this means that rules haven't started properly */
1787                    *status = U_INVALID_FORMAT_ERROR;
1788                    syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1789                    DBG_FORMAT_ERROR
1790                    return 0;
1791                }
1792                /*  6 Otherwise (when relation != reset) */
1793                if(sourceToken == NULL) {
1794                    /* If sourceToken is null, create new one, */
1795                    sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1796                    /* test for NULL */
1797                    if (sourceToken == NULL) {
1798                        *status = U_MEMORY_ALLOCATION_ERROR;
1799                        return 0;
1800                    }
1801                    sourceToken->rulesToParseHdl = &(src->source);
1802                    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1803
1804                    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1805
1806                    sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1807                    sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1808
1809                    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1810                    sourceToken->next = NULL;
1811                    sourceToken->previous = NULL;
1812                    sourceToken->noOfCEs = 0;
1813                    sourceToken->noOfExpCEs = 0;
1814                    // keep the flags around so that we know about before
1815                    sourceToken->flags = src->parsedToken.flags;
1816                    uhash_put(src->tailored, sourceToken, sourceToken, status);
1817                    if(U_FAILURE(*status)) {
1818                        return 0;
1819                    }
1820                } else {
1821                    /* we could have fished out a reset here */
1822                    if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1823                        /* otherwise remove sourceToken from where it was. */
1824                        if(sourceToken->next != NULL) {
1825                            if(sourceToken->next->strength > sourceToken->strength) {
1826                                sourceToken->next->strength = sourceToken->strength;
1827                            }
1828                            sourceToken->next->previous = sourceToken->previous;
1829                        } else {
1830                            sourceToken->listHeader->last = sourceToken->previous;
1831                        }
1832
1833                        if(sourceToken->previous != NULL) {
1834                            sourceToken->previous->next = sourceToken->next;
1835                        } else {
1836                            sourceToken->listHeader->first = sourceToken->next;
1837                        }
1838                        sourceToken->next = NULL;
1839                        sourceToken->previous = NULL;
1840                    }
1841                }
1842
1843                sourceToken->strength = src->parsedToken.strength;
1844                sourceToken->listHeader = lastToken->listHeader;
1845
1846                /*
1847                1.  Find the strongest strength in each list, and set strongestP and strongestN
1848                accordingly in the headers.
1849                */
1850                if(lastStrength == UCOL_TOK_RESET
1851                    || sourceToken->listHeader->first == 0) {
1852                        /* If LAST is a reset
1853                        insert sourceToken in the list. */
1854                        if(sourceToken->listHeader->first == 0) {
1855                            sourceToken->listHeader->first = sourceToken;
1856                            sourceToken->listHeader->last = sourceToken;
1857                        } else { /* we need to find a place for us */
1858                            /* and we'll get in front of the same strength */
1859                            if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1860                                sourceToken->next = sourceToken->listHeader->first;
1861                                sourceToken->next->previous = sourceToken;
1862                                sourceToken->listHeader->first = sourceToken;
1863                                sourceToken->previous = NULL;
1864                            } else {
1865                                lastToken = sourceToken->listHeader->first;
1866                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1867                                    lastToken = lastToken->next;
1868                                }
1869                                if(lastToken->next != NULL) {
1870                                    lastToken->next->previous = sourceToken;
1871                                } else {
1872                                    sourceToken->listHeader->last = sourceToken;
1873                                }
1874                                sourceToken->previous = lastToken;
1875                                sourceToken->next = lastToken->next;
1876                                lastToken->next = sourceToken;
1877                            }
1878                        }
1879                    } else {
1880                        /* Otherwise (when LAST is not a reset)
1881                        if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1882                        otherwise insert before.
1883                        when inserting after or before, search to the next position with the same
1884                        strength in that direction. (This is called postpone insertion).         */
1885                        if(sourceToken != lastToken) {
1886                            if(lastToken->polarity == sourceToken->polarity) {
1887                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1888                                    lastToken = lastToken->next;
1889                                }
1890                                sourceToken->previous = lastToken;
1891                                if(lastToken->next != NULL) {
1892                                    lastToken->next->previous = sourceToken;
1893                                } else {
1894                                    sourceToken->listHeader->last = sourceToken;
1895                                }
1896
1897                                sourceToken->next = lastToken->next;
1898                                lastToken->next = sourceToken;
1899                            } else {
1900                                while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1901                                    lastToken = lastToken->previous;
1902                                }
1903                                sourceToken->next = lastToken;
1904                                if(lastToken->previous != NULL) {
1905                                    lastToken->previous->next = sourceToken;
1906                                } else {
1907                                    sourceToken->listHeader->first = sourceToken;
1908                                }
1909                                sourceToken->previous = lastToken->previous;
1910                                lastToken->previous = sourceToken;
1911                            }
1912                        } else { /* repeated one thing twice in rules, stay with the stronger strength */
1913                            if(lastStrength < sourceToken->strength) {
1914                                sourceToken->strength = lastStrength;
1915                            }
1916                        }
1917                    }
1918
1919                    /* if the token was a variable top, we're gonna put it in */
1920                    if(variableTop == TRUE && src->varTop == NULL) {
1921                        variableTop = FALSE;
1922                        src->varTop = sourceToken;
1923                    }
1924
1925                    // Treat the expansions.
1926                    // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1927                    // (&abc * d * e <=> &ab * d / c * e / c)
1928                    // if both of them are in effect for a token, they are combined.
1929
1930                    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1931
1932                    if(expandNext != 0) {
1933                        if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1934                            expandNext = 0;
1935                        } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1936                            sourceToken->expansion = expandNext;
1937                        } else { /* there is both explicit and implicit expansion. We need to make a combination */
1938                            uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1939                            uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1940                            sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1941                            src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1942                        }
1943                    }
1944
1945                    // This is just for debugging purposes
1946                    if(sourceToken->expansion != 0) {
1947                        sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1948                    } else {
1949                        sourceToken->debugExpansion = 0;
1950                    }
1951                    // if the previous token was a reset before, the strength of this
1952                    // token must match the strength of before. Otherwise we have an
1953                    // undefined situation.
1954                    // In other words, we currently have a cludge which we use to
1955                    // represent &a >> x. This is written as &[before 2]a << x.
1956                    if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1957                        uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1958                        if(beforeStrength != sourceToken->strength) {
1959                            *status = U_INVALID_FORMAT_ERROR;
1960                            syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1961                            DBG_FORMAT_ERROR
1962                            return 0;
1963                        }
1964                    }
1965            } else {
1966                if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1967                    /* if the previous token was also a reset, */
1968                    /*this means that we have two consecutive resets */
1969                    /* and we want to remove the previous one if empty*/
1970                    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1971                        src->resultLen--;
1972                    }
1973                }
1974
1975                if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1976                    uint32_t searchCharsLen = src->parsedToken.charsLen;
1977                    while(searchCharsLen > 1 && sourceToken == NULL) {
1978                        searchCharsLen--;
1979                        //key = searchCharsLen << 24 | charsOffset;
1980                        UColToken key;
1981                        key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1982                        key.rulesToParseHdl = &(src->source);
1983                        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1984                    }
1985                    if(sourceToken != NULL) {
1986                        expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1987                    }
1988                }
1989
1990                if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1991                    if(top == FALSE) { /* there is no indirection */
1992                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1993                        if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1994                            /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1995                            while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1996                                sourceToken = sourceToken->previous;
1997                            }
1998                            /* here, either we hit the strength or NULL */
1999                            if(sourceToken->strength == strength) {
2000                                if(sourceToken->previous != NULL) {
2001                                    sourceToken = sourceToken->previous;
2002                                } else { /* start of list */
2003                                    sourceToken = sourceToken->listHeader->reset;
2004                                }
2005                            } else { /* we hit NULL */
2006                                /* we should be doing the else part */
2007                                sourceToken = sourceToken->listHeader->reset;
2008                                sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2009                            }
2010                        } else {
2011                            sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2012                        }
2013                    } else { /* this is both before and indirection */
2014                        top = FALSE;
2015                        ListList[src->resultLen].previousCE = 0;
2016                        ListList[src->resultLen].previousContCE = 0;
2017                        ListList[src->resultLen].indirect = TRUE;
2018                        /* we need to do slightly more work. we need to get the baseCE using the */
2019                        /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2020                        /* in ucol_bld */
2021                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
2022                        uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2023                        uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
2024                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2025
2026                        UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2027                        if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
2028                           (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
2029                            uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
2030                            uint32_t raw = uprv_uca_getRawFromImplicit(primary);
2031                            uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
2032                            CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
2033                            SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
2034                        } else {
2035                            /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2036                            ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
2037                        }
2038
2039                        ListList[src->resultLen].baseCE = CE;
2040                        ListList[src->resultLen].baseContCE = SecondCE;
2041                        ListList[src->resultLen].nextCE = 0;
2042                        ListList[src->resultLen].nextContCE = 0;
2043
2044                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2045                    }
2046                }
2047
2048
2049                /*  5 If the relation is a reset:
2050                If sourceToken is null
2051                Create new list, create new sourceToken, make the baseCE from source, put
2052                the sourceToken in ListHeader of the new list */
2053                if(sourceToken == NULL) {
2054                    /*
2055                    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2056                    First convert all expansions into normal form. Examples:
2057                    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2058                    d * ... into &x * c/y * d * ...
2059                    Note: reset values can never have expansions, although they can cause the
2060                    very next item to have one. They may be contractions, if they are found
2061                    earlier in the list.
2062                    */
2063                    if(top == FALSE) {
2064                        collIterate s;
2065                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2066
2067                        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
2068
2069                        CE = ucol_getNextCE(src->UCA, &s, status);
2070                        const UChar *expand = s.pos;
2071                        SecondCE = ucol_getNextCE(src->UCA, &s, status);
2072
2073                        ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
2074                        if(isContinuation(SecondCE)) {
2075                            ListList[src->resultLen].baseContCE = SecondCE;
2076                        } else {
2077                            ListList[src->resultLen].baseContCE = 0;
2078                        }
2079                        ListList[src->resultLen].nextCE = 0;
2080                        ListList[src->resultLen].nextContCE = 0;
2081                        ListList[src->resultLen].previousCE = 0;
2082                        ListList[src->resultLen].previousContCE = 0;
2083                        ListList[src->resultLen].indirect = FALSE;
2084                        sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
2085                    } else { /* top == TRUE */
2086                        /* just use the supplied values */
2087                        top = FALSE;
2088                        ListList[src->resultLen].previousCE = 0;
2089                        ListList[src->resultLen].previousContCE = 0;
2090                        ListList[src->resultLen].indirect = TRUE;
2091                        ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2092                        ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
2093                        ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
2094                        ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
2095
2096                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2097
2098                    }
2099                } else { /* reset to something already in rules */
2100                    top = FALSE;
2101                }
2102            }
2103            /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
2104            lastToken = sourceToken;
2105        } else {
2106            if(U_FAILURE(*status)) {
2107                return 0;
2108            }
2109        }
2110    }
2111#ifdef DEBUG_FOR_CODE_POINTS
2112    fclose(dfcp_fp);
2113#endif
2114
2115
2116    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
2117        src->resultLen--;
2118    }
2119    return src->resultLen;
2120}
2121
2122const UChar* ucol_tok_getRulesFromBundle(
2123    void* /*context*/,
2124    const char* locale,
2125    const char* type,
2126    int32_t* pLength,
2127    UErrorCode* status)
2128{
2129    const UChar* rules = NULL;
2130    UResourceBundle* bundle;
2131    UResourceBundle* collations;
2132    UResourceBundle* collation;
2133
2134    *pLength = 0;
2135
2136    bundle = ures_open(U_ICUDATA_COLL, locale, status);
2137    if(U_SUCCESS(*status)){
2138        collations = ures_getByKey(bundle, "collations", NULL, status);
2139        if(U_SUCCESS(*status)){
2140            collation = ures_getByKey(collations, type, NULL, status);
2141            if(U_SUCCESS(*status)){
2142                rules = ures_getStringByKey(collation, "Sequence", pLength, status);
2143                if(U_FAILURE(*status)){
2144                    *pLength = 0;
2145                    rules = NULL;
2146                }
2147                ures_close(collation);
2148            }
2149            ures_close(collations);
2150        }
2151    }
2152
2153    ures_close(bundle);
2154
2155    return rules;
2156}
2157
2158void ucol_tok_initTokenList(
2159    UColTokenParser *src,
2160    const UChar *rules,
2161    uint32_t rulesLength,
2162    const UCollator *UCA,
2163    GetCollationRulesFunction importFunc,
2164    void* context,
2165    UErrorCode *status) {
2166    U_NAMESPACE_USE
2167
2168    uint32_t nSize = 0;
2169    uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
2170
2171    bool needToDeallocRules = false;
2172
2173    if(U_FAILURE(*status)) {
2174        return;
2175    }
2176
2177    // set everything to zero, so that we can clean up gracefully
2178    uprv_memset(src, 0, sizeof(UColTokenParser));
2179
2180    // first we need to find options that don't like to be normalized,
2181    // like copy and remove...
2182    //const UChar *openBrace = rules;
2183    int32_t optionNumber = -1;
2184    const UChar *setStart = NULL;
2185    uint32_t i = 0;
2186    while(i < rulesLength) {
2187        if(rules[i] == 0x005B) {    // '[': start of an option
2188            /* Gets the following:
2189               optionNumber: The index of the option.
2190               setStart: The pointer at which the option arguments start.
2191             */
2192            optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
2193
2194            if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
2195                // [optimize]
2196                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2197                if(U_SUCCESS(*status)) {
2198                    if(src->copySet == NULL) {
2199                        src->copySet = newSet;
2200                    } else {
2201                        uset_addAll(src->copySet, newSet);
2202                        uset_close(newSet);
2203                    }
2204                } else {
2205                    return;
2206                }
2207            } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
2208                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2209                if(U_SUCCESS(*status)) {
2210                    if(src->removeSet == NULL) {
2211                        src->removeSet = newSet;
2212                    } else {
2213                        uset_addAll(src->removeSet, newSet);
2214                        uset_close(newSet);
2215                    }
2216                } else {
2217                    return;
2218                }
2219            } else if(optionNumber == OPTION_IMPORT){
2220                // [import <collation-name>]
2221
2222                // Find the address of the closing ].
2223                UChar* import_end = u_strchr(setStart, 0x005D);
2224                int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
2225                // Ignore trailing whitespace.
2226                while(PatternProps::isWhiteSpace(*(import_end-1))) {
2227                    --import_end;
2228                }
2229
2230                int32_t optionLength = (int32_t)(import_end - setStart);
2231                char option[50];
2232                if(optionLength >= (int32_t)sizeof(option)) {
2233                    *status = U_ILLEGAL_ARGUMENT_ERROR;
2234                    return;
2235                }
2236                u_UCharsToChars(setStart, option, optionLength);
2237                option[optionLength] = 0;
2238
2239                *status = U_ZERO_ERROR;
2240                char locale[50];
2241                int32_t templ;
2242                uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
2243                if(U_FAILURE(*status)) {
2244                    *status = U_ILLEGAL_ARGUMENT_ERROR;
2245                    return;
2246                }
2247
2248                char type[50];
2249                if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
2250                    U_FAILURE(*status)
2251                ) {
2252                    *status = U_ZERO_ERROR;
2253                    uprv_strcpy(type, "standard");
2254                }
2255
2256                // TODO: Use public functions when available, see ticket #8134.
2257                char *keywords = (char *)locale_getKeywordsStart(locale);
2258                if(keywords != NULL) {
2259                    *keywords = 0;
2260                }
2261
2262                int32_t importRulesLength = 0;
2263                const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
2264
2265#ifdef DEBUG_FOR_COLL_RULES
2266                std::string s;
2267                UnicodeString(importRules).toUTF8String(s);
2268                std::cout << "Import rules = " << s << std::endl;
2269#endif
2270
2271                // Add the length of the imported rules to length of the original rules,
2272                // and subtract the length of the import option.
2273                uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
2274
2275                UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
2276
2277#ifdef DEBUG_FOR_COLL_RULES
2278                std::string s1;
2279                UnicodeString(rules).toUTF8String(s1);
2280                std::cout << "Original rules = " << s1 << std::endl;
2281#endif
2282
2283
2284                // Copy the section of the original rules leading up to the import
2285                uprv_memcpy(newRules, rules, i*sizeof(UChar));
2286                // Copy the imported rules
2287                uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
2288                // Copy the rest of the original rules (minus the import option itself)
2289                uprv_memcpy(newRules+i+importRulesLength,
2290                            rules+optionEndOffset,
2291                            (rulesLength-optionEndOffset)*sizeof(UChar));
2292
2293#ifdef DEBUG_FOR_COLL_RULES
2294                std::string s2;
2295                UnicodeString(newRules).toUTF8String(s2);
2296                std::cout << "Resulting rules = " << s2 << std::endl;
2297#endif
2298
2299                if(needToDeallocRules){
2300                    // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2301                    uprv_free((void*)rules);
2302                }
2303                needToDeallocRules = true;
2304                rules = newRules;
2305                rulesLength = newRulesLength;
2306
2307                estimatedSize += importRulesLength*2;
2308
2309                // First character of the new rules needs to be processed
2310                i--;
2311            }
2312        }
2313        //openBrace++;
2314        i++;
2315    }
2316
2317    src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
2318    /* test for NULL */
2319    if (src->source == NULL) {
2320        *status = U_MEMORY_ALLOCATION_ERROR;
2321        return;
2322    }
2323    uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
2324    nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
2325    if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
2326        *status = U_ZERO_ERROR;
2327        src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
2328        /* test for NULL */
2329        if (src->source == NULL) {
2330            *status = U_MEMORY_ALLOCATION_ERROR;
2331            return;
2332        }
2333        nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
2334    }
2335    if(needToDeallocRules){
2336        // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2337        uprv_free((void*)rules);
2338    }
2339
2340
2341    src->current = src->source;
2342    src->end = src->source+nSize;
2343    src->sourceCurrent = src->source;
2344    src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2345    src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2346    src->varTop = NULL;
2347    src->UCA = UCA;
2348    src->invUCA = ucol_initInverseUCA(status);
2349    src->parsedToken.charsLen = 0;
2350    src->parsedToken.charsOffset = 0;
2351    src->parsedToken.extensionLen = 0;
2352    src->parsedToken.extensionOffset = 0;
2353    src->parsedToken.prefixLen = 0;
2354    src->parsedToken.prefixOffset = 0;
2355    src->parsedToken.flags = 0;
2356    src->parsedToken.strength = UCOL_TOK_UNSET;
2357    src->buildCCTabFlag = FALSE;
2358    src->isStarred = FALSE;
2359    src->inRange = FALSE;
2360    src->lastRangeCp = 0;
2361    src->previousCp = 0;
2362
2363    if(U_FAILURE(*status)) {
2364        return;
2365    }
2366    src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
2367    if(U_FAILURE(*status)) {
2368        return;
2369    }
2370    uhash_setValueDeleter(src->tailored, uhash_freeBlock);
2371
2372    src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
2373    /* test for NULL */
2374    if (src->opts == NULL) {
2375        *status = U_MEMORY_ALLOCATION_ERROR;
2376        return;
2377    }
2378
2379    uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
2380
2381    src->lh = 0;
2382    src->listCapacity = 1024;
2383    src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
2384    //Test for NULL
2385    if (src->lh == NULL) {
2386        *status = U_MEMORY_ALLOCATION_ERROR;
2387        return;
2388    }
2389    uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
2390    src->resultLen = 0;
2391
2392    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2393
2394    // UCOL_RESET_TOP_VALUE
2395    setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2396    // UCOL_FIRST_PRIMARY_IGNORABLE
2397    setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
2398    // UCOL_LAST_PRIMARY_IGNORABLE
2399    setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
2400    // UCOL_FIRST_SECONDARY_IGNORABLE
2401    setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
2402    // UCOL_LAST_SECONDARY_IGNORABLE
2403    setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
2404    // UCOL_FIRST_TERTIARY_IGNORABLE
2405    setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
2406    // UCOL_LAST_TERTIARY_IGNORABLE
2407    setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
2408    // UCOL_FIRST_VARIABLE
2409    setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
2410    // UCOL_LAST_VARIABLE
2411    setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
2412    // UCOL_FIRST_NON_VARIABLE
2413    setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
2414    // UCOL_LAST_NON_VARIABLE
2415    setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2416    // UCOL_FIRST_IMPLICIT
2417    setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
2418    // UCOL_LAST_IMPLICIT
2419    setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
2420    // UCOL_FIRST_TRAILING
2421    setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
2422    // UCOL_LAST_TRAILING
2423    setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
2424    ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
2425}
2426
2427
2428void ucol_tok_closeTokenList(UColTokenParser *src) {
2429    if(src->copySet != NULL) {
2430        uset_close(src->copySet);
2431    }
2432    if(src->removeSet != NULL) {
2433        uset_close(src->removeSet);
2434    }
2435    if(src->tailored != NULL) {
2436        uhash_close(src->tailored);
2437    }
2438    if(src->lh != NULL) {
2439        uprv_free(src->lh);
2440    }
2441    if(src->source != NULL) {
2442        uprv_free(src->source);
2443    }
2444    if(src->opts != NULL) {
2445        uprv_free(src->opts);
2446    }
2447    if (src->reorderCodes != NULL) {
2448        uprv_free(src->reorderCodes);
2449    }
2450}
2451
2452#endif /* #if !UCONFIG_NO_COLLATION */
2453