1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucol_tok.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created 02/22/2001
14*   created by: Vladimir Weinstein
15*
16* This module reads a tailoring rule string and produces a list of
17* tokens that will be turned into collation elements
18*
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "unicode/uscript.h"
26#include "unicode/ustring.h"
27#include "unicode/uchar.h"
28#include "unicode/uniset.h"
29
30#include "cmemory.h"
31#include "cstring.h"
32#include "patternprops.h"
33#include "ucol_bld.h"
34#include "ucol_tok.h"
35#include "ulocimp.h"
36#include "uresimp.h"
37
38// Define this only for debugging.
39// #define DEBUG_FOR_COLL_RULES 1
40
41#ifdef DEBUG_FOR_COLL_RULES
42#include <iostream>
43#endif
44
45U_NAMESPACE_USE
46
47U_CDECL_BEGIN
48static int32_t U_CALLCONV
49uhash_hashTokens(const UHashTok k)
50{
51    int32_t hash = 0;
52    //uint32_t key = (uint32_t)k.integer;
53    UColToken *key = (UColToken *)k.pointer;
54    if (key != 0) {
55        int32_t len = (key->source & 0xFF000000)>>24;
56        int32_t inc = ((len - 32) / 32) + 1;
57
58        const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
59        const UChar *limit = p + len;
60
61        while (p<limit) {
62            hash = (hash * 37) + *p;
63            p += inc;
64        }
65    }
66    return hash;
67}
68
69static UBool U_CALLCONV
70uhash_compareTokens(const UHashTok key1, const UHashTok key2)
71{
72    //uint32_t p1 = (uint32_t) key1.integer;
73    //uint32_t p2 = (uint32_t) key2.integer;
74    UColToken *p1 = (UColToken *)key1.pointer;
75    UColToken *p2 = (UColToken *)key2.pointer;
76    const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
77    const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
78    uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
79    uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
80    const UChar *end = s1+s1L-1;
81
82    if (p1 == p2) {
83        return TRUE;
84    }
85    if (p1->source == 0 || p2->source == 0) {
86        return FALSE;
87    }
88    if(s1L != s2L) {
89        return FALSE;
90    }
91    if(p1->source == p2->source) {
92        return TRUE;
93    }
94    while((s1 < end) && *s1 == *s2) {
95        ++s1;
96        ++s2;
97    }
98    if(*s1 == *s2) {
99        return TRUE;
100    } else {
101        return FALSE;
102    }
103}
104U_CDECL_END
105
106/*
107 * Debug messages used to pinpoint where a format error occurred.
108 * A better way is to include context-sensitive information in syntaxError() function.
109 *
 * To turn this debugging on, either uncomment the following line, or pass
 * -DDEBUG_FOR_FORMAT_ERROR on the compile line.
112 */
113/* #define DEBUG_FOR_FORMAT_ERROR 1 */
114
115#ifdef DEBUG_FOR_FORMAT_ERROR
116#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
117#else
118#define DBG_FORMAT_ERROR
119#endif
120
121
122/*
123 * Controls debug messages so that the output can be compared before and after a
124 * big change.  Prints the information of every code point that comes out of the
125 * collation parser and its strength into a file.  When a big change in format
126 * happens, the files before and after the change should be identical.
127 *
 * To turn this debugging on, either uncomment the following line, or pass
 * -DDEBUG_FOR_CODE_POINTS on the compile line.
130 */
131// #define DEBUG_FOR_CODE_POINTS 1
132
133#ifdef DEBUG_FOR_CODE_POINTS
134    FILE* dfcp_fp = NULL;
135#endif
136
137
/* CE boundaries of one indirect position (see ucolIndirectBoundaries). */
typedef struct {
    uint32_t startCE;      /* first CE of the start boundary */
    uint32_t startContCE;  /* second CE of the start boundary; 0 means the boundary is a single CE */
    uint32_t limitCE;      /* first CE of the limit boundary; 0 when no limit was supplied */
    uint32_t limitContCE;  /* second CE of the limit boundary; 0 when no limit was supplied */
} indirectBoundaries;
144
/* These values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism for allowing resets on symbolic   */
/* values. It only works for resets, and you cannot tailor indirect      */
/* names. An indirect name can define either an anchor point or a range. */
/* An anchor point behaves in exactly the same way as a code point in a  */
/* reset would, except that it cannot be tailored. A range (we currently */
/* only know of the [top] range) explicitly sets the upper bound for     */
/* generated CEs, thus allowing for better control over how many CEs can */
/* be squeezed into the range without a performance penalty.             */
/* In that respect, we use [top] for tailoring of locales that use CJK   */
/* characters. Other indirect values are currently a pure convenience;   */
/* they can be used to ensure that the CEs will always be positioned in  */
/* the same place relative to a point with known properties (e.g. first  */
/* primary ignorable). */
/* Index 0 is used for [top]; ucol_uprv_tok_readAndSetOption() maps the      */
/* [first ...]/[last ...] options (7 suboptions x 2) onto indices 1..14.     */
static indirectBoundaries ucolIndirectBoundaries[15];
160/*
161static indirectBoundaries ucolIndirectBoundaries[11] = {
162{ UCOL_RESET_TOP_VALUE,               0,
163UCOL_NEXT_TOP_VALUE,                0 },
164{ UCOL_FIRST_PRIMARY_IGNORABLE,       0,
1650,                                  0 },
166{ UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
1670,                                  0 },
168{ UCOL_FIRST_SECONDARY_IGNORABLE,     0,
1690,                                  0 },
170{ UCOL_LAST_SECONDARY_IGNORABLE,      0,
1710,                                  0 },
172{ UCOL_FIRST_TERTIARY_IGNORABLE,      0,
1730,                                  0 },
174{ UCOL_LAST_TERTIARY_IGNORABLE,       0,
1750,                                  0 },
176{ UCOL_FIRST_VARIABLE,                0,
1770,                                  0 },
178{ UCOL_LAST_VARIABLE,                 0,
1790,                                  0 },
180{ UCOL_FIRST_NON_VARIABLE,            0,
1810,                                  0 },
182{ UCOL_LAST_NON_VARIABLE,             0,
1830,                                  0 },
184};
185*/
186
187static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
188
189    // Set values for the top - TODO: once we have values for all the indirects, we are going
190    // to initalize here.
191    ucolIndirectBoundaries[indexR].startCE = start[0];
192    ucolIndirectBoundaries[indexR].startContCE = start[1];
193    if(end) {
194        ucolIndirectBoundaries[indexR].limitCE = end[0];
195        ucolIndirectBoundaries[indexR].limitContCE = end[1];
196    } else {
197        ucolIndirectBoundaries[indexR].limitCE = 0;
198        ucolIndirectBoundaries[indexR].limitContCE = 0;
199    }
200}
201
202
203static inline
204void syntaxError(const UChar* rules,
205                 int32_t pos,
206                 int32_t rulesLen,
207                 UParseError* parseError)
208{
209    parseError->offset = pos;
210    parseError->line = 0 ; /* we are not using line numbers */
211
212    // for pre-context
213    int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
214    int32_t stop  = pos;
215
216    u_memcpy(parseError->preContext,rules+start,stop-start);
217    //null terminate the buffer
218    parseError->preContext[stop-start] = 0;
219
220    //for post-context
221    start = pos+1;
222    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
223    rulesLen;
224
225    if(start < stop) {
226        u_memcpy(parseError->postContext,rules+start,stop-start);
227        //null terminate the buffer
228        parseError->postContext[stop-start]= 0;
229    } else {
230        parseError->postContext[0] = 0;
231    }
232}
233
234static
235void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
236    switch(attrib) {
237    case UCOL_HIRAGANA_QUATERNARY_MODE:
238        opts->hiraganaQ = value;
239        break;
240    case UCOL_FRENCH_COLLATION:
241        opts->frenchCollation = value;
242        break;
243    case UCOL_ALTERNATE_HANDLING:
244        opts->alternateHandling = value;
245        break;
246    case UCOL_CASE_FIRST:
247        opts->caseFirst = value;
248        break;
249    case UCOL_CASE_LEVEL:
250        opts->caseLevel = value;
251        break;
252    case UCOL_NORMALIZATION_MODE:
253        opts->normalizationMode = value;
254        break;
255    case UCOL_STRENGTH:
256        opts->strength = value;
257        break;
258    case UCOL_NUMERIC_COLLATION:
259        opts->numericCollation = value;
260        break;
261    case UCOL_ATTRIBUTE_COUNT:
262    default:
263        break;
264    }
265}
266
/* Number of entries in rulesOptions[] (one per option_NN string below). */
#define UTOK_OPTION_COUNT 22

/* Becomes TRUE once ucol_uprv_tok_initData() has run its U_STRING_INIT calls. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */
/* Suboption (argument) names; the third macro argument is the string length. */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted",        7);

U_STRING_DECL(suboption_02, "lower",          5);
U_STRING_DECL(suboption_03, "upper",          5);
U_STRING_DECL(suboption_04, "off",            3);
U_STRING_DECL(suboption_05, "on",             2);
U_STRING_DECL(suboption_06, "1",              1);
U_STRING_DECL(suboption_07, "2",              1);
U_STRING_DECL(suboption_08, "3",              1);
U_STRING_DECL(suboption_09, "4",              1);
U_STRING_DECL(suboption_10, "I",              1);

U_STRING_DECL(suboption_11, "primary",        7);
U_STRING_DECL(suboption_12, "secondary",      9);
U_STRING_DECL(suboption_13, "tertiary",       8);
U_STRING_DECL(suboption_14, "variable",       8);
U_STRING_DECL(suboption_15, "regular",        7);
U_STRING_DECL(suboption_16, "implicit",       8);
U_STRING_DECL(suboption_17, "trailing",       8);


/* Option names. Each must be initialized with the same text and length in */
/* ucol_uprv_tok_initData(). */
U_STRING_DECL(option_00,    "undefined",      9);
U_STRING_DECL(option_01,    "rearrange",      9);
U_STRING_DECL(option_02,    "alternate",      9);
U_STRING_DECL(option_03,    "backwards",      9);
U_STRING_DECL(option_04,    "variable top",  12);
U_STRING_DECL(option_05,    "top",            3);
U_STRING_DECL(option_06,    "normalization", 13);
U_STRING_DECL(option_07,    "caseLevel",      9);
U_STRING_DECL(option_08,    "caseFirst",      9);
U_STRING_DECL(option_09,    "scriptOrder",   11);
U_STRING_DECL(option_10,    "charsetname",   11);
U_STRING_DECL(option_11,    "charset",        7);
U_STRING_DECL(option_12,    "before",         6);
U_STRING_DECL(option_13,    "hiraganaQ",      9);
U_STRING_DECL(option_14,    "strength",       8);
U_STRING_DECL(option_15,    "first",          5);
U_STRING_DECL(option_16,    "last",           4);
U_STRING_DECL(option_17,    "optimize",       8);
U_STRING_DECL(option_18,    "suppressContractions",         20);
U_STRING_DECL(option_19,    "numericOrdering",              15);
U_STRING_DECL(option_20,    "import",         6);
U_STRING_DECL(option_21,    "reorder",         7);
317
318/*
319[last variable] last variable value
320[last primary ignorable] largest CE for primary ignorable
321[last secondary ignorable] largest CE for secondary ignorable
322[last tertiary ignorable] largest CE for tertiary ignorable
323[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
324*/
325
326
/* Suboption tables: each entry is {name, name length, attribute value}. */

/* Arguments of [alternate ...]. */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01,  7, UCOL_SHIFTED}
};

/* Arguments of [caseFirst ...]. */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03,  5, UCOL_UPPER_FIRST},
    {suboption_04,  3, UCOL_OFF},
};

/* Arguments shared by the on/off options. */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* [backwards 2] is the only accepted form; it turns French collation on. */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* Arguments of [before ...]. */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* Arguments of [strength ...]. */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* Arguments of [first ...] and [last ...]. ucol_uprv_tok_readAndSetOption() */
/* uses only the position of the matching entry, not its attribute value.    */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
370
/*
 * Option identifiers. The values are indices into rulesOptions[] (that is
 * what ucol_uprv_tok_readOption() returns), so the order here must match the
 * order of that table.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* Alias for the last option that maps directly onto a collator attribute. */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
} ;
396
/*
 * Table of recognized options, indexed by OptionNumber. Each entry is
 * {name, name length, suboption table, suboption count, attribute}.
 * Options that do not map onto a collator attribute use UCOL_ATTRIBUTE_COUNT.
 * Names are matched as prefixes in table order, so "charsetname" has to come
 * before "charset".
 */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
    /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
    /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
};
421
422static
423int32_t u_strncmpNoCase(const UChar     *s1,
424                        const UChar     *s2,
425                        int32_t     n)
426{
427    if(n > 0) {
428        int32_t rc;
429        for(;;) {
430            rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
431            if(rc != 0 || *s1 == 0 || --n == 0) {
432                return rc;
433            }
434            ++s1;
435            ++s2;
436        }
437    }
438    return 0;
439}
440
441static
442void ucol_uprv_tok_initData() {
443    if(!didInit) {
444        U_STRING_INIT(suboption_00, "non-ignorable", 13);
445        U_STRING_INIT(suboption_01, "shifted",        7);
446
447        U_STRING_INIT(suboption_02, "lower",          5);
448        U_STRING_INIT(suboption_03, "upper",          5);
449        U_STRING_INIT(suboption_04, "off",            3);
450        U_STRING_INIT(suboption_05, "on",             2);
451
452        U_STRING_INIT(suboption_06, "1",              1);
453        U_STRING_INIT(suboption_07, "2",              1);
454        U_STRING_INIT(suboption_08, "3",              1);
455        U_STRING_INIT(suboption_09, "4",              1);
456        U_STRING_INIT(suboption_10, "I",              1);
457
458        U_STRING_INIT(suboption_11, "primary",        7);
459        U_STRING_INIT(suboption_12, "secondary",      9);
460        U_STRING_INIT(suboption_13, "tertiary",       8);
461        U_STRING_INIT(suboption_14, "variable",       8);
462        U_STRING_INIT(suboption_15, "regular",        7);
463        U_STRING_INIT(suboption_16, "implicit",       8);
464        U_STRING_INIT(suboption_17, "trailing",       8);
465
466
467        U_STRING_INIT(option_00, "undefined",      9);
468        U_STRING_INIT(option_01, "rearrange",      9);
469        U_STRING_INIT(option_02, "alternate",      9);
470        U_STRING_INIT(option_03, "backwards",      9);
471        U_STRING_INIT(option_04, "variable top",  12);
472        U_STRING_INIT(option_05, "top",            3);
473        U_STRING_INIT(option_06, "normalization", 13);
474        U_STRING_INIT(option_07, "caseLevel",      9);
475        U_STRING_INIT(option_08, "caseFirst",      9);
476        U_STRING_INIT(option_09, "scriptOrder",   11);
477        U_STRING_INIT(option_10, "charsetname",   11);
478        U_STRING_INIT(option_11, "charset",        7);
479        U_STRING_INIT(option_12, "before",         6);
480        U_STRING_INIT(option_13, "hiraganaQ",      9);
481        U_STRING_INIT(option_14, "strength",       8);
482        U_STRING_INIT(option_15, "first",          5);
483        U_STRING_INIT(option_16, "last",           4);
484        U_STRING_INIT(option_17, "optimize",       8);
485        U_STRING_INIT(option_18, "suppressContractions",         20);
486        U_STRING_INIT(option_19, "numericOrdering",      15);
487        U_STRING_INIT(option_20, "import ",        6);
488        U_STRING_INIT(option_21, "reorder",        7);
489        didInit = TRUE;
490    }
491}
492
493
494// This function reads basic options to set in the runtime collator
495// used by data driven tests. Should not support build time options
496U_CAPI const UChar * U_EXPORT2
497ucol_tok_getNextArgument(const UChar *start, const UChar *end,
498                         UColAttribute *attrib, UColAttributeValue *value,
499                         UErrorCode *status)
500{
501    uint32_t i = 0;
502    int32_t j=0;
503    UBool foundOption = FALSE;
504    const UChar *optionArg = NULL;
505
506    ucol_uprv_tok_initData();
507
508    while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
509        start++;
510    }
511    if(start >= end) {
512        return NULL;
513    }
514    /* skip opening '[' */
515    if(*start == 0x005b) {
516        start++;
517    } else {
518        *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
519        return NULL;
520    }
521
522    while(i < UTOK_OPTION_COUNT) {
523        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
524            foundOption = TRUE;
525            if(end - start > rulesOptions[i].optionLen) {
526                optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
527                while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
528                    optionArg++;
529                }
530            }
531            break;
532        }
533        i++;
534    }
535
536    if(!foundOption) {
537        *status = U_ILLEGAL_ARGUMENT_ERROR;
538        return NULL;
539    }
540
541    if(optionArg) {
542        for(j = 0; j<rulesOptions[i].subSize; j++) {
543            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
544                //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
545                *attrib = rulesOptions[i].attr;
546                *value = rulesOptions[i].subopts[j].attrVal;
547                optionArg += rulesOptions[i].subopts[j].subLen;
548                while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
549                    optionArg++;
550                }
551                if(*optionArg == 0x005d) {
552                    optionArg++;
553                    return optionArg;
554                } else {
555                    *status = U_ILLEGAL_ARGUMENT_ERROR;
556                    return NULL;
557                }
558            }
559        }
560    }
561    *status = U_ILLEGAL_ARGUMENT_ERROR;
562    return NULL;
563}
564
565static
566USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
567    while(*start != 0x005b) { /* advance while we find the first '[' */
568        start++;
569    }
570    // now we need to get a balanced set of '[]'. The problem is that a set can have
571    // many, and *end point to the first closing '['
572    int32_t noOpenBraces = 1;
573    int32_t current = 1; // skip the opening brace
574    while(start+current < end && noOpenBraces != 0) {
575        if(start[current] == 0x005b) {
576            noOpenBraces++;
577        } else if(start[current] == 0x005D) { // closing brace
578            noOpenBraces--;
579        }
580        current++;
581    }
582
583    if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
584        *status = U_ILLEGAL_ARGUMENT_ERROR;
585        return NULL;
586    }
587    return uset_openPattern(start, current, status);
588}
589
590/**
591 * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
592 * @param start Pointer to the start UChar.
593 * @param end Pointer to the last valid pointer beyond which the option will not extend.
594 * @param optionArg Address of the pointer at which the options start (after the option name)
595 * @return The index of the option, or -1 if the option is not valid.
596 */
597static
598int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
599    int32_t i = 0;
600    ucol_uprv_tok_initData();
601
602    while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
603        start++;
604    }
605    while(i < UTOK_OPTION_COUNT) {
606        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
607            if(end - start > rulesOptions[i].optionLen) {
608                *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
609                while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
610                    (*optionArg)++;
611                }
612            }
613            break;
614        }
615        i++;
616    }
617    if(i == UTOK_OPTION_COUNT) {
618        i = -1; // didn't find an option
619    }
620    return i;
621}
622
623
624static
625void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
626    int32_t codeCount = 0;
627    int32_t codeIndex = 0;
628    char conversion[64];
629    int32_t tokenLength = 0;
630    const UChar* space;
631
632    const UChar* current = src->current;
633    const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
634
635    // eat leading whitespace
636    while(current < end && u_isWhitespace(*current)) {
637        current++;
638    }
639
640    while(current < end) {
641        space = u_memchr(current, 0x0020, end - current);
642        space = space == 0 ? end : space;
643        tokenLength = space - current;
644        if (tokenLength < 4) {
645            *status = U_INVALID_FORMAT_ERROR;
646            return;
647        }
648        codeCount++;
649        current += tokenLength;
650        while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
651            ++current;
652        }
653    }
654
655    if (codeCount == 0) {
656        *status = U_INVALID_FORMAT_ERROR;
657    }
658
659    src->reorderCodesLength = codeCount;
660    src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
661    current = src->current;
662
663    // eat leading whitespace
664    while(current < end && u_isWhitespace(*current)) {
665        current++;
666    }
667
668    while(current < end) {
669        space = u_memchr(current, 0x0020, end - current);
670        space = space == 0 ? end : space;
671        tokenLength = space - current;
672        if (tokenLength < 4) {
673            *status = U_ILLEGAL_ARGUMENT_ERROR;
674            return;
675        } else {
676            u_UCharsToChars(current, conversion, tokenLength);
677            conversion[tokenLength] = '\0';
678            src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
679            if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
680                src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
681            }
682            if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
683                *status = U_ILLEGAL_ARGUMENT_ERROR;
684            }
685        }
686        codeIndex++;
687        current += tokenLength;
688        while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
689            ++current;
690        }
691    }
692}
693
694// reads and conforms to various options in rules
695// end is the position of the first closing ']'
696// However, some of the options take an UnicodeSet definition
697// which needs to duplicate the closing ']'
698// for example: '[copy [\uAC00-\uD7FF]]'
699// These options will move end to the second ']' and the
700// caller will set the current to it.
701static
702uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
703    const UChar* start = src->current;
704    int32_t i = 0;
705    int32_t j=0;
706    const UChar *optionArg = NULL;
707
708    uint8_t result = 0;
709
710    start++; /*skip opening '['*/
711    i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
712    if(optionArg) {
713        src->current = optionArg;
714    }
715
716    if(i < 0) {
717        *status = U_ILLEGAL_ARGUMENT_ERROR;
718    } else {
719        int32_t noOpenBraces = 1;
720        switch(i) {
721    case OPTION_ALTERNATE_HANDLING:
722    case OPTION_FRENCH_COLLATION:
723    case OPTION_CASE_LEVEL:
724    case OPTION_CASE_FIRST:
725    case OPTION_NORMALIZATION_MODE:
726    case OPTION_HIRAGANA_QUATERNARY:
727    case OPTION_STRENGTH:
728    case OPTION_NUMERIC_COLLATION:
729        if(optionArg) {
730            for(j = 0; j<rulesOptions[i].subSize; j++) {
731                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
732                    ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
733                    result =  UCOL_TOK_SUCCESS;
734                }
735            }
736        }
737        if(result == 0) {
738            *status = U_ILLEGAL_ARGUMENT_ERROR;
739        }
740        break;
741    case OPTION_VARIABLE_TOP:
742        result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
743        break;
744    case OPTION_REARRANGE:
745        result = UCOL_TOK_SUCCESS;
746        break;
747    case OPTION_BEFORE:
748        if(optionArg) {
749            for(j = 0; j<rulesOptions[i].subSize; j++) {
750                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
751                    result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
752                }
753            }
754        }
755        if(result == 0) {
756            *status = U_ILLEGAL_ARGUMENT_ERROR;
757        }
758        break;
759    case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
760        /* index to this array will be src->parsedToken.indirectIndex*/
761        src->parsedToken.indirectIndex = 0;
762        result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
763        break;
764    case OPTION_FIRST:
765    case OPTION_LAST: /* first, last */
766        for(j = 0; j<rulesOptions[i].subSize; j++) {
767            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
768                // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
769                // element of indirect boundaries is reserved for top.
770                src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
771                result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
772            }
773        }
774        if(result == 0) {
775            *status = U_ILLEGAL_ARGUMENT_ERROR;
776        }
777        break;
778    case OPTION_OPTIMIZE:
779    case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
780        // we need to move end here
781        src->current++; // skip opening brace
782        while(src->current < src->end && noOpenBraces != 0) {
783            if(*src->current == 0x005b) {
784                noOpenBraces++;
785            } else if(*src->current == 0x005D) { // closing brace
786                noOpenBraces--;
787            }
788            src->current++;
789        }
790        result = UCOL_TOK_SUCCESS;
791        break;
792    case OPTION_SCRIPTREORDER:
793        ucol_tok_parseScriptReorder(src, status);
794        break;
795    default:
796        *status = U_UNSUPPORTED_ERROR;
797        break;
798        }
799    }
800    src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
801    return result;
802}
803
804
805inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
806    if (stuff == NULL || len <= 0) {
807        return;
808    }
809    UnicodeString tempStuff(FALSE, stuff, len);
810    if(src->extraCurrent+len >= src->extraEnd) {
811        /* reallocate */
812        if (stuff >= src->source && stuff <= src->end) {
813            // Copy the "stuff" contents into tempStuff's own buffer.
814            // UnicodeString is copy-on-write.
815            if (len > 0) {
816                tempStuff.setCharAt(0, tempStuff[0]);
817            } else {
818                tempStuff.remove();
819            }
820        }
821        UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
822        if(newSrc != NULL) {
823            src->current = newSrc + (src->current - src->source);
824            src->extraCurrent = newSrc + (src->extraCurrent - src->source);
825            src->end = newSrc + (src->end - src->source);
826            src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
827            src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
828            src->source = newSrc;
829        } else {
830            *status = U_MEMORY_ALLOCATION_ERROR;
831            return;
832        }
833    }
834    if(len == 1) {
835        *src->extraCurrent++ = tempStuff[0];
836    } else {
837        u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
838        src->extraCurrent += len;
839    }
840}
841
842inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
843    /*
844    top = TRUE;
845    */
846    UChar buff[5];
847    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
848    buff[0] = 0xFFFE;
849    buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
850    buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
851    if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
852        src->parsedToken.charsLen = 3;
853        ucol_tok_addToExtraCurrent(src, buff, 3, status);
854    } else {
855        buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
856        buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
857        src->parsedToken.charsLen = 5;
858        ucol_tok_addToExtraCurrent(src, buff, 5, status);
859    }
860    return TRUE;
861}
862
863static UBool isCharNewLine(UChar c){
864    switch(c){
865    case 0x000A: /* LF  */
866    case 0x000D: /* CR  */
867    case 0x000C: /* FF  */
868    case 0x0085: /* NEL */
869    case 0x2028: /* LS  */
870    case 0x2029: /* PS  */
871        return TRUE;
872    default:
873        return FALSE;
874    }
875}
876
877/*
878 * This function is called several times when a range is processed.  Each time, the next code point
879 * is processed.
880 * The following variables must be set before calling this function:
881 *   src->currentRangeCp:  The current code point to process.
882 *   src->lastRangeCp: The last code point in the range.
883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
884 */
885static const UChar*
886ucol_tok_processNextCodePointInRange(UColTokenParser *src,
887                                     UErrorCode *status)
888{
889  // Append current code point to source
890  UChar buff[U16_MAX_LENGTH];
891  uint32_t i = 0;
892
893  uint32_t nChars = U16_LENGTH(src->currentRangeCp);
894  src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
895  src->parsedToken.charsLen = nChars;
896
897  U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
898  ucol_tok_addToExtraCurrent(src, buff, nChars, status);
899
900  ++src->currentRangeCp;
901  if (src->currentRangeCp > src->lastRangeCp) {
902    src->inRange = FALSE;
903
904    if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
905      src->isStarred = FALSE;
906    }
907  } else {
908    src->previousCp = src->currentRangeCp;
909  }
910  return src->current;
911}
912
913/*
914 * This function is called several times when a starred list is processed.  Each time, the next code point
915 * in the list is processed.
916 * The following variables must be set before calling this function:
917 *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
918 *   src->lastStarredCharIndex: Index to the last character in the list.
919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
920 */
921static const UChar*
922ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
923{
924  // Extract the characters corresponding to the next code point.
925  UChar32 cp;
926  src->parsedToken.charsOffset = src->currentStarredCharIndex;
927  int32_t prev = src->currentStarredCharIndex;
928  U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
929  src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
930
931  // When we are done parsing the starred string, turn the flag off so that
932  // the normal processing is restored.
933  if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
934    src->isStarred = FALSE;
935  }
936  src->previousCp = cp;
937  return src->current;
938}
939
940/*
941 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
942 *
943 * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
944 *  # : Comment character
945 *  & : Reset operator
946 *  = : Equality
947 *  < : Primary collation
948 *  << : Secondary collation
949 *  <<< : Tertiary collation
950 *  ; : Secondary collation
951 *  , : Tertiary collation
952 *  / : Expansions
953 *  | : Prefix
954 *  - : Range
955
956 *  ! : Java Thai modifier, ignored
957 *  @ : French only
958
959 * [] : Options
960 * '' : Quotes
961 *
962 *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
963 *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
964 *  This function do not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as three tokens - "&a",
965 *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
966 *  character returned as cached so that the calling program can do further splitting.
967 */
968static const UChar*
969ucol_tok_parseNextTokenInternal(UColTokenParser *src,
970                                UBool startOfRules,
971                                UParseError *parseError,
972                                UErrorCode *status)
973{
974    UBool variableTop = FALSE;
975    UBool top = FALSE;
976    UBool inChars = TRUE;
977    UBool inQuote = FALSE;
978    UBool wasInQuote = FALSE;
979    uint8_t before = 0;
980    UBool isEscaped = FALSE;
981
982    // TODO: replace these variables with src->parsedToken counterparts
983    // no need to use them anymore since we have src->parsedToken.
984    // Ideally, token parser would be a nice class... Once, when I have
985    // more time (around 2020 probably).
986    uint32_t newExtensionLen = 0;
987    uint32_t extensionOffset = 0;
988    uint32_t newStrength = UCOL_TOK_UNSET;
989    UChar buff[10];
990
991    src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
992    src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
993    src->parsedToken.indirectIndex = 0;
994
995    while (src->current < src->end) {
996        UChar ch = *(src->current);
997
998        if (inQuote) {
999            if (ch == 0x0027/*'\''*/) {
1000                inQuote = FALSE;
1001            } else {
1002                if ((src->parsedToken.charsLen == 0) || inChars) {
1003                    if(src->parsedToken.charsLen == 0) {
1004                        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1005                    }
1006                    src->parsedToken.charsLen++;
1007                } else {
1008                    if(newExtensionLen == 0) {
1009                        extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1010                    }
1011                    newExtensionLen++;
1012                }
1013            }
1014        }else if(isEscaped){
1015            isEscaped =FALSE;
1016            if (newStrength == UCOL_TOK_UNSET) {
1017                *status = U_INVALID_FORMAT_ERROR;
1018                syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1019                DBG_FORMAT_ERROR
1020                return NULL;
1021                // enabling rules to start with non-tokens a < b
1022                // newStrength = UCOL_TOK_RESET;
1023            }
1024            if(ch != 0x0000  && src->current != src->end) {
1025                if (inChars) {
1026                    if(src->parsedToken.charsLen == 0) {
1027                        src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1028                    }
1029                    src->parsedToken.charsLen++;
1030                } else {
1031                    if(newExtensionLen == 0) {
1032                        extensionOffset = (uint32_t)(src->current - src->source);
1033                    }
1034                    newExtensionLen++;
1035                }
1036            }
1037        }else {
1038            if(!PatternProps::isWhiteSpace(ch)) {
1039                /* Sets the strength for this entry */
1040                switch (ch) {
1041                case 0x003D/*'='*/ :
1042                    if (newStrength != UCOL_TOK_UNSET) {
1043                        goto EndOfLoop;
1044                    }
1045
1046                    /* if we start with strength, we'll reset to top */
1047                    if(startOfRules == TRUE) {
1048                        src->parsedToken.indirectIndex = 5;
1049                        top = ucol_tok_doSetTop(src, status);
1050                        newStrength = UCOL_TOK_RESET;
1051                        goto EndOfLoop;
1052                    }
1053                    newStrength = UCOL_IDENTICAL;
1054                    if(*(src->current+1) == 0x002A) {/*'*'*/
1055                        src->current++;
1056                        src->isStarred = TRUE;
1057                    }
1058                    break;
1059
1060                case 0x002C/*','*/:
1061                    if (newStrength != UCOL_TOK_UNSET) {
1062                        goto EndOfLoop;
1063                    }
1064
1065                    /* if we start with strength, we'll reset to top */
1066                    if(startOfRules == TRUE) {
1067                        src->parsedToken.indirectIndex = 5;
1068                        top = ucol_tok_doSetTop(src, status);
1069                        newStrength = UCOL_TOK_RESET;
1070                        goto EndOfLoop;
1071                    }
1072                    newStrength = UCOL_TERTIARY;
1073                    break;
1074
1075                case  0x003B/*';'*/:
1076                    if (newStrength != UCOL_TOK_UNSET) {
1077                        goto EndOfLoop;
1078                    }
1079
1080                    /* if we start with strength, we'll reset to top */
1081                    if(startOfRules == TRUE) {
1082                        src->parsedToken.indirectIndex = 5;
1083                        top = ucol_tok_doSetTop(src, status);
1084                        newStrength = UCOL_TOK_RESET;
1085                        goto EndOfLoop;
1086                    }
1087                    newStrength = UCOL_SECONDARY;
1088                    break;
1089
1090                case 0x003C/*'<'*/:
1091                    if (newStrength != UCOL_TOK_UNSET) {
1092                        goto EndOfLoop;
1093                    }
1094
1095                    /* if we start with strength, we'll reset to top */
1096                    if(startOfRules == TRUE) {
1097                        src->parsedToken.indirectIndex = 5;
1098                        top = ucol_tok_doSetTop(src, status);
1099                        newStrength = UCOL_TOK_RESET;
1100                        goto EndOfLoop;
1101                    }
1102                    /* before this, do a scan to verify whether this is */
1103                    /* another strength */
1104                    if(*(src->current+1) == 0x003C) {
1105                        src->current++;
1106                        if(*(src->current+1) == 0x003C) {
1107                            src->current++; /* three in a row! */
1108                            newStrength = UCOL_TERTIARY;
1109                        } else { /* two in a row */
1110                            newStrength = UCOL_SECONDARY;
1111                        }
1112                    } else { /* just one */
1113                        newStrength = UCOL_PRIMARY;
1114                    }
1115                    if(*(src->current+1) == 0x002A) {/*'*'*/
1116                        src->current++;
1117                        src->isStarred = TRUE;
1118                    }
1119                    break;
1120
1121                case 0x0026/*'&'*/:
1122                    if (newStrength != UCOL_TOK_UNSET) {
1123                        /**/
1124                        goto EndOfLoop;
1125                    }
1126
1127                    newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
1128                    break;
1129
1130                case 0x005b/*'['*/:
1131                    /* options - read an option, analyze it */
1132                    if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
1133                        uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
1134                        if(U_SUCCESS(*status)) {
1135                            if(result & UCOL_TOK_TOP) {
1136                                if(newStrength == UCOL_TOK_RESET) {
1137                                    top = ucol_tok_doSetTop(src, status);
1138                                    if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1139                                        src->parsedToken.charsLen+=2;
1140                                        buff[0] = 0x002d;
1141                                        buff[1] = before;
1142                                        ucol_tok_addToExtraCurrent(src, buff, 2, status);
1143                                    }
1144
1145                                    src->current++;
1146                                    goto EndOfLoop;
1147                                } else {
1148                                    *status = U_INVALID_FORMAT_ERROR;
1149                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1150                                    DBG_FORMAT_ERROR
1151                                }
1152                            } else if(result & UCOL_TOK_VARIABLE_TOP) {
1153                                if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
1154                                    variableTop = TRUE;
1155                                    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1156                                    src->parsedToken.charsLen = 1;
1157                                    buff[0] = 0xFFFF;
1158                                    ucol_tok_addToExtraCurrent(src, buff, 1, status);
1159                                    src->current++;
1160                                    goto EndOfLoop;
1161                                } else {
1162                                    *status = U_INVALID_FORMAT_ERROR;
1163                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1164                                    DBG_FORMAT_ERROR
1165                                }
1166                            } else if (result & UCOL_TOK_BEFORE){
1167                                if(newStrength == UCOL_TOK_RESET) {
1168                                    before = result & UCOL_TOK_BEFORE;
1169                                } else {
1170                                    *status = U_INVALID_FORMAT_ERROR;
1171                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1172                                    DBG_FORMAT_ERROR
1173                                }
1174                            }
1175                        } else {
1176                            *status = U_INVALID_FORMAT_ERROR;
1177                            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1178                            DBG_FORMAT_ERROR
1179                            return NULL;
1180                        }
1181                    }
1182                    break;
1183                case 0x0021/*! skip java thai modifier reordering*/:
1184                    break;
1185                case 0x002F/*'/'*/:
1186                    wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
1187                    inChars = FALSE; /* we're now processing expansion */
1188                    break;
1189                case 0x005C /* back slash for escaped chars */:
1190                    isEscaped = TRUE;
1191                    break;
1192                    /* found a quote, we're gonna start copying */
1193                case 0x0027/*'\''*/:
1194                    if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
1195                      *status = U_INVALID_FORMAT_ERROR;
1196                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1197                      DBG_FORMAT_ERROR
1198                      return NULL;
1199                      // enabling rules to start with a non-token character a < b
1200                      // newStrength = UCOL_TOK_RESET;
1201                    }
1202
1203                    inQuote = TRUE;
1204
1205                    if(inChars) { /* we're doing characters */
1206                        if(wasInQuote == FALSE) {
1207                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1208                        }
1209                        if (src->parsedToken.charsLen != 0) {
1210                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1211                        }
1212                        src->parsedToken.charsLen++;
1213                    } else { /* we're doing an expansion */
1214                        if(wasInQuote == FALSE) {
1215                            extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1216                        }
1217                        if (newExtensionLen != 0) {
1218                            ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
1219                        }
1220                        newExtensionLen++;
1221                    }
1222
1223                    wasInQuote = TRUE;
1224
1225                    ch = *(++(src->current));
1226                    if(ch == 0x0027) { /* copy the double quote */
1227                        ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1228                        inQuote = FALSE;
1229                    }
1230                    break;
1231
1232                    /* '@' is french only if the strength is not currently set */
1233                    /* if it is, it's just a regular character in collation rules */
1234                case 0x0040/*'@'*/:
1235                    if (newStrength == UCOL_TOK_UNSET) {
1236                        src->opts->frenchCollation = UCOL_ON;
1237                        break;
1238                    }
1239
1240                case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1241                    // we want to store read characters to the prefix part and continue reading
1242                    // the characters (proper way would be to restart reading the chars, but in
1243                    // that case we would have to complicate the token hasher, which I do not
1244                    // intend to play with. Instead, we will do prefixes when prefixes are due
1245                    // (before adding the elements).
1246                    src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1247                    src->parsedToken.prefixLen = src->parsedToken.charsLen;
1248
1249                    if(inChars) { /* we're doing characters */
1250                        if(wasInQuote == FALSE) {
1251                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1252                        }
1253                        if (src->parsedToken.charsLen != 0) {
1254                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1255                        }
1256                        src->parsedToken.charsLen++;
1257                    }
1258
1259                    wasInQuote = TRUE;
1260
1261                    do {
1262                        ch = *(++(src->current));
1263                        // skip whitespace between '|' and the character
1264                    } while (PatternProps::isWhiteSpace(ch));
1265                    break;
1266
1267                    //charsOffset = 0;
1268                    //newCharsLen = 0;
1269                    //break; // We want to store the whole prefix/character sequence. If we break
1270                    // the '|' is going to get lost.
1271
1272                case 0x002D /*-*/: /* A range. */
1273                    if (newStrength != UCOL_TOK_UNSET) {
1274                      // While processing the pending token, the isStarred field
1275                      // is reset, so it needs to be saved for the next
1276                      // invocation.
1277                      src->savedIsStarred = src->isStarred;
1278                      goto EndOfLoop;
1279                   }
1280                   src->isStarred = src->savedIsStarred;
1281
1282                   // Ranges are valid only in starred tokens.
1283                   if (!src->isStarred) {
1284                     *status = U_INVALID_FORMAT_ERROR;
1285                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1286                     DBG_FORMAT_ERROR
1287                     return NULL;
1288                   }
1289                   newStrength = src->parsedToken.strength;
1290                   src->inRange = TRUE;
1291                   break;
1292
1293                case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1294                    do {
1295                        ch = *(++(src->current));
1296                    } while (!isCharNewLine(ch));
1297
1298                    break;
1299                default:
1300                    if (newStrength == UCOL_TOK_UNSET) {
1301                      *status = U_INVALID_FORMAT_ERROR;
1302                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1303                      DBG_FORMAT_ERROR
1304                      return NULL;
1305                    }
1306
1307                    if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1308                        *status = U_INVALID_FORMAT_ERROR;
1309                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1310                        DBG_FORMAT_ERROR
1311                        return NULL;
1312                    }
1313
1314                    if(ch == 0x0000 && src->current+1 == src->end) {
1315                        break;
1316                    }
1317
1318                    if (inChars) {
1319                        if(src->parsedToken.charsLen == 0) {
1320                            src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1321                        }
1322                        src->parsedToken.charsLen++;
1323                    } else {
1324                        if(newExtensionLen == 0) {
1325                            extensionOffset = (uint32_t)(src->current - src->source);
1326                        }
1327                        newExtensionLen++;
1328                    }
1329
1330                    break;
1331                }
1332            }
1333        }
1334
1335        if(wasInQuote) {
1336            if(ch != 0x27) {
1337                if(inQuote || !PatternProps::isWhiteSpace(ch)) {
1338                    ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1339                }
1340            }
1341        }
1342
1343        src->current++;
1344    }
1345
1346EndOfLoop:
1347    wasInQuote = FALSE;
1348    if (newStrength == UCOL_TOK_UNSET) {
1349        return NULL;
1350    }
1351
1352    if (src->parsedToken.charsLen == 0 && top == FALSE) {
1353        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1354        *status = U_INVALID_FORMAT_ERROR;
1355        DBG_FORMAT_ERROR
1356        return NULL;
1357    }
1358
1359    src->parsedToken.strength = newStrength;
1360    src->parsedToken.extensionOffset = extensionOffset;
1361    src->parsedToken.extensionLen = newExtensionLen;
1362    src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1363
1364    return src->current;
1365}
1366
1367/*
1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1370 *
1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1372 *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
1373 *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
1374 *     cached as member variables of the token parser.
1375 *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1376 *     starting character as a single list token (which is separated into individual characters here)
1377 *     and as another list token starting with the last character in the range.  Before expanding it
1378 *     as a list of tokens, this function expands the range by filling the intermediate characters and
1379 *     returns them one by one as separate tokens.
1380 * Necessary checks are done for invalid combinations.
1381 */
1382U_CAPI const UChar* U_EXPORT2
1383ucol_tok_parseNextToken(UColTokenParser *src,
1384                        UBool startOfRules,
1385                        UParseError *parseError,
1386                        UErrorCode *status)
1387{
1388  const UChar *nextToken;
1389
1390  if (src->inRange) {
1391    // We are not done processing a range.  Continue it.
1392    return ucol_tok_processNextCodePointInRange(src, status);
1393  } else if (src->isStarred) {
1394    // We are not done processing a starred token.  Continue it.
1395    return ucol_tok_processNextTokenInStarredList(src);
1396  }
1397
1398  // Get the next token.
1399  nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
1400
1401  if (nextToken == NULL) {
1402    return NULL;
1403  }
1404
1405  if (src->inRange) {
1406    // A new range has started.
1407    // Check whether it is a chain of ranges with more than one hyphen.
1408    if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
1409        *status = U_INVALID_FORMAT_ERROR;
1410        syntaxError(src->source,src->parsedToken.charsOffset-1,
1411                    src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
1412        DBG_FORMAT_ERROR
1413        return NULL;
1414    }
1415
1416    // The current token indicates the second code point of the range.
1417    // Process just that, and then proceed with the star.
1418    src->currentStarredCharIndex = src->parsedToken.charsOffset;
1419    U16_NEXT(src->source, src->currentStarredCharIndex,
1420             (uint32_t)(src->end - src->source), src->lastRangeCp);
1421    if (src->lastRangeCp <= src->previousCp) {
1422        *status = U_INVALID_FORMAT_ERROR;
1423        syntaxError(src->source,src->parsedToken.charsOffset-1,
1424                    src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1425        DBG_FORMAT_ERROR
1426        return NULL;
1427    }
1428
1429    // Set current range code point to process the range loop
1430    src->currentRangeCp = src->previousCp + 1;
1431
1432    src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1433
1434    return ucol_tok_processNextCodePointInRange(src, status);
1435 } else if (src->isStarred) {
1436    // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
1437    // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
1438    // separated into several tokens and returned.
1439    src->currentStarredCharIndex = src->parsedToken.charsOffset;
1440    src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1441
1442    return ucol_tok_processNextTokenInStarredList(src);
1443  } else {
1444    // Set previous codepoint
1445    U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
1446  }
1447  return nextToken;
1448}
1449
1450
1451/*
1452Processing Description
14531 Build a ListList. Each list has a header, which contains two lists (positive
1454and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1455reset may be null.
14562 As you process, you keep a LAST pointer that points to the last token you
1457handled.
1458
1459*/
1460
1461static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
1462                                      UParseError *parseError, UErrorCode *status)
1463{
1464    if(src->resultLen == src->listCapacity) {
1465        // Unfortunately, this won't work, as we store addresses of lhs in token
1466        src->listCapacity *= 2;
1467        src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1468        if(src->lh == NULL) {
1469            *status = U_MEMORY_ALLOCATION_ERROR;
1470            return NULL;
1471        }
1472    }
1473    /* do the reset thing */
1474    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1475    /* test for NULL */
1476    if (sourceToken == NULL) {
1477        *status = U_MEMORY_ALLOCATION_ERROR;
1478        return NULL;
1479    }
1480    sourceToken->rulesToParseHdl = &(src->source);
1481    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1482    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1483
1484    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1485    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1486
1487    // keep the flags around so that we know about before
1488    sourceToken->flags = src->parsedToken.flags;
1489
1490    if(src->parsedToken.prefixOffset != 0) {
1491        // this is a syntax error
1492        *status = U_INVALID_FORMAT_ERROR;
1493        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1494        DBG_FORMAT_ERROR
1495        uprv_free(sourceToken);
1496        return 0;
1497    } else {
1498        sourceToken->prefix = 0;
1499    }
1500
1501    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1502    sourceToken->strength = UCOL_TOK_RESET;
1503    sourceToken->next = NULL;
1504    sourceToken->previous = NULL;
1505    sourceToken->noOfCEs = 0;
1506    sourceToken->noOfExpCEs = 0;
1507    sourceToken->listHeader = &src->lh[src->resultLen];
1508
1509    src->lh[src->resultLen].first = NULL;
1510    src->lh[src->resultLen].last = NULL;
1511    src->lh[src->resultLen].first = NULL;
1512    src->lh[src->resultLen].last = NULL;
1513
1514    src->lh[src->resultLen].reset = sourceToken;
1515
1516    /*
1517    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1518    First convert all expansions into normal form. Examples:
1519    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1520    d * ... into &x * c/y * d * ...
1521    Note: reset values can never have expansions, although they can cause the
1522    very next item to have one. They may be contractions, if they are found
1523    earlier in the list.
1524    */
1525    *expandNext = 0;
1526    if(expand != NULL) {
1527        /* check to see if there is an expansion */
1528        if(src->parsedToken.charsLen > 1) {
1529            uint32_t resetCharsOffset;
1530            resetCharsOffset = (uint32_t)(expand - src->source);
1531            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1532            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1533        }
1534    }
1535
1536    src->resultLen++;
1537
1538    uhash_put(src->tailored, sourceToken, sourceToken, status);
1539
1540    return sourceToken;
1541}
1542
1543static
1544inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1545    if(U_FAILURE(*status)) {
1546        return NULL;
1547    }
1548    /* this is a virgin before - we need to fish the anchor from the UCA */
1549    collIterate s;
1550    uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1551    uint32_t CE, SecondCE;
1552    // uint32_t invPos;
1553    if(sourceToken != NULL) {
1554        uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
1555    } else {
1556        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
1557    }
1558    if(U_FAILURE(*status)) {
1559        return NULL;
1560    }
1561
1562    baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1563    baseContCE = ucol_getNextCE(src->UCA, &s, status);
1564    if(baseContCE == UCOL_NO_MORE_CES) {
1565        baseContCE = 0;
1566    }
1567
1568
1569    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1570    uint32_t ch = 0;
1571    uint32_t expandNext = 0;
1572    UColToken key;
1573
1574    if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1575        uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
1576        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1577        ch = uprv_uca_getCodePointFromRaw(raw-1);
1578        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1579        CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
1580        SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
1581
1582        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1583        *src->extraCurrent++ = 0xFFFE;
1584        *src->extraCurrent++ = (UChar)ch;
1585        src->parsedToken.charsLen++;
1586
1587        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1588        key.rulesToParseHdl = &(src->source);
1589
1590        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1591        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1592
1593        if(sourceToken == NULL) {
1594            src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1595            if(isContinuation(SecondCE)) {
1596                src->lh[src->resultLen].baseContCE = SecondCE;
1597            } else {
1598                src->lh[src->resultLen].baseContCE = 0;
1599            }
1600            src->lh[src->resultLen].nextCE = 0;
1601            src->lh[src->resultLen].nextContCE = 0;
1602            src->lh[src->resultLen].previousCE = 0;
1603            src->lh[src->resultLen].previousContCE = 0;
1604
1605            src->lh[src->resultLen].indirect = FALSE;
1606
1607            sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1608        }
1609
1610    } else {
1611        /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1612
1613        // we got the previous CE. Now we need to see if the difference between
1614        // the two CEs is really of the requested strength.
1615        // if it's a bigger difference (we asked for secondary and got primary), we
1616        // need to modify the CE.
1617        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1618            // adjust the strength
1619            // now we are in the situation where our baseCE should actually be modified in
1620            // order to get the CE in the right position.
1621            if(strength == UCOL_SECONDARY) {
1622                CE = baseCE - 0x0200;
1623            } else { // strength == UCOL_TERTIARY
1624                CE = baseCE - 0x02;
1625            }
1626            if(baseContCE) {
1627                if(strength == UCOL_SECONDARY) {
1628                    SecondCE = baseContCE - 0x0200;
1629                } else { // strength == UCOL_TERTIARY
1630                    SecondCE = baseContCE - 0x02;
1631                }
1632            }
1633        }
1634
1635#if 0
1636        // the code below relies on getting a code point from the inverse table, in order to be
1637        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1638        // 1. There are many code points that have the same CE
1639        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1640        // Also, in case when there is no equivalent strength before an element, we have to actually
1641        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1642        // before a is a primary difference.
1643
1644        //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1645
1646
1647        ch = CETable[3*invPos+2];
1648
1649        if((ch &  UCOL_INV_SIZEMASK) != 0) {
1650            uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1651            uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1652            ch = conts[offset];
1653        }
1654
1655        *src->extraCurrent++ = (UChar)ch;
1656        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1657        src->parsedToken.charsLen = 1;
1658
1659        // We got an UCA before. However, this might have been tailored.
1660        // example:
1661        // &\u30ca = \u306a
1662        // &[before 3]\u306a<<<\u306a|\u309d
1663
1664
1665        // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1666        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1667        key.rulesToParseHdl = &(src->source);
1668
1669        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1670        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1671#endif
1672
1673        // here is how it should be. The situation such as &[before 1]a < x, should be
1674        // resolved exactly as if we wrote &a > x.
1675        // therefore, I don't really care if the UCA value before a has been changed.
1676        // However, I do care if the strength between my element and the previous element
1677        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1678        // have to construct the base CE.
1679
1680
1681
1682        // if we found a tailored thing, we have to use the UCA value and construct
1683        // a new reset token with constructed name
1684        //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1685        // character to which we want to anchor is already tailored.
1686        // We need to construct a new token which will be the anchor
1687        // point
1688        //*(src->extraCurrent-1) = 0xFFFE;
1689        //*src->extraCurrent++ = (UChar)ch;
1690        // grab before
1691        src->parsedToken.charsOffset -= 10;
1692        src->parsedToken.charsLen += 10;
1693        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1694        if(isContinuation(SecondCE)) {
1695            src->lh[src->resultLen].baseContCE = SecondCE;
1696        } else {
1697            src->lh[src->resultLen].baseContCE = 0;
1698        }
1699        src->lh[src->resultLen].nextCE = 0;
1700        src->lh[src->resultLen].nextContCE = 0;
1701        src->lh[src->resultLen].previousCE = 0;
1702        src->lh[src->resultLen].previousContCE = 0;
1703
1704        src->lh[src->resultLen].indirect = FALSE;
1705
1706        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1707        //}
1708    }
1709
1710    return sourceToken;
1711
1712}
1713
1714uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1715    UColToken *lastToken = NULL;
1716    const UChar *parseEnd = NULL;
1717    uint32_t expandNext = 0;
1718    UBool variableTop = FALSE;
1719    UBool top = FALSE;
1720    uint16_t specs = 0;
1721    UColTokListHeader *ListList = NULL;
1722
1723    src->parsedToken.strength = UCOL_TOK_UNSET;
1724
1725    ListList = src->lh;
1726
1727    if(U_FAILURE(*status)) {
1728        return 0;
1729    }
1730#ifdef DEBUG_FOR_CODE_POINTS
1731    char filename[35];
1732    sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
1733    dfcp_fp = fopen(filename, "a");
1734    fprintf(stdout, "Output is in the file %s.\n", filename);
1735#endif
1736
1737#ifdef DEBUG_FOR_COLL_RULES
1738    std::string s3;
1739    UnicodeString(src->source).toUTF8String(s3);
1740    std::cout << "src->source = " << s3 << std::endl;
1741#endif
1742
1743    while(src->current < src->end || src->isStarred) {
1744        src->parsedToken.prefixOffset = 0;
1745
1746        parseEnd = ucol_tok_parseNextToken(src,
1747            (UBool)(lastToken == NULL),
1748            parseError,
1749            status);
1750
1751        specs = src->parsedToken.flags;
1752
1753
1754        variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1755        top = ((specs & UCOL_TOK_TOP) != 0);
1756
1757        if(U_SUCCESS(*status) && parseEnd != NULL) {
1758            UColToken *sourceToken = NULL;
1759            //uint32_t key = 0;
1760            uint32_t lastStrength = UCOL_TOK_UNSET;
1761
1762            if(lastToken != NULL ) {
1763                lastStrength = lastToken->strength;
1764            }
1765
1766#ifdef DEBUG_FOR_CODE_POINTS
1767            UChar32 cp;
1768            U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
1769            fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
1770#endif
1771            //key = newCharsLen << 24 | charsOffset;
1772            UColToken key;
1773            key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1774            key.rulesToParseHdl = &(src->source);
1775
1776            /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1777            sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1778
1779            if(src->parsedToken.strength != UCOL_TOK_RESET) {
1780                if(lastToken == NULL) { /* this means that rules haven't started properly */
1781                    *status = U_INVALID_FORMAT_ERROR;
1782                    syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1783                    DBG_FORMAT_ERROR
1784                    return 0;
1785                }
1786                /*  6 Otherwise (when relation != reset) */
1787                if(sourceToken == NULL) {
1788                    /* If sourceToken is null, create new one, */
1789                    sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1790                    /* test for NULL */
1791                    if (sourceToken == NULL) {
1792                        *status = U_MEMORY_ALLOCATION_ERROR;
1793                        return 0;
1794                    }
1795                    sourceToken->rulesToParseHdl = &(src->source);
1796                    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1797
1798                    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1799
1800                    sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1801                    sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1802
1803                    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1804                    sourceToken->next = NULL;
1805                    sourceToken->previous = NULL;
1806                    sourceToken->noOfCEs = 0;
1807                    sourceToken->noOfExpCEs = 0;
1808                    // keep the flags around so that we know about before
1809                    sourceToken->flags = src->parsedToken.flags;
1810                    uhash_put(src->tailored, sourceToken, sourceToken, status);
1811                    if(U_FAILURE(*status)) {
1812                        return 0;
1813                    }
1814                } else {
1815                    /* we could have fished out a reset here */
1816                    if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1817                        /* otherwise remove sourceToken from where it was. */
1818                        if(sourceToken->next != NULL) {
1819                            if(sourceToken->next->strength > sourceToken->strength) {
1820                                sourceToken->next->strength = sourceToken->strength;
1821                            }
1822                            sourceToken->next->previous = sourceToken->previous;
1823                        } else {
1824                            sourceToken->listHeader->last = sourceToken->previous;
1825                        }
1826
1827                        if(sourceToken->previous != NULL) {
1828                            sourceToken->previous->next = sourceToken->next;
1829                        } else {
1830                            sourceToken->listHeader->first = sourceToken->next;
1831                        }
1832                        sourceToken->next = NULL;
1833                        sourceToken->previous = NULL;
1834                    }
1835                }
1836
1837                sourceToken->strength = src->parsedToken.strength;
1838                sourceToken->listHeader = lastToken->listHeader;
1839
1840                /*
1841                1.  Find the strongest strength in each list, and set strongestP and strongestN
1842                accordingly in the headers.
1843                */
1844                if(lastStrength == UCOL_TOK_RESET
1845                    || sourceToken->listHeader->first == 0) {
1846                        /* If LAST is a reset
1847                        insert sourceToken in the list. */
1848                        if(sourceToken->listHeader->first == 0) {
1849                            sourceToken->listHeader->first = sourceToken;
1850                            sourceToken->listHeader->last = sourceToken;
1851                        } else { /* we need to find a place for us */
1852                            /* and we'll get in front of the same strength */
1853                            if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1854                                sourceToken->next = sourceToken->listHeader->first;
1855                                sourceToken->next->previous = sourceToken;
1856                                sourceToken->listHeader->first = sourceToken;
1857                                sourceToken->previous = NULL;
1858                            } else {
1859                                lastToken = sourceToken->listHeader->first;
1860                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1861                                    lastToken = lastToken->next;
1862                                }
1863                                if(lastToken->next != NULL) {
1864                                    lastToken->next->previous = sourceToken;
1865                                } else {
1866                                    sourceToken->listHeader->last = sourceToken;
1867                                }
1868                                sourceToken->previous = lastToken;
1869                                sourceToken->next = lastToken->next;
1870                                lastToken->next = sourceToken;
1871                            }
1872                        }
1873                    } else {
1874                        /* Otherwise (when LAST is not a reset)
1875                        if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1876                        otherwise insert before.
1877                        when inserting after or before, search to the next position with the same
1878                        strength in that direction. (This is called postpone insertion).         */
1879                        if(sourceToken != lastToken) {
1880                            if(lastToken->polarity == sourceToken->polarity) {
1881                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1882                                    lastToken = lastToken->next;
1883                                }
1884                                sourceToken->previous = lastToken;
1885                                if(lastToken->next != NULL) {
1886                                    lastToken->next->previous = sourceToken;
1887                                } else {
1888                                    sourceToken->listHeader->last = sourceToken;
1889                                }
1890
1891                                sourceToken->next = lastToken->next;
1892                                lastToken->next = sourceToken;
1893                            } else {
1894                                while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1895                                    lastToken = lastToken->previous;
1896                                }
1897                                sourceToken->next = lastToken;
1898                                if(lastToken->previous != NULL) {
1899                                    lastToken->previous->next = sourceToken;
1900                                } else {
1901                                    sourceToken->listHeader->first = sourceToken;
1902                                }
1903                                sourceToken->previous = lastToken->previous;
1904                                lastToken->previous = sourceToken;
1905                            }
1906                        } else { /* repeated one thing twice in rules, stay with the stronger strength */
1907                            if(lastStrength < sourceToken->strength) {
1908                                sourceToken->strength = lastStrength;
1909                            }
1910                        }
1911                    }
1912
1913                    /* if the token was a variable top, we're gonna put it in */
1914                    if(variableTop == TRUE && src->varTop == NULL) {
1915                        variableTop = FALSE;
1916                        src->varTop = sourceToken;
1917                    }
1918
1919                    // Treat the expansions.
1920                    // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1921                    // (&abc * d * e <=> &ab * d / c * e / c)
1922                    // if both of them are in effect for a token, they are combined.
1923
1924                    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1925
1926                    if(expandNext != 0) {
1927                        if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1928                            expandNext = 0;
1929                        } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1930                            sourceToken->expansion = expandNext;
1931                        } else { /* there is both explicit and implicit expansion. We need to make a combination */
1932                            uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1933                            uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1934                            sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1935                            src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1936                        }
1937                    }
1938
1939                    // This is just for debugging purposes
1940                    if(sourceToken->expansion != 0) {
1941                        sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1942                    } else {
1943                        sourceToken->debugExpansion = 0;
1944                    }
1945                    // if the previous token was a reset before, the strength of this
1946                    // token must match the strength of before. Otherwise we have an
1947                    // undefined situation.
1948                    // In other words, we currently have a cludge which we use to
1949                    // represent &a >> x. This is written as &[before 2]a << x.
1950                    if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1951                        uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1952                        if(beforeStrength != sourceToken->strength) {
1953                            *status = U_INVALID_FORMAT_ERROR;
1954                            syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1955                            DBG_FORMAT_ERROR
1956                            return 0;
1957                        }
1958                    }
1959            } else {
1960                if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1961                    /* if the previous token was also a reset, */
1962                    /*this means that we have two consecutive resets */
1963                    /* and we want to remove the previous one if empty*/
1964                    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1965                        src->resultLen--;
1966                    }
1967                }
1968
1969                if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1970                    uint32_t searchCharsLen = src->parsedToken.charsLen;
1971                    while(searchCharsLen > 1 && sourceToken == NULL) {
1972                        searchCharsLen--;
1973                        //key = searchCharsLen << 24 | charsOffset;
1974                        UColToken key;
1975                        key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1976                        key.rulesToParseHdl = &(src->source);
1977                        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1978                    }
1979                    if(sourceToken != NULL) {
1980                        expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1981                    }
1982                }
1983
1984                if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1985                    if(top == FALSE) { /* there is no indirection */
1986                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1987                        if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1988                            /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1989                            while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1990                                sourceToken = sourceToken->previous;
1991                            }
1992                            /* here, either we hit the strength or NULL */
1993                            if(sourceToken->strength == strength) {
1994                                if(sourceToken->previous != NULL) {
1995                                    sourceToken = sourceToken->previous;
1996                                } else { /* start of list */
1997                                    sourceToken = sourceToken->listHeader->reset;
1998                                }
1999                            } else { /* we hit NULL */
2000                                /* we should be doing the else part */
2001                                sourceToken = sourceToken->listHeader->reset;
2002                                sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2003                            }
2004                        } else {
2005                            sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2006                        }
2007                    } else { /* this is both before and indirection */
2008                        top = FALSE;
2009                        ListList[src->resultLen].previousCE = 0;
2010                        ListList[src->resultLen].previousContCE = 0;
2011                        ListList[src->resultLen].indirect = TRUE;
2012                        /* we need to do slightly more work. we need to get the baseCE using the */
2013                        /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2014                        /* in ucol_bld */
2015                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
2016                        uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2017                        uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
2018                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2019
2020                        UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2021                        if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
2022                           (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
2023                            uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
2024                            uint32_t raw = uprv_uca_getRawFromImplicit(primary);
2025                            uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
2026                            CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
2027                            SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
2028                        } else {
2029                            /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2030                            ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
2031                        }
2032
2033                        ListList[src->resultLen].baseCE = CE;
2034                        ListList[src->resultLen].baseContCE = SecondCE;
2035                        ListList[src->resultLen].nextCE = 0;
2036                        ListList[src->resultLen].nextContCE = 0;
2037
2038                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2039                    }
2040                }
2041
2042
2043                /*  5 If the relation is a reset:
2044                If sourceToken is null
2045                Create new list, create new sourceToken, make the baseCE from source, put
2046                the sourceToken in ListHeader of the new list */
2047                if(sourceToken == NULL) {
2048                    /*
2049                    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2050                    First convert all expansions into normal form. Examples:
2051                    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2052                    d * ... into &x * c/y * d * ...
2053                    Note: reset values can never have expansions, although they can cause the
2054                    very next item to have one. They may be contractions, if they are found
2055                    earlier in the list.
2056                    */
2057                    if(top == FALSE) {
2058                        collIterate s;
2059                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2060
2061                        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
2062
2063                        CE = ucol_getNextCE(src->UCA, &s, status);
2064                        const UChar *expand = s.pos;
2065                        SecondCE = ucol_getNextCE(src->UCA, &s, status);
2066
2067                        ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
2068                        if(isContinuation(SecondCE)) {
2069                            ListList[src->resultLen].baseContCE = SecondCE;
2070                        } else {
2071                            ListList[src->resultLen].baseContCE = 0;
2072                        }
2073                        ListList[src->resultLen].nextCE = 0;
2074                        ListList[src->resultLen].nextContCE = 0;
2075                        ListList[src->resultLen].previousCE = 0;
2076                        ListList[src->resultLen].previousContCE = 0;
2077                        ListList[src->resultLen].indirect = FALSE;
2078                        sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
2079                    } else { /* top == TRUE */
2080                        /* just use the supplied values */
2081                        top = FALSE;
2082                        ListList[src->resultLen].previousCE = 0;
2083                        ListList[src->resultLen].previousContCE = 0;
2084                        ListList[src->resultLen].indirect = TRUE;
2085                        ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2086                        ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
2087                        ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
2088                        ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
2089
2090                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2091
2092                    }
2093                } else { /* reset to something already in rules */
2094                    top = FALSE;
2095                }
2096            }
2097            /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
2098            lastToken = sourceToken;
2099        } else {
2100            if(U_FAILURE(*status)) {
2101                return 0;
2102            }
2103        }
2104    }
2105#ifdef DEBUG_FOR_CODE_POINTS
2106    fclose(dfcp_fp);
2107#endif
2108
2109
2110    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
2111        src->resultLen--;
2112    }
2113    return src->resultLen;
2114}
2115
2116const UChar* ucol_tok_getRulesFromBundle(
2117    void* /*context*/,
2118    const char* locale,
2119    const char* type,
2120    int32_t* pLength,
2121    UErrorCode* status)
2122{
2123    const UChar* rules = NULL;
2124    UResourceBundle* bundle;
2125    UResourceBundle* collations;
2126    UResourceBundle* collation;
2127
2128    *pLength = 0;
2129
2130    bundle = ures_open(U_ICUDATA_COLL, locale, status);
2131    if(U_SUCCESS(*status)){
2132        collations = ures_getByKey(bundle, "collations", NULL, status);
2133        if(U_SUCCESS(*status)){
2134            collation = ures_getByKey(collations, type, NULL, status);
2135            if(U_SUCCESS(*status)){
2136                rules = ures_getStringByKey(collation, "Sequence", pLength, status);
2137                if(U_FAILURE(*status)){
2138                    *pLength = 0;
2139                    rules = NULL;
2140                }
2141                ures_close(collation);
2142            }
2143            ures_close(collations);
2144        }
2145    }
2146
2147    ures_close(bundle);
2148
2149    return rules;
2150}
2151
2152void ucol_tok_initTokenList(
2153    UColTokenParser *src,
2154    const UChar *rules,
2155    uint32_t rulesLength,
2156    const UCollator *UCA,
2157    GetCollationRulesFunction importFunc,
2158    void* context,
2159    UErrorCode *status) {
2160    U_NAMESPACE_USE
2161
2162    uint32_t nSize = 0;
2163    uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
2164
2165    bool needToDeallocRules = false;
2166
2167    if(U_FAILURE(*status)) {
2168        return;
2169    }
2170
2171    // set everything to zero, so that we can clean up gracefully
2172    uprv_memset(src, 0, sizeof(UColTokenParser));
2173
2174    // first we need to find options that don't like to be normalized,
2175    // like copy and remove...
2176    //const UChar *openBrace = rules;
2177    int32_t optionNumber = -1;
2178    const UChar *setStart = NULL;
2179    uint32_t i = 0;
2180    while(i < rulesLength) {
2181        if(rules[i] == 0x005B) {    // '[': start of an option
2182            /* Gets the following:
2183               optionNumber: The index of the option.
2184               setStart: The pointer at which the option arguments start.
2185             */
2186            optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
2187
2188            if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
2189                // [optimize]
2190                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2191                if(U_SUCCESS(*status)) {
2192                    if(src->copySet == NULL) {
2193                        src->copySet = newSet;
2194                    } else {
2195                        uset_addAll(src->copySet, newSet);
2196                        uset_close(newSet);
2197                    }
2198                } else {
2199                    return;
2200                }
2201            } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
2202                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2203                if(U_SUCCESS(*status)) {
2204                    if(src->removeSet == NULL) {
2205                        src->removeSet = newSet;
2206                    } else {
2207                        uset_addAll(src->removeSet, newSet);
2208                        uset_close(newSet);
2209                    }
2210                } else {
2211                    return;
2212                }
2213            } else if(optionNumber == OPTION_IMPORT){
2214                // [import <collation-name>]
2215
2216                // Find the address of the closing ].
2217                UChar* import_end = u_strchr(setStart, 0x005D);
2218                int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
2219                // Ignore trailing whitespace.
2220                while(PatternProps::isWhiteSpace(*(import_end-1))) {
2221                    --import_end;
2222                }
2223
2224                int32_t optionLength = (int32_t)(import_end - setStart);
2225                char option[50];
2226                if(optionLength >= (int32_t)sizeof(option)) {
2227                    *status = U_ILLEGAL_ARGUMENT_ERROR;
2228                    return;
2229                }
2230                u_UCharsToChars(setStart, option, optionLength);
2231                option[optionLength] = 0;
2232
2233                *status = U_ZERO_ERROR;
2234                char locale[50];
2235                int32_t templ;
2236                uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
2237                if(U_FAILURE(*status)) {
2238                    *status = U_ILLEGAL_ARGUMENT_ERROR;
2239                    return;
2240                }
2241
2242                char type[50];
2243                if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
2244                    U_FAILURE(*status)
2245                ) {
2246                    *status = U_ZERO_ERROR;
2247                    uprv_strcpy(type, "standard");
2248                }
2249
2250                // TODO: Use public functions when available, see ticket #8134.
2251                char *keywords = (char *)locale_getKeywordsStart(locale);
2252                if(keywords != NULL) {
2253                    *keywords = 0;
2254                }
2255
2256                int32_t importRulesLength = 0;
2257                const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
2258
2259#ifdef DEBUG_FOR_COLL_RULES
2260                std::string s;
2261                UnicodeString(importRules).toUTF8String(s);
2262                std::cout << "Import rules = " << s << std::endl;
2263#endif
2264
2265                // Add the length of the imported rules to length of the original rules,
2266                // and subtract the length of the import option.
2267                uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
2268
2269                UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
2270
2271#ifdef DEBUG_FOR_COLL_RULES
2272                std::string s1;
2273                UnicodeString(rules).toUTF8String(s1);
2274                std::cout << "Original rules = " << s1 << std::endl;
2275#endif
2276
2277
2278                // Copy the section of the original rules leading up to the import
2279                uprv_memcpy(newRules, rules, i*sizeof(UChar));
2280                // Copy the imported rules
2281                uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
2282                // Copy the rest of the original rules (minus the import option itself)
2283                uprv_memcpy(newRules+i+importRulesLength,
2284                            rules+optionEndOffset,
2285                            (rulesLength-optionEndOffset)*sizeof(UChar));
2286
2287#ifdef DEBUG_FOR_COLL_RULES
2288                std::string s2;
2289                UnicodeString(newRules).toUTF8String(s2);
2290                std::cout << "Resulting rules = " << s2 << std::endl;
2291#endif
2292
2293                if(needToDeallocRules){
2294                    // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2295                    uprv_free((void*)rules);
2296                }
2297                needToDeallocRules = true;
2298                rules = newRules;
2299                rulesLength = newRulesLength;
2300
2301                estimatedSize += importRulesLength*2;
2302
2303                // First character of the new rules needs to be processed
2304                i--;
2305            }
2306        }
2307        //openBrace++;
2308        i++;
2309    }
2310
2311    src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
2312    /* test for NULL */
2313    if (src->source == NULL) {
2314        *status = U_MEMORY_ALLOCATION_ERROR;
2315        return;
2316    }
2317    uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
2318    nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
2319    if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
2320        *status = U_ZERO_ERROR;
2321        src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
2322        /* test for NULL */
2323        if (src->source == NULL) {
2324            *status = U_MEMORY_ALLOCATION_ERROR;
2325            return;
2326        }
2327        nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
2328    }
2329    if(needToDeallocRules){
2330        // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2331        uprv_free((void*)rules);
2332    }
2333
2334
2335    src->current = src->source;
2336    src->end = src->source+nSize;
2337    src->sourceCurrent = src->source;
2338    src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2339    src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2340    src->varTop = NULL;
2341    src->UCA = UCA;
2342    src->invUCA = ucol_initInverseUCA(status);
2343    src->parsedToken.charsLen = 0;
2344    src->parsedToken.charsOffset = 0;
2345    src->parsedToken.extensionLen = 0;
2346    src->parsedToken.extensionOffset = 0;
2347    src->parsedToken.prefixLen = 0;
2348    src->parsedToken.prefixOffset = 0;
2349    src->parsedToken.flags = 0;
2350    src->parsedToken.strength = UCOL_TOK_UNSET;
2351    src->buildCCTabFlag = FALSE;
2352    src->isStarred = FALSE;
2353    src->inRange = FALSE;
2354    src->lastRangeCp = 0;
2355    src->previousCp = 0;
2356
2357    if(U_FAILURE(*status)) {
2358        return;
2359    }
2360    src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
2361    if(U_FAILURE(*status)) {
2362        return;
2363    }
2364    uhash_setValueDeleter(src->tailored, uprv_free);
2365
2366    src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
2367    /* test for NULL */
2368    if (src->opts == NULL) {
2369        *status = U_MEMORY_ALLOCATION_ERROR;
2370        return;
2371    }
2372
2373    uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
2374
2375    src->lh = 0;
2376    src->listCapacity = 1024;
2377    src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
2378    //Test for NULL
2379    if (src->lh == NULL) {
2380        *status = U_MEMORY_ALLOCATION_ERROR;
2381        return;
2382    }
2383    uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
2384    src->resultLen = 0;
2385
2386    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2387
2388    // UCOL_RESET_TOP_VALUE
2389    setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2390    // UCOL_FIRST_PRIMARY_IGNORABLE
2391    setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
2392    // UCOL_LAST_PRIMARY_IGNORABLE
2393    setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
2394    // UCOL_FIRST_SECONDARY_IGNORABLE
2395    setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
2396    // UCOL_LAST_SECONDARY_IGNORABLE
2397    setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
2398    // UCOL_FIRST_TERTIARY_IGNORABLE
2399    setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
2400    // UCOL_LAST_TERTIARY_IGNORABLE
2401    setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
2402    // UCOL_FIRST_VARIABLE
2403    setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
2404    // UCOL_LAST_VARIABLE
2405    setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
2406    // UCOL_FIRST_NON_VARIABLE
2407    setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
2408    // UCOL_LAST_NON_VARIABLE
2409    setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2410    // UCOL_FIRST_IMPLICIT
2411    setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
2412    // UCOL_LAST_IMPLICIT
2413    setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
2414    // UCOL_FIRST_TRAILING
2415    setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
2416    // UCOL_LAST_TRAILING
2417    setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
2418    ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
2419}
2420
2421
2422void ucol_tok_closeTokenList(UColTokenParser *src) {
2423    if(src->copySet != NULL) {
2424        uset_close(src->copySet);
2425    }
2426    if(src->removeSet != NULL) {
2427        uset_close(src->removeSet);
2428    }
2429    if(src->tailored != NULL) {
2430        uhash_close(src->tailored);
2431    }
2432    if(src->lh != NULL) {
2433        uprv_free(src->lh);
2434    }
2435    if(src->source != NULL) {
2436        uprv_free(src->source);
2437    }
2438    if(src->opts != NULL) {
2439        uprv_free(src->opts);
2440    }
2441    if (src->reorderCodes != NULL) {
2442        uprv_free(src->reorderCodes);
2443    }
2444}
2445
2446#endif /* #if !UCONFIG_NO_COLLATION */
2447