1/*
2 ******************************************************************************
3 *   Copyright (C) 1996-2012, International Business Machines                 *
4 *   Corporation and others.  All Rights Reserved.                            *
5 ******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION
11
12#include "unicode/unistr.h"
13#include "unicode/usearch.h"
14
15#include "cmemory.h"
16#include "unicode/coll.h"
17#include "unicode/tblcoll.h"
18#include "unicode/coleitr.h"
19#include "unicode/ucoleitr.h"
20
21#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
22
23#include "unicode/uniset.h"
24#include "unicode/uset.h"
25#include "unicode/ustring.h"
26#include "hash.h"
27#include "uhash.h"
28#include "ucol_imp.h"
29#include "uassert.h"
30
31#include "colldata.h"
32
33#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
34#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
35#define DELETE_ARRAY(array) uprv_free((void *) (array))
36#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
37
38CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
39    : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
40{
41    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
42    UCollationStrength strength = ucol_getStrength(coll);
43    UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
44    uint32_t variableTop = ucol_getVariableTop(coll, &status);
45    uint32_t strengthMask = 0;
46    int32_t order;
47
48    if (U_FAILURE(status)) {
49        return;
50    }
51
52    // **** only set flag if string has Han(gul) ****
53    ucol_forceHanImplicit(elems, &status);
54
55    switch (strength)
56    {
57    default:
58        strengthMask |= UCOL_TERTIARYORDERMASK;
59        /* fall through */
60
61    case UCOL_SECONDARY:
62        strengthMask |= UCOL_SECONDARYORDERMASK;
63        /* fall through */
64
65    case UCOL_PRIMARY:
66        strengthMask |= UCOL_PRIMARYORDERMASK;
67    }
68
69    ces = ceBuffer;
70
71    while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
72        UBool cont = isContinuation(order);
73
74        order &= strengthMask;
75
76        if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
77            if (strength >= UCOL_QUATERNARY) {
78                order &= UCOL_PRIMARYORDERMASK;
79            } else {
80                order = UCOL_IGNORABLE;
81            }
82        }
83
84        if (order == UCOL_IGNORABLE) {
85            continue;
86        }
87
88        if (cont) {
89            order |= UCOL_CONTINUATION_MARKER;
90        }
91
92        add(order, status);
93    }
94
95    ucol_closeElements(elems);
96}
97
98CEList::~CEList()
99{
100    if (ces != ceBuffer) {
101        DELETE_ARRAY(ces);
102    }
103}
104
105void CEList::add(uint32_t ce, UErrorCode &status)
106{
107    if (U_FAILURE(status)) {
108        return;
109    }
110
111    if (listSize >= listMax) {
112        int32_t newMax = listMax + CELIST_BUFFER_SIZE;
113        uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
114
115        if (newCEs == NULL) {
116            status = U_MEMORY_ALLOCATION_ERROR;
117            return;
118        }
119
120        uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
121
122        if (ces != ceBuffer) {
123            DELETE_ARRAY(ces);
124        }
125
126        ces = newCEs;
127        listMax = newMax;
128    }
129
130    ces[listSize++] = ce;
131}
132
133uint32_t CEList::get(int32_t index) const
134{
135    if (index >= 0 && index < listSize) {
136        return ces[index];
137    }
138
139    return (uint32_t)UCOL_NULLORDER;
140}
141
142uint32_t &CEList::operator[](int32_t index) const
143{
144    return ces[index];
145}
146
147UBool CEList::matchesAt(int32_t offset, const CEList *other) const
148{
149    if (other == NULL || listSize - offset < other->size()) {
150        return FALSE;
151    }
152
153    for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
154        if (ces[i] != (*other)[j]) {
155            return FALSE;
156        }
157    }
158
159    return TRUE;
160}
161
162int32_t CEList::size() const
163{
164    return listSize;
165}
166
167StringList::StringList(UErrorCode &status)
168    : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
169{
170    if (U_FAILURE(status)) {
171        return;
172    }
173
174    strings = new UnicodeString [listMax];
175
176    if (strings == NULL) {
177        status = U_MEMORY_ALLOCATION_ERROR;
178        return;
179    }
180}
181
182StringList::~StringList()
183{
184    delete[] strings;
185}
186
187void StringList::add(const UnicodeString *string, UErrorCode &status)
188{
189    if (U_FAILURE(status)) {
190        return;
191    }
192    if (listSize >= listMax) {
193        int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
194        UnicodeString *newStrings = new UnicodeString[newMax];
195        if (newStrings == NULL) {
196            status = U_MEMORY_ALLOCATION_ERROR;
197            return;
198        }
199        for (int32_t i=0; i<listSize; ++i) {
200            newStrings[i] = strings[i];
201        }
202        delete[] strings;
203        strings = newStrings;
204        listMax = newMax;
205    }
206
207    // The ctor initialized all the strings in
208    // the array to empty strings, so this
209    // is the same as copying the source string.
210    strings[listSize++].append(*string);
211}
212
213void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
214{
215    const UnicodeString string(chars, count);
216
217    add(&string, status);
218}
219
220const UnicodeString *StringList::get(int32_t index) const
221{
222    if (index >= 0 && index < listSize) {
223        return &strings[index];
224    }
225
226    return NULL;
227}
228
229int32_t StringList::size() const
230{
231    return listSize;
232}
233
234
235U_CDECL_BEGIN
236static void U_CALLCONV
237deleteStringList(void *obj)
238{
239    StringList *strings = (StringList *) obj;
240
241    delete strings;
242}
243U_CDECL_END
244
245class CEToStringsMap
246{
247public:
248    CEToStringsMap(UErrorCode &status);
249    ~CEToStringsMap();
250
251    void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
252    StringList *getStringList(uint32_t ce) const;
253
254private:
255    void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
256    UHashtable *map;
257};
258
259CEToStringsMap::CEToStringsMap(UErrorCode &status)
260    : map(NULL)
261{
262    if (U_FAILURE(status)) {
263        return;
264    }
265
266    map = uhash_open(uhash_hashLong, uhash_compareLong,
267                     uhash_compareCaselessUnicodeString,
268                     &status);
269
270    if (U_FAILURE(status)) {
271        return;
272    }
273
274    uhash_setValueDeleter(map, deleteStringList);
275}
276
277CEToStringsMap::~CEToStringsMap()
278{
279    uhash_close(map);
280}
281
282void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
283{
284    StringList *strings = getStringList(ce);
285
286    if (strings == NULL) {
287        strings = new StringList(status);
288
289        if (strings == NULL || U_FAILURE(status)) {
290            status = U_MEMORY_ALLOCATION_ERROR;
291            return;
292        }
293
294        putStringList(ce, strings, status);
295    }
296
297    strings->add(string, status);
298}
299
300StringList *CEToStringsMap::getStringList(uint32_t ce) const
301{
302    return (StringList *) uhash_iget(map, ce);
303}
304
305void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
306{
307    uhash_iput(map, ce, (void *) stringList, &status);
308}
309
310#define CLONE_COLLATOR
311
312CollData::CollData(UCollator *collator, UErrorCode &status)
313    : coll(NULL), ceToCharsStartingWith(NULL)
314{
315    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
316    // i.e. other, control, private use, format, surrogate
317    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
318    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
319    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
320
321    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
322    // i.e. all the characers we handle implicitly
323    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
324    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
325    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
326
327    if (U_FAILURE(status)) {
328        return;
329    }
330
331    USet *expansions   = uset_openEmpty();
332    USet *contractions = uset_openEmpty();
333    int32_t itemCount;
334
335    ceToCharsStartingWith = new CEToStringsMap(status);
336
337    if (U_FAILURE(status)) {
338        goto bail;
339    }
340
341#ifdef CLONE_COLLATOR
342    coll = ucol_safeClone(collator, NULL, NULL, &status);
343
344    if (U_FAILURE(status)) {
345        goto bail;
346    }
347#else
348    coll = collator;
349#endif
350
351    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
352
353    uset_addAll(charsToTest, contractions);
354    uset_addAll(charsToTest, expansions);
355    uset_removeAll(charsToTest, charsToRemove);
356
357    itemCount = uset_getItemCount(charsToTest);
358    for(int32_t item = 0; item < itemCount; item += 1) {
359        UChar32 start = 0, end = 0;
360        UChar buffer[16];
361        int32_t len = uset_getItem(charsToTest, item, &start, &end,
362                                   buffer, 16, &status);
363
364        if (len == 0) {
365            for (UChar32 ch = start; ch <= end; ch += 1) {
366                UnicodeString *st = new UnicodeString(ch);
367
368                if (st == NULL) {
369                    status = U_MEMORY_ALLOCATION_ERROR;
370                    break;
371                }
372
373                CEList *ceList = new CEList(coll, *st, status);
374
375                ceToCharsStartingWith->put(ceList->get(0), st, status);
376
377                delete ceList;
378                delete st;
379            }
380        } else if (len > 0) {
381            UnicodeString *st = new UnicodeString(buffer, len);
382
383            if (st == NULL) {
384                status = U_MEMORY_ALLOCATION_ERROR;
385                break;
386            }
387
388            CEList *ceList = new CEList(coll, *st, status);
389
390            ceToCharsStartingWith->put(ceList->get(0), st, status);
391
392            delete ceList;
393            delete st;
394        } else {
395            // shouldn't happen...
396        }
397
398        if (U_FAILURE(status)) {
399             break;
400        }
401    }
402
403bail:
404    uset_close(contractions);
405    uset_close(expansions);
406    uset_close(charsToRemove);
407    uset_close(charsToTest);
408
409    if (U_FAILURE(status)) {
410        return;
411    }
412
413     UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
414                            UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
415     UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
416     UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
417     UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
418     CEList hanList(coll, hanString, status);
419     CEList jamoList(coll, jamoString, status);
420     int32_t j = 0;
421
422     if (U_FAILURE(status)) {
423         return;
424     }
425
426     for (int32_t c = 0; c < jamoList.size(); c += 1) {
427         uint32_t jce = jamoList[c];
428
429         if (! isContinuation(jce)) {
430             jamoLimits[j++] = jce;
431         }
432     }
433
434     jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
435
436     minHan = 0xFFFFFFFF;
437     maxHan = 0;
438
439     for(int32_t h = 0; h < hanList.size(); h += 2) {
440         uint32_t han = (uint32_t) hanList[h];
441
442         if (han < minHan) {
443             minHan = han;
444         }
445
446         if (han > maxHan) {
447             maxHan = han;
448         }
449     }
450
451     maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
452}
453
454CollData::~CollData()
455{
456#ifdef CLONE_COLLATOR
457   ucol_close(coll);
458#endif
459
460   delete ceToCharsStartingWith;
461}
462
463UCollator *CollData::getCollator() const
464{
465    return coll;
466}
467
468const StringList *CollData::getStringList(int32_t ce) const
469{
470    return ceToCharsStartingWith->getStringList(ce);
471}
472
473const CEList *CollData::getCEList(const UnicodeString *string) const
474{
475    UErrorCode status = U_ZERO_ERROR;
476    const CEList *list = new CEList(coll, *string, status);
477
478    if (U_FAILURE(status)) {
479        delete list;
480        list = NULL;
481    }
482
483    return list;
484}
485
486void CollData::freeCEList(const CEList *list)
487{
488    delete list;
489}
490
491int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
492{
493    // find out shortest string for the longest sequence of ces.
494    // this can probably be folded with the minLengthCache...
495
496    if (history[offset] >= 0) {
497        return history[offset];
498    }
499
500    uint32_t ce = ceList->get(offset);
501    int32_t maxOffset = ceList->size();
502    int32_t shortestLength = INT32_MAX;
503    const StringList *strings = ceToCharsStartingWith->getStringList(ce);
504
505    if (strings != NULL) {
506        int32_t stringCount = strings->size();
507
508        for (int32_t s = 0; s < stringCount; s += 1) {
509            const UnicodeString *string = strings->get(s);
510            UErrorCode status = U_ZERO_ERROR;
511            const CEList *ceList2 = new CEList(coll, *string, status);
512
513            if (U_FAILURE(status)) {
514                delete ceList2;
515                ceList2 = NULL;
516            }
517
518            if (ceList->matchesAt(offset, ceList2)) {
519                U_ASSERT(ceList2 != NULL);
520                int32_t clength = ceList2->size();
521                int32_t slength = string->length();
522                int32_t roffset = offset + clength;
523                int32_t rlength = 0;
524
525                if (roffset < maxOffset) {
526                    rlength = minLengthInChars(ceList, roffset, history);
527
528                    if (rlength <= 0) {
529                    // delete before continue to avoid memory leak.
530                        delete ceList2;
531
532                        // ignore any dead ends
533                        continue;
534                    }
535                }
536
537                if (shortestLength > slength + rlength) {
538                    shortestLength = slength + rlength;
539                }
540            }
541
542            delete ceList2;
543        }
544    }
545
546    if (shortestLength == INT32_MAX) {
547        // No matching strings at this offset. See if
548        // the CE is in a range we can handle manually.
549        if (ce >= minHan && ce < maxHan) {
550            // all han have implicit orders which
551            // generate two CEs.
552            int32_t roffset = offset + 2;
553            int32_t rlength = 0;
554
555          //history[roffset++] = -1;
556          //history[roffset++] = 1;
557
558            if (roffset < maxOffset) {
559                rlength = minLengthInChars(ceList, roffset, history);
560            }
561
562            if (rlength < 0) {
563                return -1;
564            }
565
566            shortestLength = 1 + rlength;
567            goto have_shortest;
568        } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
569            int32_t roffset = offset;
570            int32_t rlength = 0;
571
572            // **** this loop may not handle archaic Hangul correctly ****
573            for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
574                uint32_t jce = ceList->get(roffset);
575
576                // Some Jamo have 24-bit primary order; skip the
577                // 2nd CE. This should always be OK because if
578                // we're still in the loop all we've seen are
579                // a series of Jamo in LVT order.
580                if (isContinuation(jce)) {
581                    continue;
582                }
583
584                if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
585                    break;
586                }
587            }
588
589            if (roffset == offset) {
590                // we started with a non-L Jamo...
591                // just say it comes from a single character
592                roffset += 1;
593
594                // See if the single Jamo has a 24-bit order.
595                if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
596                    roffset += 1;
597                }
598            }
599
600            if (roffset < maxOffset) {
601                rlength = minLengthInChars(ceList, roffset, history);
602            }
603
604            if (rlength < 0) {
605                return -1;
606            }
607
608            shortestLength = 1 + rlength;
609            goto have_shortest;
610        }
611
612        // Can't handle it manually either. Just move on.
613        return -1;
614    }
615
616have_shortest:
617    history[offset] = shortestLength;
618
619    return shortestLength;
620}
621
622int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
623{
624    int32_t clength = ceList->size();
625    int32_t *history = NEW_ARRAY(int32_t, clength);
626
627    for (int32_t i = 0; i < clength; i += 1) {
628        history[i] = -1;
629    }
630
631    int32_t minLength = minLengthInChars(ceList, offset, history);
632
633    DELETE_ARRAY(history);
634
635    return minLength;
636}
637
638#endif // #if !UCONFIG_NO_COLLATION
639