1/*
2 ******************************************************************************
3 *   Copyright (C) 1996-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 ******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION
11
12#include "unicode/unistr.h"
13#include "unicode/usearch.h"
14
15#include "cmemory.h"
16#include "unicode/coll.h"
17#include "unicode/tblcoll.h"
18#include "unicode/coleitr.h"
19#include "unicode/ucoleitr.h"
20
21#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
22
23#include "unicode/uniset.h"
24#include "unicode/uset.h"
25#include "unicode/usetiter.h"
26#include "unicode/ustring.h"
27#include "hash.h"
28#include "normalizer2impl.h"
29#include "uhash.h"
30#include "usrchimp.h"
31#include "uassert.h"
32
33#include "colldata.h"
34
35#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
36#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
37#define DELETE_ARRAY(array) uprv_free((void *) (array))
38#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
39
40CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
41    : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
42{
43    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
44    UCollationStrength strength = ucol_getStrength(coll);
45    UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
46    uint32_t variableTop = ucol_getVariableTop(coll, &status);
47    uint32_t strengthMask = 0;
48    int32_t order;
49
50    if (U_FAILURE(status)) {
51        return;
52    }
53
54    // **** only set flag if string has Han(gul) ****
55    // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
56
57    switch (strength)
58    {
59    default:
60        strengthMask |= UCOL_TERTIARYORDERMASK;
61        /* fall through */
62
63    case UCOL_SECONDARY:
64        strengthMask |= UCOL_SECONDARYORDERMASK;
65        /* fall through */
66
67    case UCOL_PRIMARY:
68        strengthMask |= UCOL_PRIMARYORDERMASK;
69    }
70
71    ces = ceBuffer;
72
73    while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
74        UBool cont = isContinuation(order);
75
76        order &= strengthMask;
77
78        if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
79            if (strength >= UCOL_QUATERNARY) {
80                order &= UCOL_PRIMARYORDERMASK;
81            } else {
82                order = UCOL_IGNORABLE;
83            }
84        }
85
86        if (order == UCOL_IGNORABLE) {
87            continue;
88        }
89
90        if (cont) {
91            order |= UCOL_CONTINUATION_MARKER;
92        }
93
94        add(order, status);
95    }
96
97    ucol_closeElements(elems);
98}
99
100CEList::~CEList()
101{
102    if (ces != ceBuffer) {
103        DELETE_ARRAY(ces);
104    }
105}
106
107void CEList::add(uint32_t ce, UErrorCode &status)
108{
109    if (U_FAILURE(status)) {
110        return;
111    }
112
113    if (listSize >= listMax) {
114        int32_t newMax = listMax + CELIST_BUFFER_SIZE;
115        uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
116
117        if (newCEs == NULL) {
118            status = U_MEMORY_ALLOCATION_ERROR;
119            return;
120        }
121
122        uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
123
124        if (ces != ceBuffer) {
125            DELETE_ARRAY(ces);
126        }
127
128        ces = newCEs;
129        listMax = newMax;
130    }
131
132    ces[listSize++] = ce;
133}
134
135uint32_t CEList::get(int32_t index) const
136{
137    if (index >= 0 && index < listSize) {
138        return ces[index];
139    }
140
141    return (uint32_t)UCOL_NULLORDER;
142}
143
144uint32_t &CEList::operator[](int32_t index) const
145{
146    return ces[index];
147}
148
149UBool CEList::matchesAt(int32_t offset, const CEList *other) const
150{
151    if (other == NULL || listSize - offset < other->size()) {
152        return FALSE;
153    }
154
155    for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
156        if (ces[i] != (*other)[j]) {
157            return FALSE;
158        }
159    }
160
161    return TRUE;
162}
163
164int32_t CEList::size() const
165{
166    return listSize;
167}
168
169StringList::StringList(UErrorCode &status)
170    : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
171{
172    if (U_FAILURE(status)) {
173        return;
174    }
175
176    strings = new UnicodeString [listMax];
177
178    if (strings == NULL) {
179        status = U_MEMORY_ALLOCATION_ERROR;
180        return;
181    }
182}
183
184StringList::~StringList()
185{
186    delete[] strings;
187}
188
189void StringList::add(const UnicodeString *string, UErrorCode &status)
190{
191    if (U_FAILURE(status)) {
192        return;
193    }
194    if (listSize >= listMax) {
195        int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
196        UnicodeString *newStrings = new UnicodeString[newMax];
197        if (newStrings == NULL) {
198            status = U_MEMORY_ALLOCATION_ERROR;
199            return;
200        }
201        for (int32_t i=0; i<listSize; ++i) {
202            newStrings[i] = strings[i];
203        }
204        delete[] strings;
205        strings = newStrings;
206        listMax = newMax;
207    }
208
209    // The ctor initialized all the strings in
210    // the array to empty strings, so this
211    // is the same as copying the source string.
212    strings[listSize++].append(*string);
213}
214
215void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
216{
217    const UnicodeString string(chars, count);
218
219    add(&string, status);
220}
221
222const UnicodeString *StringList::get(int32_t index) const
223{
224    if (index >= 0 && index < listSize) {
225        return &strings[index];
226    }
227
228    return NULL;
229}
230
231int32_t StringList::size() const
232{
233    return listSize;
234}
235
236
237U_CDECL_BEGIN
238static void U_CALLCONV
239deleteStringList(void *obj)
240{
241    StringList *strings = (StringList *) obj;
242
243    delete strings;
244}
245U_CDECL_END
246
247class CEToStringsMap
248{
249public:
250    CEToStringsMap(UErrorCode &status);
251    ~CEToStringsMap();
252
253    void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
254    StringList *getStringList(uint32_t ce) const;
255
256private:
257    void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
258    UHashtable *map;
259};
260
261CEToStringsMap::CEToStringsMap(UErrorCode &status)
262    : map(NULL)
263{
264    if (U_FAILURE(status)) {
265        return;
266    }
267
268    map = uhash_open(uhash_hashLong, uhash_compareLong,
269                     uhash_compareCaselessUnicodeString,
270                     &status);
271
272    if (U_FAILURE(status)) {
273        return;
274    }
275
276    uhash_setValueDeleter(map, deleteStringList);
277}
278
279CEToStringsMap::~CEToStringsMap()
280{
281    uhash_close(map);
282}
283
284void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
285{
286    StringList *strings = getStringList(ce);
287
288    if (strings == NULL) {
289        strings = new StringList(status);
290
291        if (strings == NULL || U_FAILURE(status)) {
292            status = U_MEMORY_ALLOCATION_ERROR;
293            return;
294        }
295
296        putStringList(ce, strings, status);
297    }
298
299    strings->add(string, status);
300}
301
302StringList *CEToStringsMap::getStringList(uint32_t ce) const
303{
304    return (StringList *) uhash_iget(map, ce);
305}
306
307void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
308{
309    uhash_iput(map, ce, (void *) stringList, &status);
310}
311
312#define CLONE_COLLATOR
313
314CollData::CollData(UCollator *collator, UErrorCode &status)
315    : coll(NULL), ceToCharsStartingWith(NULL)
316{
317    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
318    // i.e. other, control, private use, format, surrogate
319    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
320    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
321    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
322
323    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
324    // i.e. all the characers we handle implicitly
325    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
326    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
327    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
328
329    if (U_FAILURE(status)) {
330        return;
331    }
332
333    USet *expansions   = uset_openEmpty();
334    USet *contractions = uset_openEmpty();
335    int32_t itemCount;
336
337    ceToCharsStartingWith = new CEToStringsMap(status);
338
339    if (U_FAILURE(status)) {
340        goto bail;
341    }
342
343#ifdef CLONE_COLLATOR
344    coll = ucol_safeClone(collator, NULL, NULL, &status);
345
346    if (U_FAILURE(status)) {
347        goto bail;
348    }
349#else
350    coll = collator;
351#endif
352
353    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
354
355    uset_addAll(charsToTest, contractions);
356    uset_addAll(charsToTest, expansions);
357    uset_removeAll(charsToTest, charsToRemove);
358
359    itemCount = uset_getItemCount(charsToTest);
360    for(int32_t item = 0; item < itemCount; item += 1) {
361        UChar32 start = 0, end = 0;
362        UChar buffer[16];
363        int32_t len = uset_getItem(charsToTest, item, &start, &end,
364                                   buffer, 16, &status);
365
366        if (len == 0) {
367            for (UChar32 ch = start; ch <= end; ch += 1) {
368                UnicodeString *st = new UnicodeString(ch);
369
370                if (st == NULL) {
371                    status = U_MEMORY_ALLOCATION_ERROR;
372                    break;
373                }
374
375                CEList *ceList = new CEList(coll, *st, status);
376
377                ceToCharsStartingWith->put(ceList->get(0), st, status);
378
379                delete ceList;
380                delete st;
381            }
382        } else if (len > 0) {
383            UnicodeString *st = new UnicodeString(buffer, len);
384
385            if (st == NULL) {
386                status = U_MEMORY_ALLOCATION_ERROR;
387                break;
388            }
389
390            CEList *ceList = new CEList(coll, *st, status);
391
392            ceToCharsStartingWith->put(ceList->get(0), st, status);
393
394            delete ceList;
395            delete st;
396        } else {
397            // shouldn't happen...
398        }
399
400        if (U_FAILURE(status)) {
401             break;
402        }
403    }
404
405bail:
406    uset_close(contractions);
407    uset_close(expansions);
408    uset_close(charsToRemove);
409    uset_close(charsToTest);
410
411    if (U_FAILURE(status)) {
412        return;
413    }
414
415    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
416    if (U_FAILURE(status)) {
417        return;
418    }
419    UnicodeSetIterator hanIter(hanRanges);
420    UnicodeString hanString;
421    while(hanIter.nextRange()) {
422        hanString.append(hanIter.getCodepoint());
423        hanString.append(hanIter.getCodepointEnd());
424    }
425    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
426    // but as of Unicode 6.3 the 11xx block is filled,
427    // and there are also more Jamo T at U+D7CB..U+D7FB.
428    // Maybe use [:HST=T:] and look for the end of the last range?
429    // Maybe use script boundary mappings instead of this code??
430    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
431     UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
432     CEList hanList(coll, hanString, status);
433     CEList jamoList(coll, jamoString, status);
434     int32_t j = 0;
435
436     if (U_FAILURE(status)) {
437         return;
438     }
439
440     for (int32_t c = 0; c < jamoList.size(); c += 1) {
441         uint32_t jce = jamoList[c];
442
443         if (! isContinuation(jce)) {
444             jamoLimits[j++] = jce;
445         }
446     }
447
448     jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
449
450     minHan = 0xFFFFFFFF;
451     maxHan = 0;
452
453     for(int32_t h = 0; h < hanList.size(); h += 2) {
454         uint32_t han = (uint32_t) hanList[h];
455
456         if (han < minHan) {
457             minHan = han;
458         }
459
460         if (han > maxHan) {
461             maxHan = han;
462         }
463     }
464
465     maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
466}
467
468CollData::~CollData()
469{
470#ifdef CLONE_COLLATOR
471   ucol_close(coll);
472#endif
473
474   delete ceToCharsStartingWith;
475}
476
477UCollator *CollData::getCollator() const
478{
479    return coll;
480}
481
482const StringList *CollData::getStringList(int32_t ce) const
483{
484    return ceToCharsStartingWith->getStringList(ce);
485}
486
487const CEList *CollData::getCEList(const UnicodeString *string) const
488{
489    UErrorCode status = U_ZERO_ERROR;
490    const CEList *list = new CEList(coll, *string, status);
491
492    if (U_FAILURE(status)) {
493        delete list;
494        list = NULL;
495    }
496
497    return list;
498}
499
500void CollData::freeCEList(const CEList *list)
501{
502    delete list;
503}
504
505int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
506{
507    // find out shortest string for the longest sequence of ces.
508    // this can probably be folded with the minLengthCache...
509
510    if (history[offset] >= 0) {
511        return history[offset];
512    }
513
514    uint32_t ce = ceList->get(offset);
515    int32_t maxOffset = ceList->size();
516    int32_t shortestLength = INT32_MAX;
517    const StringList *strings = ceToCharsStartingWith->getStringList(ce);
518
519    if (strings != NULL) {
520        int32_t stringCount = strings->size();
521
522        for (int32_t s = 0; s < stringCount; s += 1) {
523            const UnicodeString *string = strings->get(s);
524            UErrorCode status = U_ZERO_ERROR;
525            const CEList *ceList2 = new CEList(coll, *string, status);
526
527            if (U_FAILURE(status)) {
528                delete ceList2;
529                ceList2 = NULL;
530            }
531
532            if (ceList->matchesAt(offset, ceList2)) {
533                U_ASSERT(ceList2 != NULL);
534                int32_t clength = ceList2->size();
535                int32_t slength = string->length();
536                int32_t roffset = offset + clength;
537                int32_t rlength = 0;
538
539                if (roffset < maxOffset) {
540                    rlength = minLengthInChars(ceList, roffset, history);
541
542                    if (rlength <= 0) {
543                    // delete before continue to avoid memory leak.
544                        delete ceList2;
545
546                        // ignore any dead ends
547                        continue;
548                    }
549                }
550
551                if (shortestLength > slength + rlength) {
552                    shortestLength = slength + rlength;
553                }
554            }
555
556            delete ceList2;
557        }
558    }
559
560    if (shortestLength == INT32_MAX) {
561        // No matching strings at this offset. See if
562        // the CE is in a range we can handle manually.
563        if (ce >= minHan && ce < maxHan) {
564            // all han have implicit orders which
565            // generate two CEs.
566            int32_t roffset = offset + 2;
567            int32_t rlength = 0;
568
569          //history[roffset++] = -1;
570          //history[roffset++] = 1;
571
572            if (roffset < maxOffset) {
573                rlength = minLengthInChars(ceList, roffset, history);
574            }
575
576            if (rlength < 0) {
577                return -1;
578            }
579
580            shortestLength = 1 + rlength;
581            goto have_shortest;
582        } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
583            int32_t roffset = offset;
584            int32_t rlength = 0;
585
586            // **** this loop may not handle archaic Hangul correctly ****
587            for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
588                uint32_t jce = ceList->get(roffset);
589
590                // Some Jamo have 24-bit primary order; skip the
591                // 2nd CE. This should always be OK because if
592                // we're still in the loop all we've seen are
593                // a series of Jamo in LVT order.
594                if (isContinuation(jce)) {
595                    continue;
596                }
597
598                if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
599                    break;
600                }
601            }
602
603            if (roffset == offset) {
604                // we started with a non-L Jamo...
605                // just say it comes from a single character
606                roffset += 1;
607
608                // See if the single Jamo has a 24-bit order.
609                if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
610                    roffset += 1;
611                }
612            }
613
614            if (roffset < maxOffset) {
615                rlength = minLengthInChars(ceList, roffset, history);
616            }
617
618            if (rlength < 0) {
619                return -1;
620            }
621
622            shortestLength = 1 + rlength;
623            goto have_shortest;
624        }
625
626        // Can't handle it manually either. Just move on.
627        return -1;
628    }
629
630have_shortest:
631    history[offset] = shortestLength;
632
633    return shortestLength;
634}
635
636int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
637{
638    int32_t clength = ceList->size();
639    int32_t *history = NEW_ARRAY(int32_t, clength);
640
641    for (int32_t i = 0; i < clength; i += 1) {
642        history[i] = -1;
643    }
644
645    int32_t minLength = minLengthInChars(ceList, offset, history);
646
647    DELETE_ARRAY(history);
648
649    return minLength;
650}
651
652#endif // #if !UCONFIG_NO_COLLATION
653