1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2013-2014, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationsets.cpp
9*
10* created on: 2013feb09
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/ucharstrie.h"
19#include "unicode/uniset.h"
20#include "unicode/unistr.h"
21#include "unicode/ustringtrie.h"
22#include "collation.h"
23#include "collationdata.h"
24#include "collationsets.h"
25#include "normalizer2impl.h"
26#include "uassert.h"
27#include "utf16collationiterator.h"
28#include "utrie2.h"
29
30U_NAMESPACE_BEGIN
31
32U_CDECL_BEGIN
33
34static UBool U_CALLCONV
35enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
36    if(ce32 == Collation::FALLBACK_CE32) {
37        return TRUE;  // fallback to base, not tailored
38    }
39    TailoredSet *ts = (TailoredSet *)context;
40    return ts->handleCE32(start, end, ce32);
41}
42
43U_CDECL_END
44
45void
46TailoredSet::forData(const CollationData *d, UErrorCode &ec) {
47    if(U_FAILURE(ec)) { return; }
48    errorCode = ec;  // Preserve info & warning codes.
49    data = d;
50    baseData = d->base;
51    U_ASSERT(baseData != NULL);
52    utrie2_enum(data->trie, NULL, enumTailoredRange, this);
53    ec = errorCode;
54}
55
56UBool
57TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
58    U_ASSERT(ce32 != Collation::FALLBACK_CE32);
59    if(Collation::isSpecialCE32(ce32)) {
60        ce32 = data->getIndirectCE32(ce32);
61        if(ce32 == Collation::FALLBACK_CE32) {
62            return U_SUCCESS(errorCode);
63        }
64    }
65    do {
66        uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start));
67        // Do not just continue if ce32 == baseCE32 because
68        // contractions and expansions in different data objects
69        // normally differ even if they have the same data offsets.
70        if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) {
71            // fastpath
72            if(ce32 != baseCE32) {
73                tailored->add(start);
74            }
75        } else {
76            compare(start, ce32, baseCE32);
77        }
78    } while(++start <= end);
79    return U_SUCCESS(errorCode);
80}
81
82void
83TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) {
84    if(Collation::isPrefixCE32(ce32)) {
85        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
86        ce32 = data->getFinalCE32(CollationData::readCE32(p));
87        if(Collation::isPrefixCE32(baseCE32)) {
88            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
89            baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
90            comparePrefixes(c, p + 2, q + 2);
91        } else {
92            addPrefixes(data, c, p + 2);
93        }
94    } else if(Collation::isPrefixCE32(baseCE32)) {
95        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
96        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
97        addPrefixes(baseData, c, q + 2);
98    }
99
100    if(Collation::isContractionCE32(ce32)) {
101        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
102        if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
103            ce32 = Collation::NO_CE32;
104        } else {
105            ce32 = data->getFinalCE32(CollationData::readCE32(p));
106        }
107        if(Collation::isContractionCE32(baseCE32)) {
108            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
109            if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
110                baseCE32 = Collation::NO_CE32;
111            } else {
112                baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
113            }
114            compareContractions(c, p + 2, q + 2);
115        } else {
116            addContractions(c, p + 2);
117        }
118    } else if(Collation::isContractionCE32(baseCE32)) {
119        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
120        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
121        addContractions(c, q + 2);
122    }
123
124    int32_t tag;
125    if(Collation::isSpecialCE32(ce32)) {
126        tag = Collation::tagFromCE32(ce32);
127        U_ASSERT(tag != Collation::PREFIX_TAG);
128        U_ASSERT(tag != Collation::CONTRACTION_TAG);
129        // Currently, the tailoring data builder does not write offset tags.
130        // They might be useful for saving space,
131        // but they would complicate the builder,
132        // and in tailorings we assume that performance of tailored characters is more important.
133        U_ASSERT(tag != Collation::OFFSET_TAG);
134    } else {
135        tag = -1;
136    }
137    int32_t baseTag;
138    if(Collation::isSpecialCE32(baseCE32)) {
139        baseTag = Collation::tagFromCE32(baseCE32);
140        U_ASSERT(baseTag != Collation::PREFIX_TAG);
141        U_ASSERT(baseTag != Collation::CONTRACTION_TAG);
142    } else {
143        baseTag = -1;
144    }
145
146    // Non-contextual mappings, expansions, etc.
147    if(baseTag == Collation::OFFSET_TAG) {
148        // We might be comparing a tailoring CE which is a copy of
149        // a base offset-tag CE, via the [optimize [set]] syntax
150        // or when a single-character mapping was copied for tailored contractions.
151        // Offset tags always result in long-primary CEs,
152        // with common secondary/tertiary weights.
153        if(!Collation::isLongPrimaryCE32(ce32)) {
154            add(c);
155            return;
156        }
157        int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)];
158        uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
159        if(Collation::primaryFromLongPrimaryCE32(ce32) != p) {
160            add(c);
161            return;
162        }
163    }
164
165    if(tag != baseTag) {
166        add(c);
167        return;
168    }
169
170    if(tag == Collation::EXPANSION32_TAG) {
171        const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
172        int32_t length = Collation::lengthFromCE32(ce32);
173
174        const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32);
175        int32_t baseLength = Collation::lengthFromCE32(baseCE32);
176
177        if(length != baseLength) {
178            add(c);
179            return;
180        }
181        for(int32_t i = 0; i < length; ++i) {
182            if(ce32s[i] != baseCE32s[i]) {
183                add(c);
184                break;
185            }
186        }
187    } else if(tag == Collation::EXPANSION_TAG) {
188        const int64_t *ces = data->ces + Collation::indexFromCE32(ce32);
189        int32_t length = Collation::lengthFromCE32(ce32);
190
191        const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32);
192        int32_t baseLength = Collation::lengthFromCE32(baseCE32);
193
194        if(length != baseLength) {
195            add(c);
196            return;
197        }
198        for(int32_t i = 0; i < length; ++i) {
199            if(ces[i] != baseCEs[i]) {
200                add(c);
201                break;
202            }
203        }
204    } else if(tag == Collation::HANGUL_TAG) {
205        UChar jamos[3];
206        int32_t length = Hangul::decompose(c, jamos);
207        if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) ||
208                (length == 3 && tailored->contains(jamos[2]))) {
209            add(c);
210        }
211    } else if(ce32 != baseCE32) {
212        add(c);
213    }
214}
215
216void
217TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) {
218    // Parallel iteration over prefixes of both tables.
219    UCharsTrie::Iterator prefixes(p, 0, errorCode);
220    UCharsTrie::Iterator basePrefixes(q, 0, errorCode);
221    const UnicodeString *tp = NULL;  // Tailoring prefix.
222    const UnicodeString *bp = NULL;  // Base prefix.
223    // Use a string with a U+FFFF as the limit sentinel.
224    // U+FFFF is untailorable and will not occur in prefixes.
225    UnicodeString none((UChar)0xffff);
226    for(;;) {
227        if(tp == NULL) {
228            if(prefixes.next(errorCode)) {
229                tp = &prefixes.getString();
230            } else {
231                tp = &none;
232            }
233        }
234        if(bp == NULL) {
235            if(basePrefixes.next(errorCode)) {
236                bp = &basePrefixes.getString();
237            } else {
238                bp = &none;
239            }
240        }
241        if(tp == &none && bp == &none) { break; }
242        int32_t cmp = tp->compare(*bp);
243        if(cmp < 0) {
244            // tp occurs in the tailoring but not in the base.
245            addPrefix(data, *tp, c, (uint32_t)prefixes.getValue());
246            tp = NULL;
247        } else if(cmp > 0) {
248            // bp occurs in the base but not in the tailoring.
249            addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue());
250            bp = NULL;
251        } else {
252            setPrefix(*tp);
253            compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue());
254            resetPrefix();
255            tp = NULL;
256            bp = NULL;
257        }
258    }
259}
260
261void
262TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) {
263    // Parallel iteration over suffixes of both tables.
264    UCharsTrie::Iterator suffixes(p, 0, errorCode);
265    UCharsTrie::Iterator baseSuffixes(q, 0, errorCode);
266    const UnicodeString *ts = NULL;  // Tailoring suffix.
267    const UnicodeString *bs = NULL;  // Base suffix.
268    // Use a string with two U+FFFF as the limit sentinel.
269    // U+FFFF is untailorable and will not occur in contractions except maybe
270    // as a single suffix character for a root-collator boundary contraction.
271    UnicodeString none((UChar)0xffff);
272    none.append((UChar)0xffff);
273    for(;;) {
274        if(ts == NULL) {
275            if(suffixes.next(errorCode)) {
276                ts = &suffixes.getString();
277            } else {
278                ts = &none;
279            }
280        }
281        if(bs == NULL) {
282            if(baseSuffixes.next(errorCode)) {
283                bs = &baseSuffixes.getString();
284            } else {
285                bs = &none;
286            }
287        }
288        if(ts == &none && bs == &none) { break; }
289        int32_t cmp = ts->compare(*bs);
290        if(cmp < 0) {
291            // ts occurs in the tailoring but not in the base.
292            addSuffix(c, *ts);
293            ts = NULL;
294        } else if(cmp > 0) {
295            // bs occurs in the base but not in the tailoring.
296            addSuffix(c, *bs);
297            bs = NULL;
298        } else {
299            suffix = ts;
300            compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue());
301            suffix = NULL;
302            ts = NULL;
303            bs = NULL;
304        }
305    }
306}
307
308void
309TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) {
310    UCharsTrie::Iterator prefixes(p, 0, errorCode);
311    while(prefixes.next(errorCode)) {
312        addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue());
313    }
314}
315
316void
317TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) {
318    setPrefix(pfx);
319    ce32 = d->getFinalCE32(ce32);
320    if(Collation::isContractionCE32(ce32)) {
321        const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
322        addContractions(c, p + 2);
323    }
324    tailored->add(UnicodeString(unreversedPrefix).append(c));
325    resetPrefix();
326}
327
328void
329TailoredSet::addContractions(UChar32 c, const UChar *p) {
330    UCharsTrie::Iterator suffixes(p, 0, errorCode);
331    while(suffixes.next(errorCode)) {
332        addSuffix(c, suffixes.getString());
333    }
334}
335
336void
337TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) {
338    tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx));
339}
340
341void
342TailoredSet::add(UChar32 c) {
343    if(unreversedPrefix.isEmpty() && suffix == NULL) {
344        tailored->add(c);
345    } else {
346        UnicodeString s(unreversedPrefix);
347        s.append(c);
348        if(suffix != NULL) {
349            s.append(*suffix);
350        }
351        tailored->add(s);
352    }
353}
354
355ContractionsAndExpansions::CESink::~CESink() {}
356
357U_CDECL_BEGIN
358
359static UBool U_CALLCONV
360enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
361    ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context;
362    if(cne->checkTailored == 0) {
363        // There is no tailoring.
364        // No need to collect nor check the tailored set.
365    } else if(cne->checkTailored < 0) {
366        // Collect the set of code points with mappings in the tailoring data.
367        if(ce32 == Collation::FALLBACK_CE32) {
368            return TRUE;  // fallback to base, not tailored
369        } else {
370            cne->tailored.add(start, end);
371        }
372        // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
373    } else if(start == end) {
374        if(cne->tailored.contains(start)) {
375            return TRUE;
376        }
377    } else if(cne->tailored.containsSome(start, end)) {
378        cne->ranges.set(start, end).removeAll(cne->tailored);
379        int32_t count = cne->ranges.getRangeCount();
380        for(int32_t i = 0; i < count; ++i) {
381            cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32);
382        }
383        return U_SUCCESS(cne->errorCode);
384    }
385    cne->handleCE32(start, end, ce32);
386    return U_SUCCESS(cne->errorCode);
387}
388
389U_CDECL_END
390
391void
392ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) {
393    if(U_FAILURE(ec)) { return; }
394    errorCode = ec;  // Preserve info & warning codes.
395    // Add all from the data, can be tailoring or base.
396    if(d->base != NULL) {
397        checkTailored = -1;
398    }
399    data = d;
400    utrie2_enum(data->trie, NULL, enumCnERange, this);
401    if(d->base == NULL || U_FAILURE(errorCode)) {
402        ec = errorCode;
403        return;
404    }
405    // Add all from the base data but only for un-tailored code points.
406    tailored.freeze();
407    checkTailored = 1;
408    data = d->base;
409    utrie2_enum(data->trie, NULL, enumCnERange, this);
410    ec = errorCode;
411}
412
413void
414ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) {
415    if(U_FAILURE(ec)) { return; }
416    errorCode = ec;  // Preserve info & warning codes.
417    uint32_t ce32 = d->getCE32(c);
418    if(ce32 == Collation::FALLBACK_CE32) {
419        d = d->base;
420        ce32 = d->getCE32(c);
421    }
422    data = d;
423    handleCE32(c, c, ce32);
424    ec = errorCode;
425}
426
427void
428ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
429    for(;;) {
430        if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) {
431            // !isSpecialCE32()
432            if(sink != NULL) {
433                sink->handleCE(Collation::ceFromSimpleCE32(ce32));
434            }
435            return;
436        }
437        switch(Collation::tagFromCE32(ce32)) {
438        case Collation::FALLBACK_TAG:
439            return;
440        case Collation::RESERVED_TAG_3:
441        case Collation::BUILDER_DATA_TAG:
442        case Collation::LEAD_SURROGATE_TAG:
443            if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
444            return;
445        case Collation::LONG_PRIMARY_TAG:
446            if(sink != NULL) {
447                sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32));
448            }
449            return;
450        case Collation::LONG_SECONDARY_TAG:
451            if(sink != NULL) {
452                sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32));
453            }
454            return;
455        case Collation::LATIN_EXPANSION_TAG:
456            if(sink != NULL) {
457                ces[0] = Collation::latinCE0FromCE32(ce32);
458                ces[1] = Collation::latinCE1FromCE32(ce32);
459                sink->handleExpansion(ces, 2);
460            }
461            // Optimization: If we have a prefix,
462            // then the relevant strings have been added already.
463            if(unreversedPrefix.isEmpty()) {
464                addExpansions(start, end);
465            }
466            return;
467        case Collation::EXPANSION32_TAG:
468            if(sink != NULL) {
469                const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
470                int32_t length = Collation::lengthFromCE32(ce32);
471                for(int32_t i = 0; i < length; ++i) {
472                    ces[i] = Collation::ceFromCE32(*ce32s++);
473                }
474                sink->handleExpansion(ces, length);
475            }
476            // Optimization: If we have a prefix,
477            // then the relevant strings have been added already.
478            if(unreversedPrefix.isEmpty()) {
479                addExpansions(start, end);
480            }
481            return;
482        case Collation::EXPANSION_TAG:
483            if(sink != NULL) {
484                int32_t length = Collation::lengthFromCE32(ce32);
485                sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length);
486            }
487            // Optimization: If we have a prefix,
488            // then the relevant strings have been added already.
489            if(unreversedPrefix.isEmpty()) {
490                addExpansions(start, end);
491            }
492            return;
493        case Collation::PREFIX_TAG:
494            handlePrefixes(start, end, ce32);
495            return;
496        case Collation::CONTRACTION_TAG:
497            handleContractions(start, end, ce32);
498            return;
499        case Collation::DIGIT_TAG:
500            // Fetch the non-numeric-collation CE32 and continue.
501            ce32 = data->ce32s[Collation::indexFromCE32(ce32)];
502            break;
503        case Collation::U0000_TAG:
504            U_ASSERT(start == 0 && end == 0);
505            // Fetch the normal ce32 for U+0000 and continue.
506            ce32 = data->ce32s[0];
507            break;
508        case Collation::HANGUL_TAG:
509            if(sink != NULL) {
510                // TODO: This should be optimized,
511                // especially if [start..end] is the complete Hangul range. (assert that)
512                UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL);
513                UChar hangul[1] = { 0 };
514                for(UChar32 c = start; c <= end; ++c) {
515                    hangul[0] = (UChar)c;
516                    iter.setText(hangul, hangul + 1);
517                    int32_t length = iter.fetchCEs(errorCode);
518                    if(U_FAILURE(errorCode)) { return; }
519                    // Ignore the terminating non-CE.
520                    U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE);
521                    sink->handleExpansion(iter.getCEs(), length - 1);
522                }
523            }
524            // Optimization: If we have a prefix,
525            // then the relevant strings have been added already.
526            if(unreversedPrefix.isEmpty()) {
527                addExpansions(start, end);
528            }
529            return;
530        case Collation::OFFSET_TAG:
531            // Currently no need to send offset CEs to the sink.
532            return;
533        case Collation::IMPLICIT_TAG:
534            // Currently no need to send implicit CEs to the sink.
535            return;
536        }
537    }
538}
539
540void
541ContractionsAndExpansions::handlePrefixes(
542        UChar32 start, UChar32 end, uint32_t ce32) {
543    const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
544    ce32 = CollationData::readCE32(p);  // Default if no prefix match.
545    handleCE32(start, end, ce32);
546    if(!addPrefixes) { return; }
547    UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
548    while(prefixes.next(errorCode)) {
549        setPrefix(prefixes.getString());
550        // Prefix/pre-context mappings are special kinds of contractions
551        // that always yield expansions.
552        addStrings(start, end, contractions);
553        addStrings(start, end, expansions);
554        handleCE32(start, end, (uint32_t)prefixes.getValue());
555    }
556    resetPrefix();
557}
558
559void
560ContractionsAndExpansions::handleContractions(
561        UChar32 start, UChar32 end, uint32_t ce32) {
562    const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
563    if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
564        // No match on the single code point.
565        // We are underneath a prefix, and the default mapping is just
566        // a fallback to the mappings for a shorter prefix.
567        U_ASSERT(!unreversedPrefix.isEmpty());
568    } else {
569        ce32 = CollationData::readCE32(p);  // Default if no suffix match.
570        U_ASSERT(!Collation::isContractionCE32(ce32));
571        handleCE32(start, end, ce32);
572    }
573    UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
574    while(suffixes.next(errorCode)) {
575        suffix = &suffixes.getString();
576        addStrings(start, end, contractions);
577        if(!unreversedPrefix.isEmpty()) {
578            addStrings(start, end, expansions);
579        }
580        handleCE32(start, end, (uint32_t)suffixes.getValue());
581    }
582    suffix = NULL;
583}
584
585void
586ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) {
587    if(unreversedPrefix.isEmpty() && suffix == NULL) {
588        if(expansions != NULL) {
589            expansions->add(start, end);
590        }
591    } else {
592        addStrings(start, end, expansions);
593    }
594}
595
596void
597ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) {
598    if(set == NULL) { return; }
599    UnicodeString s(unreversedPrefix);
600    do {
601        s.append(start);
602        if(suffix != NULL) {
603            s.append(*suffix);
604        }
605        set->add(s);
606        s.truncate(unreversedPrefix.length());
607    } while(++start <= end);
608}
609
610U_NAMESPACE_END
611
612#endif  // !UCONFIG_NO_COLLATION
613