1/*
2*******************************************************************************
3* Copyright (C) 1996-2015, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationcompare.cpp
7*
8* created on: 2012feb14 with new and old collation code
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/ucol.h"
17#include "cmemory.h"
18#include "collation.h"
19#include "collationcompare.h"
20#include "collationiterator.h"
21#include "collationsettings.h"
22#include "uassert.h"
23
24U_NAMESPACE_BEGIN
25
26UCollationResult
27CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
28                                        const CollationSettings &settings,
29                                        UErrorCode &errorCode) {
30    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
31
32    int32_t options = settings.options;
33    uint32_t variableTop;
34    if((options & CollationSettings::ALTERNATE_MASK) == 0) {
35        variableTop = 0;
36    } else {
37        // +1 so that we can use "<" and primary ignorables test out early.
38        variableTop = settings.variableTop + 1;
39    }
40    UBool anyVariable = FALSE;
41
42    // Fetch CEs, compare primaries, store secondary & tertiary weights.
43    for(;;) {
44        // We fetch CEs until we get a non-ignorable primary or reach the end.
45        uint32_t leftPrimary;
46        do {
47            int64_t ce = left.nextCE(errorCode);
48            leftPrimary = (uint32_t)(ce >> 32);
49            if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
50                // Variable CE, shift it to quaternary level.
51                // Ignore all following primary ignorables, and shift further variable CEs.
52                anyVariable = TRUE;
53                do {
54                    // Store only the primary of the variable CE.
55                    left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
56                    for(;;) {
57                        ce = left.nextCE(errorCode);
58                        leftPrimary = (uint32_t)(ce >> 32);
59                        if(leftPrimary == 0) {
60                            left.setCurrentCE(0);
61                        } else {
62                            break;
63                        }
64                    }
65                } while(leftPrimary < variableTop &&
66                        leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
67            }
68        } while(leftPrimary == 0);
69
70        uint32_t rightPrimary;
71        do {
72            int64_t ce = right.nextCE(errorCode);
73            rightPrimary = (uint32_t)(ce >> 32);
74            if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
75                // Variable CE, shift it to quaternary level.
76                // Ignore all following primary ignorables, and shift further variable CEs.
77                anyVariable = TRUE;
78                do {
79                    // Store only the primary of the variable CE.
80                    right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
81                    for(;;) {
82                        ce = right.nextCE(errorCode);
83                        rightPrimary = (uint32_t)(ce >> 32);
84                        if(rightPrimary == 0) {
85                            right.setCurrentCE(0);
86                        } else {
87                            break;
88                        }
89                    }
90                } while(rightPrimary < variableTop &&
91                        rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
92            }
93        } while(rightPrimary == 0);
94
95        if(leftPrimary != rightPrimary) {
96            // Return the primary difference, with script reordering.
97            if(settings.hasReordering()) {
98                leftPrimary = settings.reorder(leftPrimary);
99                rightPrimary = settings.reorder(rightPrimary);
100            }
101            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
102        }
103        if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
104    }
105    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
106
107    // Compare the buffered secondary & tertiary weights.
108    // We might skip the secondary level but continue with the case level
109    // which is turned on separately.
110    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
111        if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
112            int32_t leftIndex = 0;
113            int32_t rightIndex = 0;
114            for(;;) {
115                uint32_t leftSecondary;
116                do {
117                    leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
118                } while(leftSecondary == 0);
119
120                uint32_t rightSecondary;
121                do {
122                    rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
123                } while(rightSecondary == 0);
124
125                if(leftSecondary != rightSecondary) {
126                    return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
127                }
128                if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
129            }
130        } else {
131            // The backwards secondary level compares secondary weights backwards
132            // within segments separated by the merge separator (U+FFFE, weight 02).
133            int32_t leftStart = 0;
134            int32_t rightStart = 0;
135            for(;;) {
136                // Find the merge separator or the NO_CE terminator.
137                uint32_t p;
138                int32_t leftLimit = leftStart;
139                while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
140                            Collation::MERGE_SEPARATOR_PRIMARY ||
141                        p == 0) {
142                    ++leftLimit;
143                }
144                int32_t rightLimit = rightStart;
145                while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
146                            Collation::MERGE_SEPARATOR_PRIMARY ||
147                        p == 0) {
148                    ++rightLimit;
149                }
150
151                // Compare the segments.
152                int32_t leftIndex = leftLimit;
153                int32_t rightIndex = rightLimit;
154                for(;;) {
155                    int32_t leftSecondary = 0;
156                    while(leftSecondary == 0 && leftIndex > leftStart) {
157                        leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
158                    }
159
160                    int32_t rightSecondary = 0;
161                    while(rightSecondary == 0 && rightIndex > rightStart) {
162                        rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
163                    }
164
165                    if(leftSecondary != rightSecondary) {
166                        return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
167                    }
168                    if(leftSecondary == 0) { break; }
169                }
170
171                // Did we reach the end of either string?
172                // Both strings have the same number of merge separators,
173                // or else there would have been a primary-level difference.
174                U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
175                if(p == Collation::NO_CE_PRIMARY) { break; }
176                // Skip both merge separators and continue.
177                leftStart = leftLimit + 1;
178                rightStart = rightLimit + 1;
179            }
180        }
181    }
182
183    if((options & CollationSettings::CASE_LEVEL) != 0) {
184        int32_t strength = CollationSettings::getStrength(options);
185        int32_t leftIndex = 0;
186        int32_t rightIndex = 0;
187        for(;;) {
188            uint32_t leftCase, leftLower32, rightCase;
189            if(strength == UCOL_PRIMARY) {
190                // Primary+caseLevel: Ignore case level weights of primary ignorables.
191                // Otherwise we would get a-umlaut > a
192                // which is not desirable for accent-insensitive sorting.
193                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
194                // with only primary weights.
195                int64_t ce;
196                do {
197                    ce = left.getCE(leftIndex++);
198                    leftCase = (uint32_t)ce;
199                } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
200                leftLower32 = leftCase;
201                leftCase &= 0xc000;
202
203                do {
204                    ce = right.getCE(rightIndex++);
205                    rightCase = (uint32_t)ce;
206                } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
207                rightCase &= 0xc000;
208            } else {
209                // Secondary+caseLevel: By analogy with the above,
210                // ignore case level weights of secondary ignorables.
211                //
212                // Note: A tertiary CE has uppercase case bits (0.0.ut)
213                // to keep tertiary+caseFirst well-formed.
214                //
215                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
216                // Otherwise a tertiary CE's uppercase would be no greater than
217                // a primary/secondary CE's uppercase.
218                // (See UCA well-formedness condition 2.)
219                // We could construct a special case weight higher than uppercase,
220                // but it's simpler to always ignore case weights of secondary ignorables,
221                // turning 0.0.ut into 0.0.0.t.
222                // (See LDML Collation, Case Parameters.)
223                do {
224                    leftCase = (uint32_t)left.getCE(leftIndex++);
225                } while(leftCase <= 0xffff);
226                leftLower32 = leftCase;
227                leftCase &= 0xc000;
228
229                do {
230                    rightCase = (uint32_t)right.getCE(rightIndex++);
231                } while(rightCase <= 0xffff);
232                rightCase &= 0xc000;
233            }
234
235            // No need to handle NO_CE and MERGE_SEPARATOR specially:
236            // There is one case weight for each previous-level weight,
237            // so level length differences were handled there.
238            if(leftCase != rightCase) {
239                if((options & CollationSettings::UPPER_FIRST) == 0) {
240                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
241                } else {
242                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
243                }
244            }
245            if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
246        }
247    }
248    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
249
250    uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
251
252    int32_t leftIndex = 0;
253    int32_t rightIndex = 0;
254    uint32_t anyQuaternaries = 0;
255    for(;;) {
256        uint32_t leftLower32, leftTertiary;
257        do {
258            leftLower32 = (uint32_t)left.getCE(leftIndex++);
259            anyQuaternaries |= leftLower32;
260            U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
261                     (leftLower32 & 0xc0c0) == 0);
262            leftTertiary = leftLower32 & tertiaryMask;
263        } while(leftTertiary == 0);
264
265        uint32_t rightLower32, rightTertiary;
266        do {
267            rightLower32 = (uint32_t)right.getCE(rightIndex++);
268            anyQuaternaries |= rightLower32;
269            U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
270                     (rightLower32 & 0xc0c0) == 0);
271            rightTertiary = rightLower32 & tertiaryMask;
272        } while(rightTertiary == 0);
273
274        if(leftTertiary != rightTertiary) {
275            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
276                // Pass through NO_CE and keep real tertiary weights larger than that.
277                // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
278                // to keep tertiary CEs well-formed.
279                // Their case+tertiary weights must be greater than those of
280                // primary and secondary CEs.
281                if(leftTertiary > Collation::NO_CE_WEIGHT16) {
282                    if(leftLower32 > 0xffff) {
283                        leftTertiary ^= 0xc000;
284                    } else {
285                        leftTertiary += 0x4000;
286                    }
287                }
288                if(rightTertiary > Collation::NO_CE_WEIGHT16) {
289                    if(rightLower32 > 0xffff) {
290                        rightTertiary ^= 0xc000;
291                    } else {
292                        rightTertiary += 0x4000;
293                    }
294                }
295            }
296            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
297        }
298        if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
299    }
300    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
301
302    if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
303        // If there are no "variable" CEs and no non-zero quaternary weights,
304        // then there are no quaternary differences.
305        return UCOL_EQUAL;
306    }
307
308    leftIndex = 0;
309    rightIndex = 0;
310    for(;;) {
311        uint32_t leftQuaternary;
312        do {
313            int64_t ce = left.getCE(leftIndex++);
314            leftQuaternary = (uint32_t)ce & 0xffff;
315            if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
316                // Variable primary or completely ignorable or NO_CE.
317                leftQuaternary = (uint32_t)(ce >> 32);
318            } else {
319                // Regular CE, not tertiary ignorable.
320                // Preserve the quaternary weight in bits 7..6.
321                leftQuaternary |= 0xffffff3f;
322            }
323        } while(leftQuaternary == 0);
324
325        uint32_t rightQuaternary;
326        do {
327            int64_t ce = right.getCE(rightIndex++);
328            rightQuaternary = (uint32_t)ce & 0xffff;
329            if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
330                // Variable primary or completely ignorable or NO_CE.
331                rightQuaternary = (uint32_t)(ce >> 32);
332            } else {
333                // Regular CE, not tertiary ignorable.
334                // Preserve the quaternary weight in bits 7..6.
335                rightQuaternary |= 0xffffff3f;
336            }
337        } while(rightQuaternary == 0);
338
339        if(leftQuaternary != rightQuaternary) {
340            // Return the difference, with script reordering.
341            if(settings.hasReordering()) {
342                leftQuaternary = settings.reorder(leftQuaternary);
343                rightQuaternary = settings.reorder(rightQuaternary);
344            }
345            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
346        }
347        if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
348    }
349    return UCOL_EQUAL;
350}
351
352U_NAMESPACE_END
353
354#endif  // !UCONFIG_NO_COLLATION
355