CollationCompare.java revision f716bda031dccdec5e47bb40e758c5901d209729
1/*
2 *******************************************************************************
3 * Copyright (C) 1996-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * CollationCompare.java, ported from collationcompare.h/.cpp
7 *
8 * C++ version created on: 2012feb14 with new and old collation code
9 * created by: Markus W. Scherer
10 */
11
12package com.ibm.icu.impl.coll;
13
14import com.ibm.icu.text.Collator;
15
16public final class CollationCompare /* all static */ {
17    public static int compareUpToQuaternary(CollationIterator left, CollationIterator right,
18            CollationSettings settings) {
19        int options = settings.options;
20        long variableTop;
21        if ((options & CollationSettings.ALTERNATE_MASK) == 0) {
22            variableTop = 0;
23        } else {
24            // +1 so that we can use "<" and primary ignorables test out early.
25            variableTop = settings.variableTop + 1;
26        }
27        boolean anyVariable = false;
28
29        // Fetch CEs, compare primaries, store secondary & tertiary weights.
30        for (;;) {
31            // We fetch CEs until we get a non-ignorable primary or reach the end.
32            long leftPrimary;
33            do {
34                long ce = left.nextCE();
35                leftPrimary = ce >>> 32;
36                if (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
37                    // Variable CE, shift it to quaternary level.
38                    // Ignore all following primary ignorables, and shift further variable CEs.
39                    anyVariable = true;
40                    do {
41                        // Store only the primary of the variable CE.
42                        left.setCurrentCE(ce & 0xffffffff00000000L);
43                        for (;;) {
44                            ce = left.nextCE();
45                            leftPrimary = ce >>> 32;
46                            if (leftPrimary == 0) {
47                                left.setCurrentCE(0);
48                            } else {
49                                break;
50                            }
51                        }
52                    } while (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
53                }
54            } while (leftPrimary == 0);
55
56            long rightPrimary;
57            do {
58                long ce = right.nextCE();
59                rightPrimary = ce >>> 32;
60                if (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
61                    // Variable CE, shift it to quaternary level.
62                    // Ignore all following primary ignorables, and shift further variable CEs.
63                    anyVariable = true;
64                    do {
65                        // Store only the primary of the variable CE.
66                        right.setCurrentCE(ce & 0xffffffff00000000L);
67                        for (;;) {
68                            ce = right.nextCE();
69                            rightPrimary = ce >>> 32;
70                            if (rightPrimary == 0) {
71                                right.setCurrentCE(0);
72                            } else {
73                                break;
74                            }
75                        }
76                    } while (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
77                }
78            } while (rightPrimary == 0);
79
80            if (leftPrimary != rightPrimary) {
81                // Return the primary difference, with script reordering.
82                if (settings.hasReordering()) {
83                    leftPrimary = settings.reorder(leftPrimary);
84                    rightPrimary = settings.reorder(rightPrimary);
85                }
86                return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
87            }
88            if (leftPrimary == Collation.NO_CE_PRIMARY) {
89                break;
90            }
91        }
92
93        // Compare the buffered secondary & tertiary weights.
94        // We might skip the secondary level but continue with the case level
95        // which is turned on separately.
96        if (CollationSettings.getStrength(options) >= Collator.SECONDARY) {
97            if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) {
98                int leftIndex = 0;
99                int rightIndex = 0;
100                for (;;) {
101                    int leftSecondary;
102                    do {
103                        leftSecondary = ((int) left.getCE(leftIndex++)) >>> 16;
104                    } while (leftSecondary == 0);
105
106                    int rightSecondary;
107                    do {
108                        rightSecondary = ((int) right.getCE(rightIndex++)) >>> 16;
109                    } while (rightSecondary == 0);
110
111                    if (leftSecondary != rightSecondary) {
112                        return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
113                    }
114                    if (leftSecondary == Collation.NO_CE_WEIGHT16) {
115                        break;
116                    }
117                }
118            } else {
119                // The backwards secondary level compares secondary weights backwards
120                // within segments separated by the merge separator (U+FFFE, weight 02).
121                int leftStart = 0;
122                int rightStart = 0;
123                for (;;) {
124                    // Find the merge separator or the NO_CE terminator.
125                    long p;
126                    int leftLimit = leftStart;
127                    while ((p = left.getCE(leftLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY
128                            || p == 0) {
129                        ++leftLimit;
130                    }
131                    int rightLimit = rightStart;
132                    while ((p = right.getCE(rightLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY
133                            || p == 0) {
134                        ++rightLimit;
135                    }
136
137                    // Compare the segments.
138                    int leftIndex = leftLimit;
139                    int rightIndex = rightLimit;
140                    for (;;) {
141                        int leftSecondary = 0;
142                        while (leftSecondary == 0 && leftIndex > leftStart) {
143                            leftSecondary = ((int) left.getCE(--leftIndex)) >>> 16;
144                        }
145
146                        int rightSecondary = 0;
147                        while (rightSecondary == 0 && rightIndex > rightStart) {
148                            rightSecondary = ((int) right.getCE(--rightIndex)) >>> 16;
149                        }
150
151                        if (leftSecondary != rightSecondary) {
152                            return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
153                        }
154                        if (leftSecondary == 0) {
155                            break;
156                        }
157                    }
158
159                    // Did we reach the end of either string?
160                    // Both strings have the same number of merge separators,
161                    // or else there would have been a primary-level difference.
162                    assert (left.getCE(leftLimit) == right.getCE(rightLimit));
163                    if (p == Collation.NO_CE_PRIMARY) {
164                        break;
165                    }
166                    // Skip both merge separators and continue.
167                    leftStart = leftLimit + 1;
168                    rightStart = rightLimit + 1;
169                }
170            }
171        }
172
173        if ((options & CollationSettings.CASE_LEVEL) != 0) {
174            int strength = CollationSettings.getStrength(options);
175            int leftIndex = 0;
176            int rightIndex = 0;
177            for (;;) {
178                int leftCase, leftLower32, rightCase;
179                if (strength == Collator.PRIMARY) {
180                    // Primary+caseLevel: Ignore case level weights of primary ignorables.
181                    // Otherwise we would get a-umlaut > a
182                    // which is not desirable for accent-insensitive sorting.
183                    // Check for (lower 32 bits) == 0 as well because variable CEs are stored
184                    // with only primary weights.
185                    long ce;
186                    do {
187                        ce = left.getCE(leftIndex++);
188                        leftCase = (int) ce;
189                    } while ((ce >>> 32) == 0 || leftCase == 0);
190                    leftLower32 = leftCase;
191                    leftCase &= 0xc000;
192
193                    do {
194                        ce = right.getCE(rightIndex++);
195                        rightCase = (int) ce;
196                    } while ((ce >>> 32) == 0 || rightCase == 0);
197                    rightCase &= 0xc000;
198                } else {
199                    // Secondary+caseLevel: By analogy with the above,
200                    // ignore case level weights of secondary ignorables.
201                    //
202                    // Note: A tertiary CE has uppercase case bits (0.0.ut)
203                    // to keep tertiary+caseFirst well-formed.
204                    //
205                    // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
206                    // Otherwise a tertiary CE's uppercase would be no greater than
207                    // a primary/secondary CE's uppercase.
208                    // (See UCA well-formedness condition 2.)
209                    // We could construct a special case weight higher than uppercase,
210                    // but it's simpler to always ignore case weights of secondary ignorables,
211                    // turning 0.0.ut into 0.0.0.t.
212                    // (See LDML Collation, Case Parameters.)
213                    do {
214                        leftCase = (int) left.getCE(leftIndex++);
215                    } while ((leftCase & 0xffff0000) == 0);
216                    leftLower32 = leftCase;
217                    leftCase &= 0xc000;
218
219                    do {
220                        rightCase = (int) right.getCE(rightIndex++);
221                    } while ((rightCase & 0xffff0000) == 0);
222                    rightCase &= 0xc000;
223                }
224
225                // No need to handle NO_CE and MERGE_SEPARATOR specially:
226                // There is one case weight for each previous-level weight,
227                // so level length differences were handled there.
228                if (leftCase != rightCase) {
229                    if ((options & CollationSettings.UPPER_FIRST) == 0) {
230                        return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
231                    } else {
232                        return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
233                    }
234                }
235                if ((leftLower32 >>> 16) == Collation.NO_CE_WEIGHT16) {
236                    break;
237                }
238            }
239        }
240        if (CollationSettings.getStrength(options) <= Collator.SECONDARY) {
241            return Collation.EQUAL;
242        }
243
244        int tertiaryMask = CollationSettings.getTertiaryMask(options);
245
246        int leftIndex = 0;
247        int rightIndex = 0;
248        int anyQuaternaries = 0;
249        for (;;) {
250            int leftLower32, leftTertiary;
251            do {
252                leftLower32 = (int) left.getCE(leftIndex++);
253                anyQuaternaries |= leftLower32;
254                assert ((leftLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (leftLower32 & 0xc0c0) == 0);
255                leftTertiary = leftLower32 & tertiaryMask;
256            } while (leftTertiary == 0);
257
258            int rightLower32, rightTertiary;
259            do {
260                rightLower32 = (int) right.getCE(rightIndex++);
261                anyQuaternaries |= rightLower32;
262                assert ((rightLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (rightLower32 & 0xc0c0) == 0);
263                rightTertiary = rightLower32 & tertiaryMask;
264            } while (rightTertiary == 0);
265
266            if (leftTertiary != rightTertiary) {
267                if (CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
268                    // Pass through NO_CE and keep real tertiary weights larger than that.
269                    // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
270                    // to keep tertiary CEs well-formed.
271                    // Their case+tertiary weights must be greater than those of
272                    // primary and secondary CEs.
273                    if (leftTertiary > Collation.NO_CE_WEIGHT16) {
274                        if ((leftLower32 & 0xffff0000) != 0) {
275                            leftTertiary ^= 0xc000;
276                        } else {
277                            leftTertiary += 0x4000;
278                        }
279                    }
280                    if (rightTertiary > Collation.NO_CE_WEIGHT16) {
281                        if ((rightLower32 & 0xffff0000) != 0) {
282                            rightTertiary ^= 0xc000;
283                        } else {
284                            rightTertiary += 0x4000;
285                        }
286                    }
287                }
288                return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
289            }
290            if (leftTertiary == Collation.NO_CE_WEIGHT16) {
291                break;
292            }
293        }
294        if (CollationSettings.getStrength(options) <= Collator.TERTIARY) {
295            return Collation.EQUAL;
296        }
297
298        if (!anyVariable && (anyQuaternaries & 0xc0) == 0) {
299            // If there are no "variable" CEs and no non-zero quaternary weights,
300            // then there are no quaternary differences.
301            return Collation.EQUAL;
302        }
303
304        leftIndex = 0;
305        rightIndex = 0;
306        for (;;) {
307            long leftQuaternary;
308            do {
309                long ce = left.getCE(leftIndex++);
310                leftQuaternary = ce & 0xffff;
311                if (leftQuaternary <= Collation.NO_CE_WEIGHT16) {
312                    // Variable primary or completely ignorable or NO_CE.
313                    leftQuaternary = ce >>> 32;
314                } else {
315                    // Regular CE, not tertiary ignorable.
316                    // Preserve the quaternary weight in bits 7..6.
317                    leftQuaternary |= 0xffffff3fL;
318                }
319            } while (leftQuaternary == 0);
320
321            long rightQuaternary;
322            do {
323                long ce = right.getCE(rightIndex++);
324                rightQuaternary = ce & 0xffff;
325                if (rightQuaternary <= Collation.NO_CE_WEIGHT16) {
326                    // Variable primary or completely ignorable or NO_CE.
327                    rightQuaternary = ce >>> 32;
328                } else {
329                    // Regular CE, not tertiary ignorable.
330                    // Preserve the quaternary weight in bits 7..6.
331                    rightQuaternary |= 0xffffff3fL;
332                }
333            } while (rightQuaternary == 0);
334
335            if (leftQuaternary != rightQuaternary) {
336                // Return the difference, with script reordering.
337                if (settings.hasReordering()) {
338                    leftQuaternary = settings.reorder(leftQuaternary);
339                    rightQuaternary = settings.reorder(rightQuaternary);
340                }
341                return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
342            }
343            if (leftQuaternary == Collation.NO_CE_PRIMARY) {
344                break;
345            }
346        }
347        return Collation.EQUAL;
348    }
349}
350