CollationCompare.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/*
2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * CollationCompare.java, ported from collationcompare.h/.cpp
7 *
8 * C++ version created on: 2012feb14 with new and old collation code
9 * created by: Markus W. Scherer
10 */
11
12package com.ibm.icu.impl.coll;
13
14import com.ibm.icu.text.Collator;
15
16public final class CollationCompare /* all static */ {
17    public static int compareUpToQuaternary(CollationIterator left, CollationIterator right,
18            CollationSettings settings) {
19        int options = settings.options;
20        long variableTop;
21        if ((options & CollationSettings.ALTERNATE_MASK) == 0) {
22            variableTop = 0;
23        } else {
24            // +1 so that we can use "<" and primary ignorables test out early.
25            variableTop = settings.variableTop + 1;
26        }
27        boolean anyVariable = false;
28
29        // Fetch CEs, compare primaries, store secondary & tertiary weights.
30        for (;;) {
31            // We fetch CEs until we get a non-ignorable primary or reach the end.
32            long leftPrimary;
33            do {
34                long ce = left.nextCE();
35                leftPrimary = ce >>> 32;
36                if (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
37                    // Variable CE, shift it to quaternary level.
38                    // Ignore all following primary ignorables, and shift further variable CEs.
39                    anyVariable = true;
40                    do {
41                        // Store only the primary of the variable CE.
42                        left.setCurrentCE(ce & 0xffffffff00000000L);
43                        for (;;) {
44                            ce = left.nextCE();
45                            leftPrimary = ce >>> 32;
46                            if (leftPrimary == 0) {
47                                left.setCurrentCE(0);
48                            } else {
49                                break;
50                            }
51                        }
52                    } while (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
53                }
54            } while (leftPrimary == 0);
55
56            long rightPrimary;
57            do {
58                long ce = right.nextCE();
59                rightPrimary = ce >>> 32;
60                if (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
61                    // Variable CE, shift it to quaternary level.
62                    // Ignore all following primary ignorables, and shift further variable CEs.
63                    anyVariable = true;
64                    do {
65                        // Store only the primary of the variable CE.
66                        right.setCurrentCE(ce & 0xffffffff00000000L);
67                        for (;;) {
68                            ce = right.nextCE();
69                            rightPrimary = ce >>> 32;
70                            if (rightPrimary == 0) {
71                                right.setCurrentCE(0);
72                            } else {
73                                break;
74                            }
75                        }
76                    } while (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
77                }
78            } while (rightPrimary == 0);
79
80            if (leftPrimary != rightPrimary) {
81                // Return the primary difference, with script reordering.
82                byte[] reorderTable = settings.reorderTable;
83                if (reorderTable != null) {
84                    leftPrimary = Collation.reorder(reorderTable, leftPrimary);
85                    rightPrimary = Collation.reorder(reorderTable, rightPrimary);
86                }
87                return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
88            }
89            if (leftPrimary == Collation.NO_CE_PRIMARY) {
90                break;
91            }
92        }
93
94        // Compare the buffered secondary & tertiary weights.
95        // We might skip the secondary level but continue with the case level
96        // which is turned on separately.
97        if (CollationSettings.getStrength(options) >= Collator.SECONDARY) {
98            if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) {
99                int leftIndex = 0;
100                int rightIndex = 0;
101                for (;;) {
102                    int leftSecondary;
103                    do {
104                        leftSecondary = ((int) left.getCE(leftIndex++)) >>> 16;
105                    } while (leftSecondary == 0);
106
107                    int rightSecondary;
108                    do {
109                        rightSecondary = ((int) right.getCE(rightIndex++)) >>> 16;
110                    } while (rightSecondary == 0);
111
112                    if (leftSecondary != rightSecondary) {
113                        return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
114                    }
115                    if (leftSecondary == Collation.NO_CE_WEIGHT16) {
116                        break;
117                    }
118                }
119            } else {
120                // The backwards secondary level compares secondary weights backwards
121                // within segments separated by the merge separator (U+FFFE, weight 02).
122                int leftStart = 0;
123                int rightStart = 0;
124                for (;;) {
125                    // Find the merge separator or the NO_CE terminator.
126                    int leftLimit = leftStart;
127                    long leftLower32;
128                    while ((leftLower32 = left.getCE(leftLimit) & 0xffffffffL) > Collation.MERGE_SEPARATOR_LOWER32
129                            || leftLower32 == 0) {
130                        ++leftLimit;
131                    }
132                    int rightLimit = rightStart;
133                    long rightLower32;
134                    while ((rightLower32 = right.getCE(rightLimit) & 0xffffffffL) > Collation.MERGE_SEPARATOR_LOWER32
135                            || rightLower32 == 0) {
136                        ++rightLimit;
137                    }
138
139                    // Compare the segments.
140                    int leftIndex = leftLimit;
141                    int rightIndex = rightLimit;
142                    for (;;) {
143                        int leftSecondary = 0;
144                        while (leftSecondary == 0 && leftIndex > leftStart) {
145                            leftSecondary = ((int) left.getCE(--leftIndex)) >>> 16;
146                        }
147
148                        int rightSecondary = 0;
149                        while (rightSecondary == 0 && rightIndex > rightStart) {
150                            rightSecondary = ((int) right.getCE(--rightIndex)) >>> 16;
151                        }
152
153                        if (leftSecondary != rightSecondary) {
154                            return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
155                        }
156                        if (leftSecondary == 0) {
157                            break;
158                        }
159                    }
160
161                    // Did we reach the end of either string?
162                    // Both strings have the same number of merge separators,
163                    // or else there would have been a primary-level difference.
164                    assert (left.getCE(leftLimit) == right.getCE(rightLimit));
165                    if (left.getCE(leftLimit) == Collation.NO_CE) {
166                        break;
167                    }
168                    // Skip both merge separators and continue.
169                    leftStart = leftLimit + 1;
170                    rightStart = rightLimit + 1;
171                }
172            }
173        }
174
175        if ((options & CollationSettings.CASE_LEVEL) != 0) {
176            int strength = CollationSettings.getStrength(options);
177            int leftIndex = 0;
178            int rightIndex = 0;
179            for (;;) {
180                int leftCase, leftLower32, rightCase;
181                if (strength == Collator.PRIMARY) {
182                    // Primary+caseLevel: Ignore case level weights of primary ignorables.
183                    // Otherwise we would get a-umlaut > a
184                    // which is not desirable for accent-insensitive sorting.
185                    // Check for (lower 32 bits) == 0 as well because variable CEs are stored
186                    // with only primary weights.
187                    long ce;
188                    do {
189                        ce = left.getCE(leftIndex++);
190                        leftCase = (int) ce;
191                    } while ((ce >>> 32) == 0 || leftCase == 0);
192                    leftLower32 = leftCase;
193                    leftCase &= 0xc000;
194
195                    do {
196                        ce = right.getCE(rightIndex++);
197                        rightCase = (int) ce;
198                    } while ((ce >>> 32) == 0 || rightCase == 0);
199                    rightCase &= 0xc000;
200                } else {
201                    // Secondary+caseLevel: By analogy with the above,
202                    // ignore case level weights of secondary ignorables.
203                    //
204                    // Note: A tertiary CE has uppercase case bits (0.0.ut)
205                    // to keep tertiary+caseFirst well-formed.
206                    //
207                    // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
208                    // Otherwise a tertiary CE's uppercase would be no greater than
209                    // a primary/secondary CE's uppercase.
210                    // (See UCA well-formedness condition 2.)
211                    // We could construct a special case weight higher than uppercase,
212                    // but it's simpler to always ignore case weights of secondary ignorables,
213                    // turning 0.0.ut into 0.0.0.t.
214                    // (See LDML Collation, Case Parameters.)
215                    do {
216                        leftCase = (int) left.getCE(leftIndex++);
217                    } while ((leftCase & 0xffff0000) == 0);
218                    leftLower32 = leftCase;
219                    leftCase &= 0xc000;
220
221                    do {
222                        rightCase = (int) right.getCE(rightIndex++);
223                    } while ((rightCase & 0xffff0000) == 0);
224                    rightCase &= 0xc000;
225                }
226
227                // No need to handle NO_CE and MERGE_SEPARATOR specially:
228                // There is one case weight for each previous-level weight,
229                // so level length differences were handled there.
230                if (leftCase != rightCase) {
231                    if ((options & CollationSettings.UPPER_FIRST) == 0) {
232                        return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
233                    } else {
234                        return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
235                    }
236                }
237                if ((leftLower32 >>> 16) == Collation.NO_CE_WEIGHT16) {
238                    break;
239                }
240            }
241        }
242        if (CollationSettings.getStrength(options) <= Collator.SECONDARY) {
243            return Collation.EQUAL;
244        }
245
246        int tertiaryMask = CollationSettings.getTertiaryMask(options);
247
248        int leftIndex = 0;
249        int rightIndex = 0;
250        int anyQuaternaries = 0;
251        for (;;) {
252            int leftLower32, leftTertiary;
253            do {
254                leftLower32 = (int) left.getCE(leftIndex++);
255                anyQuaternaries |= leftLower32;
256                assert ((leftLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (leftLower32 & 0xc0c0) == 0);
257                leftTertiary = leftLower32 & tertiaryMask;
258            } while (leftTertiary == 0);
259
260            int rightLower32, rightTertiary;
261            do {
262                rightLower32 = (int) right.getCE(rightIndex++);
263                anyQuaternaries |= rightLower32;
264                assert ((rightLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (rightLower32 & 0xc0c0) == 0);
265                rightTertiary = rightLower32 & tertiaryMask;
266            } while (rightTertiary == 0);
267
268            if (leftTertiary != rightTertiary) {
269                if (CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
270                    // Pass through NO_CE and MERGE_SEPARATOR
271                    // and keep real tertiary weights larger than the MERGE_SEPARATOR.
272                    // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
273                    // to keep tertiary CEs well-formed.
274                    // Their case+tertiary weights must be greater than those of
275                    // primary and secondary CEs.
276                    if (leftTertiary > Collation.MERGE_SEPARATOR_WEIGHT16) {
277                        if ((leftLower32 & 0xffff0000) != 0) {
278                            leftTertiary ^= 0xc000;
279                        } else {
280                            leftTertiary += 0x4000;
281                        }
282                    }
283                    if (rightTertiary > Collation.MERGE_SEPARATOR_WEIGHT16) {
284                        if ((rightLower32 & 0xffff0000) != 0) {
285                            rightTertiary ^= 0xc000;
286                        } else {
287                            rightTertiary += 0x4000;
288                        }
289                    }
290                }
291                return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
292            }
293            if (leftTertiary == Collation.NO_CE_WEIGHT16) {
294                break;
295            }
296        }
297        if (CollationSettings.getStrength(options) <= Collator.TERTIARY) {
298            return Collation.EQUAL;
299        }
300
301        if (!anyVariable && (anyQuaternaries & 0xc0) == 0) {
302            // If there are no "variable" CEs and no non-zero quaternary weights,
303            // then there are no quaternary differences.
304            return Collation.EQUAL;
305        }
306
307        leftIndex = 0;
308        rightIndex = 0;
309        for (;;) {
310            long leftQuaternary;
311            do {
312                long ce = left.getCE(leftIndex++);
313                leftQuaternary = ce & 0xffff;
314                if (leftQuaternary == 0) {
315                    // Variable primary or completely ignorable.
316                    leftQuaternary = ce >>> 32;
317                } else if (leftQuaternary <= Collation.MERGE_SEPARATOR_WEIGHT16) {
318                    // Leave NO_CE or MERGE_SEPARATOR as is.
319                } else {
320                    // Regular CE, not tertiary ignorable.
321                    // Preserve the quaternary weight in bits 7..6.
322                    leftQuaternary |= 0xffffff3fL;
323                }
324            } while (leftQuaternary == 0);
325
326            long rightQuaternary;
327            do {
328                long ce = right.getCE(rightIndex++);
329                rightQuaternary = ce & 0xffff;
330                if (rightQuaternary == 0) {
331                    // Variable primary or completely ignorable.
332                    rightQuaternary = ce >>> 32;
333                } else if (rightQuaternary <= Collation.MERGE_SEPARATOR_WEIGHT16) {
334                    // Leave NO_CE or MERGE_SEPARATOR as is.
335                } else {
336                    // Regular CE, not tertiary ignorable.
337                    // Preserve the quaternary weight in bits 7..6.
338                    rightQuaternary |= 0xffffff3fL;
339                }
340            } while (rightQuaternary == 0);
341
342            if (leftQuaternary != rightQuaternary) {
343                // Return the difference, with script reordering.
344                byte[] reorderTable = settings.reorderTable;
345                if (reorderTable != null) {
346                    leftQuaternary = Collation.reorder(reorderTable, leftQuaternary);
347                    rightQuaternary = Collation.reorder(reorderTable, rightQuaternary);
348                }
349                return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
350            }
351            if (leftQuaternary == Collation.NO_CE_WEIGHT16) {
352                break;
353            }
354        }
355        return Collation.EQUAL;
356    }
357}
358