CollationCompare.java revision f716bda031dccdec5e47bb40e758c5901d209729
1/* 2 ******************************************************************************* 3 * Copyright (C) 1996-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * CollationCompare.java, ported from collationcompare.h/.cpp 7 * 8 * C++ version created on: 2012feb14 with new and old collation code 9 * created by: Markus W. Scherer 10 */ 11 12package com.ibm.icu.impl.coll; 13 14import com.ibm.icu.text.Collator; 15 16public final class CollationCompare /* all static */ { 17 public static int compareUpToQuaternary(CollationIterator left, CollationIterator right, 18 CollationSettings settings) { 19 int options = settings.options; 20 long variableTop; 21 if ((options & CollationSettings.ALTERNATE_MASK) == 0) { 22 variableTop = 0; 23 } else { 24 // +1 so that we can use "<" and primary ignorables test out early. 25 variableTop = settings.variableTop + 1; 26 } 27 boolean anyVariable = false; 28 29 // Fetch CEs, compare primaries, store secondary & tertiary weights. 30 for (;;) { 31 // We fetch CEs until we get a non-ignorable primary or reach the end. 32 long leftPrimary; 33 do { 34 long ce = left.nextCE(); 35 leftPrimary = ce >>> 32; 36 if (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY) { 37 // Variable CE, shift it to quaternary level. 38 // Ignore all following primary ignorables, and shift further variable CEs. 39 anyVariable = true; 40 do { 41 // Store only the primary of the variable CE. 42 left.setCurrentCE(ce & 0xffffffff00000000L); 43 for (;;) { 44 ce = left.nextCE(); 45 leftPrimary = ce >>> 32; 46 if (leftPrimary == 0) { 47 left.setCurrentCE(0); 48 } else { 49 break; 50 } 51 } 52 } while (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY); 53 } 54 } while (leftPrimary == 0); 55 56 long rightPrimary; 57 do { 58 long ce = right.nextCE(); 59 rightPrimary = ce >>> 32; 60 if (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY) { 61 // Variable CE, shift it to quaternary level. 62 // Ignore all following primary ignorables, and shift further variable CEs. 63 anyVariable = true; 64 do { 65 // Store only the primary of the variable CE. 66 right.setCurrentCE(ce & 0xffffffff00000000L); 67 for (;;) { 68 ce = right.nextCE(); 69 rightPrimary = ce >>> 32; 70 if (rightPrimary == 0) { 71 right.setCurrentCE(0); 72 } else { 73 break; 74 } 75 } 76 } while (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY); 77 } 78 } while (rightPrimary == 0); 79 80 if (leftPrimary != rightPrimary) { 81 // Return the primary difference, with script reordering. 82 if (settings.hasReordering()) { 83 leftPrimary = settings.reorder(leftPrimary); 84 rightPrimary = settings.reorder(rightPrimary); 85 } 86 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER; 87 } 88 if (leftPrimary == Collation.NO_CE_PRIMARY) { 89 break; 90 } 91 } 92 93 // Compare the buffered secondary & tertiary weights. 94 // We might skip the secondary level but continue with the case level 95 // which is turned on separately. 96 if (CollationSettings.getStrength(options) >= Collator.SECONDARY) { 97 if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) { 98 int leftIndex = 0; 99 int rightIndex = 0; 100 for (;;) { 101 int leftSecondary; 102 do { 103 leftSecondary = ((int) left.getCE(leftIndex++)) >>> 16; 104 } while (leftSecondary == 0); 105 106 int rightSecondary; 107 do { 108 rightSecondary = ((int) right.getCE(rightIndex++)) >>> 16; 109 } while (rightSecondary == 0); 110 111 if (leftSecondary != rightSecondary) { 112 return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER; 113 } 114 if (leftSecondary == Collation.NO_CE_WEIGHT16) { 115 break; 116 } 117 } 118 } else { 119 // The backwards secondary level compares secondary weights backwards 120 // within segments separated by the merge separator (U+FFFE, weight 02). 121 int leftStart = 0; 122 int rightStart = 0; 123 for (;;) { 124 // Find the merge separator or the NO_CE terminator. 125 long p; 126 int leftLimit = leftStart; 127 while ((p = left.getCE(leftLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY 128 || p == 0) { 129 ++leftLimit; 130 } 131 int rightLimit = rightStart; 132 while ((p = right.getCE(rightLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY 133 || p == 0) { 134 ++rightLimit; 135 } 136 137 // Compare the segments. 138 int leftIndex = leftLimit; 139 int rightIndex = rightLimit; 140 for (;;) { 141 int leftSecondary = 0; 142 while (leftSecondary == 0 && leftIndex > leftStart) { 143 leftSecondary = ((int) left.getCE(--leftIndex)) >>> 16; 144 } 145 146 int rightSecondary = 0; 147 while (rightSecondary == 0 && rightIndex > rightStart) { 148 rightSecondary = ((int) right.getCE(--rightIndex)) >>> 16; 149 } 150 151 if (leftSecondary != rightSecondary) { 152 return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER; 153 } 154 if (leftSecondary == 0) { 155 break; 156 } 157 } 158 159 // Did we reach the end of either string? 160 // Both strings have the same number of merge separators, 161 // or else there would have been a primary-level difference. 162 assert (left.getCE(leftLimit) == right.getCE(rightLimit)); 163 if (p == Collation.NO_CE_PRIMARY) { 164 break; 165 } 166 // Skip both merge separators and continue. 167 leftStart = leftLimit + 1; 168 rightStart = rightLimit + 1; 169 } 170 } 171 } 172 173 if ((options & CollationSettings.CASE_LEVEL) != 0) { 174 int strength = CollationSettings.getStrength(options); 175 int leftIndex = 0; 176 int rightIndex = 0; 177 for (;;) { 178 int leftCase, leftLower32, rightCase; 179 if (strength == Collator.PRIMARY) { 180 // Primary+caseLevel: Ignore case level weights of primary ignorables. 181 // Otherwise we would get a-umlaut > a 182 // which is not desirable for accent-insensitive sorting. 183 // Check for (lower 32 bits) == 0 as well because variable CEs are stored 184 // with only primary weights. 185 long ce; 186 do { 187 ce = left.getCE(leftIndex++); 188 leftCase = (int) ce; 189 } while ((ce >>> 32) == 0 || leftCase == 0); 190 leftLower32 = leftCase; 191 leftCase &= 0xc000; 192 193 do { 194 ce = right.getCE(rightIndex++); 195 rightCase = (int) ce; 196 } while ((ce >>> 32) == 0 || rightCase == 0); 197 rightCase &= 0xc000; 198 } else { 199 // Secondary+caseLevel: By analogy with the above, 200 // ignore case level weights of secondary ignorables. 201 // 202 // Note: A tertiary CE has uppercase case bits (0.0.ut) 203 // to keep tertiary+caseFirst well-formed. 204 // 205 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. 206 // Otherwise a tertiary CE's uppercase would be no greater than 207 // a primary/secondary CE's uppercase. 208 // (See UCA well-formedness condition 2.) 209 // We could construct a special case weight higher than uppercase, 210 // but it's simpler to always ignore case weights of secondary ignorables, 211 // turning 0.0.ut into 0.0.0.t. 212 // (See LDML Collation, Case Parameters.) 213 do { 214 leftCase = (int) left.getCE(leftIndex++); 215 } while ((leftCase & 0xffff0000) == 0); 216 leftLower32 = leftCase; 217 leftCase &= 0xc000; 218 219 do { 220 rightCase = (int) right.getCE(rightIndex++); 221 } while ((rightCase & 0xffff0000) == 0); 222 rightCase &= 0xc000; 223 } 224 225 // No need to handle NO_CE and MERGE_SEPARATOR specially: 226 // There is one case weight for each previous-level weight, 227 // so level length differences were handled there. 228 if (leftCase != rightCase) { 229 if ((options & CollationSettings.UPPER_FIRST) == 0) { 230 return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER; 231 } else { 232 return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS; 233 } 234 } 235 if ((leftLower32 >>> 16) == Collation.NO_CE_WEIGHT16) { 236 break; 237 } 238 } 239 } 240 if (CollationSettings.getStrength(options) <= Collator.SECONDARY) { 241 return Collation.EQUAL; 242 } 243 244 int tertiaryMask = CollationSettings.getTertiaryMask(options); 245 246 int leftIndex = 0; 247 int rightIndex = 0; 248 int anyQuaternaries = 0; 249 for (;;) { 250 int leftLower32, leftTertiary; 251 do { 252 leftLower32 = (int) left.getCE(leftIndex++); 253 anyQuaternaries |= leftLower32; 254 assert ((leftLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (leftLower32 & 0xc0c0) == 0); 255 leftTertiary = leftLower32 & tertiaryMask; 256 } while (leftTertiary == 0); 257 258 int rightLower32, rightTertiary; 259 do { 260 rightLower32 = (int) right.getCE(rightIndex++); 261 anyQuaternaries |= rightLower32; 262 assert ((rightLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (rightLower32 & 0xc0c0) == 0); 263 rightTertiary = rightLower32 & tertiaryMask; 264 } while (rightTertiary == 0); 265 266 if (leftTertiary != rightTertiary) { 267 if (CollationSettings.sortsTertiaryUpperCaseFirst(options)) { 268 // Pass through NO_CE and keep real tertiary weights larger than that. 269 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), 270 // to keep tertiary CEs well-formed. 271 // Their case+tertiary weights must be greater than those of 272 // primary and secondary CEs. 273 if (leftTertiary > Collation.NO_CE_WEIGHT16) { 274 if ((leftLower32 & 0xffff0000) != 0) { 275 leftTertiary ^= 0xc000; 276 } else { 277 leftTertiary += 0x4000; 278 } 279 } 280 if (rightTertiary > Collation.NO_CE_WEIGHT16) { 281 if ((rightLower32 & 0xffff0000) != 0) { 282 rightTertiary ^= 0xc000; 283 } else { 284 rightTertiary += 0x4000; 285 } 286 } 287 } 288 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER; 289 } 290 if (leftTertiary == Collation.NO_CE_WEIGHT16) { 291 break; 292 } 293 } 294 if (CollationSettings.getStrength(options) <= Collator.TERTIARY) { 295 return Collation.EQUAL; 296 } 297 298 if (!anyVariable && (anyQuaternaries & 0xc0) == 0) { 299 // If there are no "variable" CEs and no non-zero quaternary weights, 300 // then there are no quaternary differences. 301 return Collation.EQUAL; 302 } 303 304 leftIndex = 0; 305 rightIndex = 0; 306 for (;;) { 307 long leftQuaternary; 308 do { 309 long ce = left.getCE(leftIndex++); 310 leftQuaternary = ce & 0xffff; 311 if (leftQuaternary <= Collation.NO_CE_WEIGHT16) { 312 // Variable primary or completely ignorable or NO_CE. 313 leftQuaternary = ce >>> 32; 314 } else { 315 // Regular CE, not tertiary ignorable. 316 // Preserve the quaternary weight in bits 7..6. 317 leftQuaternary |= 0xffffff3fL; 318 } 319 } while (leftQuaternary == 0); 320 321 long rightQuaternary; 322 do { 323 long ce = right.getCE(rightIndex++); 324 rightQuaternary = ce & 0xffff; 325 if (rightQuaternary <= Collation.NO_CE_WEIGHT16) { 326 // Variable primary or completely ignorable or NO_CE. 327 rightQuaternary = ce >>> 32; 328 } else { 329 // Regular CE, not tertiary ignorable. 330 // Preserve the quaternary weight in bits 7..6. 331 rightQuaternary |= 0xffffff3fL; 332 } 333 } while (rightQuaternary == 0); 334 335 if (leftQuaternary != rightQuaternary) { 336 // Return the difference, with script reordering. 337 if (settings.hasReordering()) { 338 leftQuaternary = settings.reorder(leftQuaternary); 339 rightQuaternary = settings.reorder(rightQuaternary); 340 } 341 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER; 342 } 343 if (leftQuaternary == Collation.NO_CE_PRIMARY) { 344 break; 345 } 346 } 347 return Collation.EQUAL; 348 } 349} 350