CollationCompare.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * CollationCompare.java, ported from collationcompare.h/.cpp 7 * 8 * C++ version created on: 2012feb14 with new and old collation code 9 * created by: Markus W. Scherer 10 */ 11 12package com.ibm.icu.impl.coll; 13 14import com.ibm.icu.text.Collator; 15 16public final class CollationCompare /* all static */ { 17 public static int compareUpToQuaternary(CollationIterator left, CollationIterator right, 18 CollationSettings settings) { 19 int options = settings.options; 20 long variableTop; 21 if ((options & CollationSettings.ALTERNATE_MASK) == 0) { 22 variableTop = 0; 23 } else { 24 // +1 so that we can use "<" and primary ignorables test out early. 25 variableTop = settings.variableTop + 1; 26 } 27 boolean anyVariable = false; 28 29 // Fetch CEs, compare primaries, store secondary & tertiary weights. 30 for (;;) { 31 // We fetch CEs until we get a non-ignorable primary or reach the end. 32 long leftPrimary; 33 do { 34 long ce = left.nextCE(); 35 leftPrimary = ce >>> 32; 36 if (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY) { 37 // Variable CE, shift it to quaternary level. 38 // Ignore all following primary ignorables, and shift further variable CEs. 39 anyVariable = true; 40 do { 41 // Store only the primary of the variable CE. 42 left.setCurrentCE(ce & 0xffffffff00000000L); 43 for (;;) { 44 ce = left.nextCE(); 45 leftPrimary = ce >>> 32; 46 if (leftPrimary == 0) { 47 left.setCurrentCE(0); 48 } else { 49 break; 50 } 51 } 52 } while (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY); 53 } 54 } while (leftPrimary == 0); 55 56 long rightPrimary; 57 do { 58 long ce = right.nextCE(); 59 rightPrimary = ce >>> 32; 60 if (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY) { 61 // Variable CE, shift it to quaternary level. 62 // Ignore all following primary ignorables, and shift further variable CEs. 63 anyVariable = true; 64 do { 65 // Store only the primary of the variable CE. 66 right.setCurrentCE(ce & 0xffffffff00000000L); 67 for (;;) { 68 ce = right.nextCE(); 69 rightPrimary = ce >>> 32; 70 if (rightPrimary == 0) { 71 right.setCurrentCE(0); 72 } else { 73 break; 74 } 75 } 76 } while (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY); 77 } 78 } while (rightPrimary == 0); 79 80 if (leftPrimary != rightPrimary) { 81 // Return the primary difference, with script reordering. 82 byte[] reorderTable = settings.reorderTable; 83 if (reorderTable != null) { 84 leftPrimary = Collation.reorder(reorderTable, leftPrimary); 85 rightPrimary = Collation.reorder(reorderTable, rightPrimary); 86 } 87 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER; 88 } 89 if (leftPrimary == Collation.NO_CE_PRIMARY) { 90 break; 91 } 92 } 93 94 // Compare the buffered secondary & tertiary weights. 95 // We might skip the secondary level but continue with the case level 96 // which is turned on separately. 97 if (CollationSettings.getStrength(options) >= Collator.SECONDARY) { 98 if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) { 99 int leftIndex = 0; 100 int rightIndex = 0; 101 for (;;) { 102 int leftSecondary; 103 do { 104 leftSecondary = ((int) left.getCE(leftIndex++)) >>> 16; 105 } while (leftSecondary == 0); 106 107 int rightSecondary; 108 do { 109 rightSecondary = ((int) right.getCE(rightIndex++)) >>> 16; 110 } while (rightSecondary == 0); 111 112 if (leftSecondary != rightSecondary) { 113 return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER; 114 } 115 if (leftSecondary == Collation.NO_CE_WEIGHT16) { 116 break; 117 } 118 } 119 } else { 120 // The backwards secondary level compares secondary weights backwards 121 // within segments separated by the merge separator (U+FFFE, weight 02). 122 int leftStart = 0; 123 int rightStart = 0; 124 for (;;) { 125 // Find the merge separator or the NO_CE terminator. 126 int leftLimit = leftStart; 127 long leftLower32; 128 while ((leftLower32 = left.getCE(leftLimit) & 0xffffffffL) > Collation.MERGE_SEPARATOR_LOWER32 129 || leftLower32 == 0) { 130 ++leftLimit; 131 } 132 int rightLimit = rightStart; 133 long rightLower32; 134 while ((rightLower32 = right.getCE(rightLimit) & 0xffffffffL) > Collation.MERGE_SEPARATOR_LOWER32 135 || rightLower32 == 0) { 136 ++rightLimit; 137 } 138 139 // Compare the segments. 140 int leftIndex = leftLimit; 141 int rightIndex = rightLimit; 142 for (;;) { 143 int leftSecondary = 0; 144 while (leftSecondary == 0 && leftIndex > leftStart) { 145 leftSecondary = ((int) left.getCE(--leftIndex)) >>> 16; 146 } 147 148 int rightSecondary = 0; 149 while (rightSecondary == 0 && rightIndex > rightStart) { 150 rightSecondary = ((int) right.getCE(--rightIndex)) >>> 16; 151 } 152 153 if (leftSecondary != rightSecondary) { 154 return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER; 155 } 156 if (leftSecondary == 0) { 157 break; 158 } 159 } 160 161 // Did we reach the end of either string? 162 // Both strings have the same number of merge separators, 163 // or else there would have been a primary-level difference. 164 assert (left.getCE(leftLimit) == right.getCE(rightLimit)); 165 if (left.getCE(leftLimit) == Collation.NO_CE) { 166 break; 167 } 168 // Skip both merge separators and continue. 169 leftStart = leftLimit + 1; 170 rightStart = rightLimit + 1; 171 } 172 } 173 } 174 175 if ((options & CollationSettings.CASE_LEVEL) != 0) { 176 int strength = CollationSettings.getStrength(options); 177 int leftIndex = 0; 178 int rightIndex = 0; 179 for (;;) { 180 int leftCase, leftLower32, rightCase; 181 if (strength == Collator.PRIMARY) { 182 // Primary+caseLevel: Ignore case level weights of primary ignorables. 183 // Otherwise we would get a-umlaut > a 184 // which is not desirable for accent-insensitive sorting. 185 // Check for (lower 32 bits) == 0 as well because variable CEs are stored 186 // with only primary weights. 187 long ce; 188 do { 189 ce = left.getCE(leftIndex++); 190 leftCase = (int) ce; 191 } while ((ce >>> 32) == 0 || leftCase == 0); 192 leftLower32 = leftCase; 193 leftCase &= 0xc000; 194 195 do { 196 ce = right.getCE(rightIndex++); 197 rightCase = (int) ce; 198 } while ((ce >>> 32) == 0 || rightCase == 0); 199 rightCase &= 0xc000; 200 } else { 201 // Secondary+caseLevel: By analogy with the above, 202 // ignore case level weights of secondary ignorables. 203 // 204 // Note: A tertiary CE has uppercase case bits (0.0.ut) 205 // to keep tertiary+caseFirst well-formed. 206 // 207 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. 208 // Otherwise a tertiary CE's uppercase would be no greater than 209 // a primary/secondary CE's uppercase. 210 // (See UCA well-formedness condition 2.) 211 // We could construct a special case weight higher than uppercase, 212 // but it's simpler to always ignore case weights of secondary ignorables, 213 // turning 0.0.ut into 0.0.0.t. 214 // (See LDML Collation, Case Parameters.) 215 do { 216 leftCase = (int) left.getCE(leftIndex++); 217 } while ((leftCase & 0xffff0000) == 0); 218 leftLower32 = leftCase; 219 leftCase &= 0xc000; 220 221 do { 222 rightCase = (int) right.getCE(rightIndex++); 223 } while ((rightCase & 0xffff0000) == 0); 224 rightCase &= 0xc000; 225 } 226 227 // No need to handle NO_CE and MERGE_SEPARATOR specially: 228 // There is one case weight for each previous-level weight, 229 // so level length differences were handled there. 230 if (leftCase != rightCase) { 231 if ((options & CollationSettings.UPPER_FIRST) == 0) { 232 return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER; 233 } else { 234 return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS; 235 } 236 } 237 if ((leftLower32 >>> 16) == Collation.NO_CE_WEIGHT16) { 238 break; 239 } 240 } 241 } 242 if (CollationSettings.getStrength(options) <= Collator.SECONDARY) { 243 return Collation.EQUAL; 244 } 245 246 int tertiaryMask = CollationSettings.getTertiaryMask(options); 247 248 int leftIndex = 0; 249 int rightIndex = 0; 250 int anyQuaternaries = 0; 251 for (;;) { 252 int leftLower32, leftTertiary; 253 do { 254 leftLower32 = (int) left.getCE(leftIndex++); 255 anyQuaternaries |= leftLower32; 256 assert ((leftLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (leftLower32 & 0xc0c0) == 0); 257 leftTertiary = leftLower32 & tertiaryMask; 258 } while (leftTertiary == 0); 259 260 int rightLower32, rightTertiary; 261 do { 262 rightLower32 = (int) right.getCE(rightIndex++); 263 anyQuaternaries |= rightLower32; 264 assert ((rightLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (rightLower32 & 0xc0c0) == 0); 265 rightTertiary = rightLower32 & tertiaryMask; 266 } while (rightTertiary == 0); 267 268 if (leftTertiary != rightTertiary) { 269 if (CollationSettings.sortsTertiaryUpperCaseFirst(options)) { 270 // Pass through NO_CE and MERGE_SEPARATOR 271 // and keep real tertiary weights larger than the MERGE_SEPARATOR. 272 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), 273 // to keep tertiary CEs well-formed. 274 // Their case+tertiary weights must be greater than those of 275 // primary and secondary CEs. 276 if (leftTertiary > Collation.MERGE_SEPARATOR_WEIGHT16) { 277 if ((leftLower32 & 0xffff0000) != 0) { 278 leftTertiary ^= 0xc000; 279 } else { 280 leftTertiary += 0x4000; 281 } 282 } 283 if (rightTertiary > Collation.MERGE_SEPARATOR_WEIGHT16) { 284 if ((rightLower32 & 0xffff0000) != 0) { 285 rightTertiary ^= 0xc000; 286 } else { 287 rightTertiary += 0x4000; 288 } 289 } 290 } 291 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER; 292 } 293 if (leftTertiary == Collation.NO_CE_WEIGHT16) { 294 break; 295 } 296 } 297 if (CollationSettings.getStrength(options) <= Collator.TERTIARY) { 298 return Collation.EQUAL; 299 } 300 301 if (!anyVariable && (anyQuaternaries & 0xc0) == 0) { 302 // If there are no "variable" CEs and no non-zero quaternary weights, 303 // then there are no quaternary differences. 304 return Collation.EQUAL; 305 } 306 307 leftIndex = 0; 308 rightIndex = 0; 309 for (;;) { 310 long leftQuaternary; 311 do { 312 long ce = left.getCE(leftIndex++); 313 leftQuaternary = ce & 0xffff; 314 if (leftQuaternary == 0) { 315 // Variable primary or completely ignorable. 316 leftQuaternary = ce >>> 32; 317 } else if (leftQuaternary <= Collation.MERGE_SEPARATOR_WEIGHT16) { 318 // Leave NO_CE or MERGE_SEPARATOR as is. 319 } else { 320 // Regular CE, not tertiary ignorable. 321 // Preserve the quaternary weight in bits 7..6. 322 leftQuaternary |= 0xffffff3fL; 323 } 324 } while (leftQuaternary == 0); 325 326 long rightQuaternary; 327 do { 328 long ce = right.getCE(rightIndex++); 329 rightQuaternary = ce & 0xffff; 330 if (rightQuaternary == 0) { 331 // Variable primary or completely ignorable. 332 rightQuaternary = ce >>> 32; 333 } else if (rightQuaternary <= Collation.MERGE_SEPARATOR_WEIGHT16) { 334 // Leave NO_CE or MERGE_SEPARATOR as is. 335 } else { 336 // Regular CE, not tertiary ignorable. 337 // Preserve the quaternary weight in bits 7..6. 338 rightQuaternary |= 0xffffff3fL; 339 } 340 } while (rightQuaternary == 0); 341 342 if (leftQuaternary != rightQuaternary) { 343 // Return the difference, with script reordering. 344 byte[] reorderTable = settings.reorderTable; 345 if (reorderTable != null) { 346 leftQuaternary = Collation.reorder(reorderTable, leftQuaternary); 347 rightQuaternary = Collation.reorder(reorderTable, rightQuaternary); 348 } 349 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER; 350 } 351 if (leftQuaternary == Collation.NO_CE_WEIGHT16) { 352 break; 353 } 354 } 355 return Collation.EQUAL; 356 } 357} 358