1/* 2******************************************************************************* 3* Copyright (C) 2012-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp 7* 8* C++ version created on: 2012sep23 (from utf16collationiterator.h) 9* created by: Markus W. Scherer 10*/ 11 12package com.ibm.icu.impl.coll; 13 14import com.ibm.icu.impl.Normalizer2Impl; 15import com.ibm.icu.text.UCharacterIterator; 16 17/** 18 * Incrementally checks the input text for FCD and normalizes where necessary. 19 */ 20public final class FCDIterCollationIterator extends IterCollationIterator { 21 public FCDIterCollationIterator(CollationData data, boolean numeric, 22 UCharacterIterator ui, int startIndex) { 23 super(data, numeric, ui); 24 state = State.ITER_CHECK_FWD; 25 start = startIndex; 26 nfcImpl = data.nfcImpl; 27 } 28 29 @Override 30 public void resetToOffset(int newOffset) { 31 super.resetToOffset(newOffset); 32 start = newOffset; 33 state = State.ITER_CHECK_FWD; 34 } 35 36 @Override 37 public int getOffset() { 38 if(state.compareTo(State.ITER_CHECK_BWD) <= 0) { 39 return iter.getIndex(); 40 } else if(state == State.ITER_IN_FCD_SEGMENT) { 41 return pos; 42 } else if(pos == 0) { 43 return start; 44 } else { 45 return limit; 46 } 47 } 48 49 @Override 50 public int nextCodePoint() { 51 int c; 52 for(;;) { 53 if(state == State.ITER_CHECK_FWD) { 54 c = iter.next(); 55 if(c < 0) { 56 return c; 57 } 58 if(CollationFCD.hasTccc(c)) { 59 if(CollationFCD.maybeTibetanCompositeVowel(c) || 60 CollationFCD.hasLccc(iter.current())) { 61 iter.previous(); 62 if(!nextSegment()) { 63 return Collation.SENTINEL_CP; 64 } 65 continue; 66 } 67 } 68 if(isLeadSurrogate(c)) { 69 int trail = iter.next(); 70 if(isTrailSurrogate(trail)) { 71 return Character.toCodePoint((char)c, (char)trail); 72 } else if(trail >= 0) { 73 iter.previous(); 74 } 75 } 76 return c; 77 } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) { 78 c = iter.nextCodePoint(); 79 pos += Character.charCount(c); 80 assert(c >= 0); 81 return c; 82 } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && 83 pos != normalized.length()) { 84 c = normalized.codePointAt(pos); 85 pos += Character.charCount(c); 86 return c; 87 } else { 88 switchToForward(); 89 } 90 } 91 } 92 93 @Override 94 public int previousCodePoint() { 95 int c; 96 for(;;) { 97 if(state == State.ITER_CHECK_BWD) { 98 c = iter.previous(); 99 if(c < 0) { 100 start = pos = 0; 101 state = State.ITER_IN_FCD_SEGMENT; 102 return Collation.SENTINEL_CP; 103 } 104 if(CollationFCD.hasLccc(c)) { 105 int prev = Collation.SENTINEL_CP; 106 if(CollationFCD.maybeTibetanCompositeVowel(c) || 107 CollationFCD.hasTccc(prev = iter.previous())) { 108 iter.next(); 109 if(prev >= 0) { 110 iter.next(); 111 } 112 if(!previousSegment()) { 113 return Collation.SENTINEL_CP; 114 } 115 continue; 116 } 117 // hasLccc(trail)=true for all trail surrogates 118 if(isTrailSurrogate(c)) { 119 if(prev < 0) { 120 prev = iter.previous(); 121 } 122 if(isLeadSurrogate(prev)) { 123 return Character.toCodePoint((char)prev, (char)c); 124 } 125 } 126 if(prev >= 0) { 127 iter.next(); 128 } 129 } 130 return c; 131 } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) { 132 c = iter.previousCodePoint(); 133 pos -= Character.charCount(c); 134 assert(c >= 0); 135 return c; 136 } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) { 137 c = normalized.codePointBefore(pos); 138 pos -= Character.charCount(c); 139 return c; 140 } else { 141 switchToBackward(); 142 } 143 } 144 } 145 146 @Override 147 protected long handleNextCE32() { 148 int c; 149 for(;;) { 150 if(state == State.ITER_CHECK_FWD) { 151 c = iter.next(); 152 if(c < 0) { 153 return NO_CP_AND_CE32; 154 } 155 if(CollationFCD.hasTccc(c)) { 156 if(CollationFCD.maybeTibetanCompositeVowel(c) || 157 CollationFCD.hasLccc(iter.current())) { 158 iter.previous(); 159 if(!nextSegment()) { 160 c = Collation.SENTINEL_CP; 161 return Collation.FALLBACK_CE32; 162 } 163 continue; 164 } 165 } 166 break; 167 } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) { 168 c = iter.next(); 169 ++pos; 170 assert(c >= 0); 171 break; 172 } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && 173 pos != normalized.length()) { 174 c = normalized.charAt(pos++); 175 break; 176 } else { 177 switchToForward(); 178 } 179 } 180 return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c)); 181 } 182 183 @Override 184 protected char handleGetTrailSurrogate() { 185 if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) { 186 int trail = iter.next(); 187 if(isTrailSurrogate(trail)) { 188 if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; } 189 } else if(trail >= 0) { 190 iter.previous(); 191 } 192 return (char)trail; 193 } else { 194 assert(pos < normalized.length()); 195 char trail; 196 if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; } 197 return trail; 198 } 199 } 200 201 @Override 202 protected void forwardNumCodePoints(int num) { 203 // Specify the class to avoid a virtual-function indirection. 204 // In Java, we would declare this class final. 205 while(num > 0 && nextCodePoint() >= 0) { 206 --num; 207 } 208 } 209 210 @Override 211 protected void backwardNumCodePoints(int num) { 212 // Specify the class to avoid a virtual-function indirection. 213 // In Java, we would declare this class final. 214 while(num > 0 && previousCodePoint() >= 0) { 215 --num; 216 } 217 } 218 219 /** 220 * Switches to forward checking if possible. 221 */ 222 private void switchToForward() { 223 assert(state == State.ITER_CHECK_BWD || 224 (state == State.ITER_IN_FCD_SEGMENT && pos == limit) || 225 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length())); 226 if(state == State.ITER_CHECK_BWD) { 227 // Turn around from backward checking. 228 start = pos = iter.getIndex(); 229 if(pos == limit) { 230 state = State.ITER_CHECK_FWD; // Check forward. 231 } else { // pos < limit 232 state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment. 233 } 234 } else { 235 // Reached the end of the FCD segment. 236 if(state == State.ITER_IN_FCD_SEGMENT) { 237 // The input text segment is FCD, extend it forward. 238 } else { 239 // The input text segment needed to be normalized. 240 // Switch to checking forward from it. 241 if(state == State.IN_NORM_ITER_AT_START) { 242 iter.moveIndex(limit - start); 243 } 244 start = limit; 245 } 246 state = State.ITER_CHECK_FWD; 247 } 248 } 249 250 /** 251 * Extends the FCD text segment forward or normalizes around pos. 252 * @return true if success 253 */ 254 private boolean nextSegment() { 255 assert(state == State.ITER_CHECK_FWD); 256 // The input text [start..(iter index)[ passes the FCD check. 257 pos = iter.getIndex(); 258 // Collect the characters being checked, in case they need to be normalized. 259 if(s == null) { 260 s = new StringBuilder(); 261 } else { 262 s.setLength(0); 263 } 264 int prevCC = 0; 265 for(;;) { 266 // Fetch the next character and its fcd16 value. 267 int c = iter.nextCodePoint(); 268 if(c < 0) { break; } 269 int fcd16 = nfcImpl.getFCD16(c); 270 int leadCC = fcd16 >> 8; 271 if(leadCC == 0 && s.length() != 0) { 272 // FCD boundary before this character. 273 iter.previousCodePoint(); 274 break; 275 } 276 s.appendCodePoint(c); 277 if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 278 // Fails FCD check. Find the next FCD boundary and normalize. 279 for(;;) { 280 c = iter.nextCodePoint(); 281 if(c < 0) { break; } 282 if(nfcImpl.getFCD16(c) <= 0xff) { 283 iter.previousCodePoint(); 284 break; 285 } 286 s.appendCodePoint(c); 287 } 288 normalize(s); 289 start = pos; 290 limit = pos + s.length(); 291 state = State.IN_NORM_ITER_AT_LIMIT; 292 pos = 0; 293 return true; 294 } 295 prevCC = fcd16 & 0xff; 296 if(prevCC == 0) { 297 // FCD boundary after the last character. 298 break; 299 } 300 } 301 limit = pos + s.length(); 302 assert(pos != limit); 303 iter.moveIndex(-s.length()); 304 state = State.ITER_IN_FCD_SEGMENT; 305 return true; 306 } 307 308 /** 309 * Switches to backward checking. 310 */ 311 private void switchToBackward() { 312 assert(state == State.ITER_CHECK_FWD || 313 (state == State.ITER_IN_FCD_SEGMENT && pos == start) || 314 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0)); 315 if(state == State.ITER_CHECK_FWD) { 316 // Turn around from forward checking. 317 limit = pos = iter.getIndex(); 318 if(pos == start) { 319 state = State.ITER_CHECK_BWD; // Check backward. 320 } else { // pos > start 321 state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment. 322 } 323 } else { 324 // Reached the start of the FCD segment. 325 if(state == State.ITER_IN_FCD_SEGMENT) { 326 // The input text segment is FCD, extend it backward. 327 } else { 328 // The input text segment needed to be normalized. 329 // Switch to checking backward from it. 330 if(state == State.IN_NORM_ITER_AT_LIMIT) { 331 iter.moveIndex(start - limit); 332 } 333 limit = start; 334 } 335 state = State.ITER_CHECK_BWD; 336 } 337 } 338 339 /** 340 * Extends the FCD text segment backward or normalizes around pos. 341 * @return true if success 342 */ 343 private boolean previousSegment() { 344 assert(state == State.ITER_CHECK_BWD); 345 // The input text [(iter index)..limit[ passes the FCD check. 346 pos = iter.getIndex(); 347 // Collect the characters being checked, in case they need to be normalized. 348 if(s == null) { 349 s = new StringBuilder(); 350 } else { 351 s.setLength(0); 352 } 353 int nextCC = 0; 354 for(;;) { 355 // Fetch the previous character and its fcd16 value. 356 int c = iter.previousCodePoint(); 357 if(c < 0) { break; } 358 int fcd16 = nfcImpl.getFCD16(c); 359 int trailCC = fcd16 & 0xff; 360 if(trailCC == 0 && s.length() != 0) { 361 // FCD boundary after this character. 362 iter.nextCodePoint(); 363 break; 364 } 365 s.appendCodePoint(c); 366 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 367 CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 368 // Fails FCD check. Find the previous FCD boundary and normalize. 369 while(fcd16 > 0xff) { 370 c = iter.previousCodePoint(); 371 if(c < 0) { break; } 372 fcd16 = nfcImpl.getFCD16(c); 373 if(fcd16 == 0) { 374 iter.nextCodePoint(); 375 break; 376 } 377 s.appendCodePoint(c); 378 } 379 s.reverse(); 380 normalize(s); 381 limit = pos; 382 start = pos - s.length(); 383 state = State.IN_NORM_ITER_AT_START; 384 pos = normalized.length(); 385 return true; 386 } 387 nextCC = fcd16 >> 8; 388 if(nextCC == 0) { 389 // FCD boundary before the following character. 390 break; 391 } 392 } 393 start = pos - s.length(); 394 assert(pos != start); 395 iter.moveIndex(s.length()); 396 state = State.ITER_IN_FCD_SEGMENT; 397 return true; 398 } 399 400 private void normalize(CharSequence s) { 401 if(normalized == null) { 402 normalized = new StringBuilder(); 403 } 404 // NFD without argument checking. 405 nfcImpl.decompose(s, normalized); 406 } 407 408 private enum State { 409 /** 410 * The input text [start..(iter index)[ passes the FCD check. 411 * Moving forward checks incrementally. 412 * pos & limit are undefined. 413 */ 414 ITER_CHECK_FWD, 415 /** 416 * The input text [(iter index)..limit[ passes the FCD check. 417 * Moving backward checks incrementally. 418 * start & pos are undefined. 419 */ 420 ITER_CHECK_BWD, 421 /** 422 * The input text [start..limit[ passes the FCD check. 423 * pos tracks the current text index. 424 */ 425 ITER_IN_FCD_SEGMENT, 426 /** 427 * The input text [start..limit[ failed the FCD check and was normalized. 428 * pos tracks the current index in the normalized string. 429 * The text iterator is at the limit index. 430 */ 431 IN_NORM_ITER_AT_LIMIT, 432 /** 433 * The input text [start..limit[ failed the FCD check and was normalized. 434 * pos tracks the current index in the normalized string. 435 * The text iterator is at the start index. 436 */ 437 IN_NORM_ITER_AT_START 438 } 439 440 private State state; 441 442 private int start; 443 private int pos; 444 private int limit; 445 446 private final Normalizer2Impl nfcImpl; 447 private StringBuilder s; 448 private StringBuilder normalized; 449} 450