1/* 2 ******************************************************************************* 3 * Copyright (C) 2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7package com.ibm.icu.text; 8 9import com.ibm.icu.text.UnicodeSet.SpanCondition; 10import com.ibm.icu.util.OutputInt; 11 12/** 13 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches. 14 * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen. 15 * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with 16 * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on. 17 * For the inverse, the following mapping is used:</p> 18 * <ul> 19 * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> 20 * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> 21 * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li> 22 * </ul> 23 * These are actually not complete inverses. However, the alternating works because there are no gaps. 24 * For example, with [a{ab}{bc}], you get the following behavior when scanning forward: 25 * <p> 26 * <table border="1"> 27 * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr> 28 * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr> 29 * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr> 30 * </table> 31 * <p>So here is what happens when you alternate: 32 * <p> 33 * <table border="1"> 34 * <tr><th>start</th><td>|xxxabcyyy</td></tr> 35 * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr> 36 * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr> 37 * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr> 38 * </table> 39 * </p>The entire string is traversed. 40 * 41 * @draft ICU 54 42 * @provisional This is a draft API and might change in a future release of ICU. 43 */ 44public class UnicodeSetSpanner { 45 46 private final UnicodeSet unicodeSet; 47 48 /** 49 * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class 50 * can be used with a non-frozen version to avoid the cost of freezing. 51 * 52 * @param source 53 * the original UnicodeSet 54 * 55 * @draft ICU 54 56 * @provisional This is a draft API and might change in a future release of ICU. 57 */ 58 public UnicodeSetSpanner(UnicodeSet source) { 59 unicodeSet = source; 60 } 61 62 /** 63 * Returns the UnicodeSet used for processing. It is frozen iff the original was. 64 * 65 * @return the construction set. 66 * 67 * @draft ICU 54 68 * @provisional This is a draft API and might change in a future release of ICU. 69 */ 70 public UnicodeSet getUnicodeSet() { 71 return unicodeSet; 72 } 73 74 75 /** 76 * {@inheritDoc} 77 * 78 * @draft ICU 54 79 * @provisional This is a draft API and might change in a future release of ICU. 80 */ 81 @Override 82 public boolean equals(Object other) { 83 return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet); 84 } 85 86 /** 87 * {@inheritDoc} 88 * 89 * @draft ICU 54 90 * @provisional This is a draft API and might change in a future release of ICU. 91 */ 92 @Override 93 public int hashCode() { 94 return unicodeSet.hashCode(); 95 } 96 97 /** 98 * Options for replaceFrom and countIn to control how to treat each matched span. 99 * It is similar to whether one is replacing [abc] by x, or [abc]* by x. 100 * 101 * @draft ICU 54 102 * @provisional This is a draft API and might change in a future release of ICU. 103 */ 104 public enum CountMethod { 105 /** 106 * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate 107 * set elements. 108 * 109 * @draft ICU 54 110 * @provisional This is a draft API and might change in a future release of ICU. 111 */ 112 WHOLE_SPAN, 113 /** 114 * Use the smallest number of elements in the spanned range for counting and modification, 115 * based on the {@link UnicodeSet.SpanCondition}. 116 * If the set has no strings, this will be the same as the number of spanned code points. 117 * <p>For example, in the string "abab" with SpanCondition.SIMPLE: 118 * <ul> 119 * <li>spanning with [ab] will count four MIN_ELEMENTS.</li> 120 * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li> 121 * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li> 122 * </ul> 123 * 124 * @draft ICU 54 125 * @provisional This is a draft API and might change in a future release of ICU. 126 */ 127 MIN_ELEMENTS, 128 // Note: could in the future have an additional option MAX_ELEMENTS 129 } 130 131 /** 132 * Returns the number of matching characters found in a character sequence, 133 * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. 134 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 135 * @param sequence 136 * the sequence to count characters in 137 * @return the count. Zero if there are none. 138 * 139 * @draft ICU 54 140 * @provisional This is a draft API and might change in a future release of ICU. 141 */ 142 public int countIn(CharSequence sequence) { 143 return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); 144 } 145 146 /** 147 * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE. 148 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 149 * @param sequence 150 * the sequence to count characters in 151 * @param countMethod 152 * whether to treat an entire span as a match, or individual elements as matches 153 * @return the count. Zero if there are none. 154 * 155 * @draft ICU 54 156 * @provisional This is a draft API and might change in a future release of ICU. 157 */ 158 public int countIn(CharSequence sequence, CountMethod countMethod) { 159 return countIn(sequence, countMethod, SpanCondition.SIMPLE); 160 } 161 162 /** 163 * Returns the number of matching characters found in a character sequence. 164 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 165 * @param sequence 166 * the sequence to count characters in 167 * @param countMethod 168 * whether to treat an entire span as a match, or individual elements as matches 169 * @param spanCondition 170 * the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span; 171 * NOT_CONTAINED is the reverse. 172 * <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases. 173 * @return the count. Zero if there are none. 174 * 175 * @draft ICU 54 176 * @provisional This is a draft API and might change in a future release of ICU. 177 */ 178 public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) { 179 int count = 0; 180 int start = 0; 181 SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE 182 : SpanCondition.NOT_CONTAINED; 183 final int length = sequence.length(); 184 OutputInt spanCount = null; 185 while (start != length) { 186 int endOfSpan = unicodeSet.span(sequence, start, skipSpan); 187 if (endOfSpan == length) { 188 break; 189 } 190 if (countMethod == CountMethod.WHOLE_SPAN) { 191 start = unicodeSet.span(sequence, endOfSpan, spanCondition); 192 count += 1; 193 } else { 194 if (spanCount == null) { 195 spanCount = new OutputInt(); 196 } 197 start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount); 198 count += spanCount.value; 199 } 200 } 201 return count; 202 } 203 204 /** 205 * Delete all the matching spans in sequence, using SpanCondition.SIMPLE 206 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 207 * @param sequence 208 * charsequence to replace matching spans in. 209 * @return modified string. 210 * 211 * @draft ICU 54 212 * @provisional This is a draft API and might change in a future release of ICU. 213 */ 214 public String deleteFrom(CharSequence sequence) { 215 return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE); 216 } 217 218 /** 219 * Delete all matching spans in sequence, according to the spanCondition. 220 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 221 * @param sequence 222 * charsequence to replace matching spans in. 223 * @param spanCondition 224 * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED) 225 * @return modified string. 226 * 227 * @draft ICU 54 228 * @provisional This is a draft API and might change in a future release of ICU. 229 */ 230 public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) { 231 return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition); 232 } 233 234 /** 235 * Replace all matching spans in sequence by the replacement, 236 * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. 237 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 238 * @param sequence 239 * charsequence to replace matching spans in. 240 * @param replacement 241 * replacement sequence. To delete, use "" 242 * @return modified string. 243 * 244 * @draft ICU 54 245 * @provisional This is a draft API and might change in a future release of ICU. 246 */ 247 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 248 return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); 249 } 250 251 /** 252 * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE. 253 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 254 * 255 * @param sequence 256 * charsequence to replace matching spans in. 257 * @param replacement 258 * replacement sequence. To delete, use "" 259 * @param countMethod 260 * whether to treat an entire span as a match, or individual elements as matches 261 * @return modified string. 262 * 263 * @draft ICU 54 264 * @provisional This is a draft API and might change in a future release of ICU. 265 */ 266 public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) { 267 return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE); 268 } 269 270 /** 271 * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition. 272 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 273 * @param sequence 274 * charsequence to replace matching spans in. 275 * @param replacement 276 * replacement sequence. To delete, use "" 277 * @param countMethod 278 * whether to treat an entire span as a match, or individual elements as matches 279 * @param spanCondition 280 * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching 281 * (NOT_CONTAINED) 282 * @return modified string. 283 * 284 * @draft ICU 54 285 * @provisional This is a draft API and might change in a future release of ICU. 286 */ 287 public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, 288 SpanCondition spanCondition) { 289 SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE 290 : SpanCondition.NOT_CONTAINED; 291 final boolean remove = replacement.length() == 0; 292 StringBuilder result = new StringBuilder(); 293 // TODO, we can optimize this to 294 // avoid this allocation unless needed 295 296 final int length = sequence.length(); 297 OutputInt spanCount = null; 298 for (int endCopy = 0; endCopy != length;) { 299 int endModify; 300 if (countMethod == CountMethod.WHOLE_SPAN) { 301 endModify = unicodeSet.span(sequence, endCopy, spanCondition); 302 } else { 303 if (spanCount == null) { 304 spanCount = new OutputInt(); 305 } 306 endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount); 307 } 308 if (remove || endModify == 0) { 309 // do nothing 310 } else if (countMethod == CountMethod.WHOLE_SPAN) { 311 result.append(replacement); 312 } else { 313 for (int i = spanCount.value; i > 0; --i) { 314 result.append(replacement); 315 } 316 } 317 if (endModify == length) { 318 break; 319 } 320 endCopy = unicodeSet.span(sequence, endModify, copySpan); 321 result.append(sequence.subSequence(endModify, endCopy)); 322 } 323 return result.toString(); 324 } 325 326 /** 327 * Options for the trim() method 328 * 329 * @draft ICU 54 330 * @provisional This is a draft API and might change in a future release of ICU. 331 */ 332 public enum TrimOption { 333 /** 334 * Trim leading spans. 335 * 336 * @draft ICU 54 337 * @provisional This is a draft API and might change in a future release of ICU. 338 */ 339 LEADING, 340 /** 341 * Trim leading and trailing spans. 342 * 343 * @draft ICU 54 344 * @provisional This is a draft API and might change in a future release of ICU. 345 */ 346 BOTH, 347 /** 348 * Trim trailing spans. 349 * 350 * @draft ICU 54 351 * @provisional This is a draft API and might change in a future release of ICU. 352 */ 353 TRAILING; 354 } 355 356 /** 357 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and 358 * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example: 359 * 360 * <pre> 361 * {@code 362 * 363 * new UnicodeSet("[ab]").trim("abacatbab")} 364 * </pre> 365 * 366 * ... returns {@code "cat"}. 367 * @param sequence 368 * the sequence to trim 369 * @return a subsequence 370 * 371 * @draft ICU 54 372 * @provisional This is a draft API and might change in a future release of ICU. 373 */ 374 public CharSequence trim(CharSequence sequence) { 375 return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE); 376 } 377 378 /** 379 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or 380 * end of the string, using the trimOption and SpanCondition.SIMPLE. For example: 381 * 382 * <pre> 383 * {@code 384 * 385 * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)} 386 * </pre> 387 * 388 * ... returns {@code "catbab"}. 389 * 390 * @param sequence 391 * the sequence to trim 392 * @param trimOption 393 * LEADING, TRAILING, or BOTH 394 * @return a subsequence 395 * 396 * @draft ICU 54 397 * @provisional This is a draft API and might change in a future release of ICU. 398 */ 399 public CharSequence trim(CharSequence sequence, TrimOption trimOption) { 400 return trim(sequence, trimOption, SpanCondition.SIMPLE); 401 } 402 403 /** 404 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or 405 * end of the string, depending on the trimOption and spanCondition. For example: 406 * 407 * <pre> 408 * {@code 409 * 410 * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)} 411 * </pre> 412 * 413 * ... returns {@code "catbab"}. 414 * 415 * @param sequence 416 * the sequence to trim 417 * @param trimOption 418 * LEADING, TRAILING, or BOTH 419 * @param spanCondition 420 * SIMPLE, CONTAINED or NOT_CONTAINED 421 * @return a subsequence 422 * 423 * @draft ICU 54 424 * @provisional This is a draft API and might change in a future release of ICU. 425 */ 426 public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) { 427 int endLeadContained, startTrailContained; 428 final int length = sequence.length(); 429 if (trimOption != TrimOption.TRAILING) { 430 endLeadContained = unicodeSet.span(sequence, spanCondition); 431 if (endLeadContained == length) { 432 return ""; 433 } 434 } else { 435 endLeadContained = 0; 436 } 437 if (trimOption != TrimOption.LEADING) { 438 startTrailContained = unicodeSet.spanBack(sequence, spanCondition); 439 } else { 440 startTrailContained = length; 441 } 442 return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence( 443 endLeadContained, startTrailContained); 444 } 445 446} 447