1/* 2 * Copyright (C) 2009 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.google.common.base; 18 19import static com.google.common.base.Preconditions.checkArgument; 20import static com.google.common.base.Preconditions.checkNotNull; 21import static com.google.common.base.Preconditions.checkState; 22 23import java.util.Iterator; 24import java.util.NoSuchElementException; 25import java.util.StringTokenizer; 26import java.util.regex.Matcher; 27import java.util.regex.Pattern; 28import java.util.regex.PatternSyntaxException; 29 30/** 31 * An object that divides strings (or other instances of {@code CharSequence}) 32 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter") 33 * which can be expressed as a single character, literal string, regular 34 * expression, {@code CharMatcher}, or by using a fixed substring length. This 35 * class provides the complementary functionality to {@link Joiner}. 36 * 37 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code 38 * 39 * Splitter.on(',').split("foo,bar")}</pre> 40 * 41 * This invocation returns an {@code Iterable<String>} containing {@code "foo"} 42 * and {@code "bar"}, in that order. 43 * 44 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code 45 * 46 * Splitter.on(',').split("foo,,bar, quux")}</pre> 47 * 48 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}. 49 * Notice that the splitter does not assume that you want empty strings removed, 50 * or that you wish to trim whitespace. If you want features like these, simply 51 * ask for them: <pre> {@code 52 * 53 * private static final Splitter MY_SPLITTER = Splitter.on(',') 54 * .trimResults() 55 * .omitEmptyStrings();}</pre> 56 * 57 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable 58 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which 59 * the configuration methods are called is never significant; for instance, 60 * trimming is always applied first before checking for an empty result, 61 * regardless of the order in which the {@link #trimResults()} and 62 * {@link #omitEmptyStrings()} methods were invoked. 63 * 64 * <p><b>Warning: splitter instances are always immutable</b>; a configuration 65 * method such as {@code omitEmptyStrings} has no effect on the instance it 66 * is invoked on! You must store and use the new splitter instance returned by 67 * the method. This makes splitters thread-safe, and safe to store as {@code 68 * static final} constants (as illustrated above). <pre> {@code 69 * 70 * // Bad! Do not do this! 71 * Splitter splitter = Splitter.on('/'); 72 * splitter.trimResults(); // does nothing! 73 * return splitter.split("wrong / wrong / wrong");}</pre> 74 * 75 * The separator recognized by the splitter does not have to be a single 76 * literal character as in the examples above. See the methods {@link 77 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples 78 * of other ways to specify separators. 79 * 80 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of 81 * similar JDK methods; for instance, it does not silently discard trailing 82 * separators, as does {@link String#split(String)}, nor does it have a default 83 * behavior of using five particular whitespace characters as separators, like 84 * {@link StringTokenizer}. 85 * 86 * @author Julien Silland 87 * @author Jesse Wilson 88 * @author Kevin Bourrillion 89 * @since 2009.09.15 <b>tentative</b> 90 */ 91public final class Splitter { 92 private final CharMatcher trimmer; 93 private final boolean omitEmptyStrings; 94 private final Strategy strategy; 95 96 private Splitter(Strategy strategy) { 97 this(strategy, false, CharMatcher.NONE); 98 } 99 100 private Splitter(Strategy strategy, boolean omitEmptyStrings, 101 CharMatcher trimmer) { 102 this.strategy = strategy; 103 this.omitEmptyStrings = omitEmptyStrings; 104 this.trimmer = trimmer; 105 } 106 107 /** 108 * Returns a splitter that uses the given single-character separator. For 109 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable 110 * containing {@code ["foo", "", "bar"]}. 111 * 112 * @param separator the character to recognize as a separator 113 * @return a splitter, with default settings, that recognizes that separator 114 */ 115 public static Splitter on(char separator) { 116 return on(CharMatcher.is(separator)); 117 } 118 119 /** 120 * Returns a splitter that considers any single character matched by the 121 * given {@code CharMatcher} to be a separator. For example, {@code 122 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an 123 * iterable containing {@code ["foo", "", "bar", "quux"]}. 124 * 125 * @param separatorMatcher a {@link CharMatcher} that determines whether a 126 * character is a separator 127 * @return a splitter, with default settings, that uses this matcher 128 */ 129 public static Splitter on(final CharMatcher separatorMatcher) { 130 checkNotNull(separatorMatcher); 131 132 return new Splitter(new Strategy() { 133 /*@Override*/ public SplittingIterator iterator( 134 Splitter splitter, final CharSequence toSplit) { 135 return new SplittingIterator(splitter, toSplit) { 136 @Override int separatorStart(int start) { 137 return separatorMatcher.indexIn(toSplit, start); 138 } 139 140 @Override int separatorEnd(int separatorPosition) { 141 return separatorPosition + 1; 142 } 143 }; 144 } 145 }); 146 } 147 148 /** 149 * Returns a splitter that uses the given fixed string as a separator. For 150 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an 151 * iterable containing {@code ["foo", "bar", "baz,qux"]}. 152 * 153 * @param separator the literal, nonempty string to recognize as a separator 154 * @return a splitter, with default settings, that recognizes that separator 155 */ 156 public static Splitter on(final String separator) { 157 checkArgument(separator.length() != 0, 158 "The separator may not be the empty string."); 159 160 return new Splitter(new Strategy() { 161 /*@Override*/ public SplittingIterator iterator( 162 Splitter splitter, CharSequence toSplit) { 163 return new SplittingIterator(splitter, toSplit) { 164 @Override public int separatorStart(int start) { 165 int delimeterLength = separator.length(); 166 167 positions: 168 for (int p = start, last = toSplit.length() - delimeterLength; 169 p <= last; p++) { 170 for (int i = 0; i < delimeterLength; i++) { 171 if (toSplit.charAt(i + p) != separator.charAt(i)) { 172 continue positions; 173 } 174 } 175 return p; 176 } 177 return -1; 178 } 179 180 @Override public int separatorEnd(int separatorPosition) { 181 return separatorPosition + separator.length(); 182 } 183 }; 184 } 185 }); 186 } 187 188 /** 189 * Returns a splitter that considers any subsequence matching {@code 190 * pattern} to be a separator. For example, {@code 191 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string 192 * into lines whether it uses DOS-style or UNIX-style line terminators. 193 * 194 * @param separatorPattern the pattern that determines whether a subsequence 195 * is a separator. This pattern may not match the empty string. 196 * @return a splitter, with default settings, that uses this pattern 197 * @throws IllegalArgumentException if {@code separatorPattern} matches the 198 * empty string 199 */ 200 public static Splitter on(final Pattern separatorPattern) { 201 checkNotNull(separatorPattern); 202 checkArgument(!separatorPattern.matcher("").matches(), 203 "The pattern may not match the empty string: %s", separatorPattern); 204 205 return new Splitter(new Strategy() { 206 /*@Override*/ public SplittingIterator iterator( 207 final Splitter splitter, CharSequence toSplit) { 208 final Matcher matcher = separatorPattern.matcher(toSplit); 209 return new SplittingIterator(splitter, toSplit) { 210 @Override public int separatorStart(int start) { 211 return matcher.find(start) ? matcher.start() : -1; 212 } 213 214 @Override public int separatorEnd(int separatorPosition) { 215 return matcher.end(); 216 } 217 }; 218 } 219 }); 220 } 221 222 /** 223 * Returns a splitter that considers any subsequence matching a given 224 * pattern (regular expression) to be a separator. For example, {@code 225 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines 226 * whether it uses DOS-style or UNIX-style line terminators. This is 227 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}. 228 * 229 * @param separatorPattern the pattern that determines whether a subsequence 230 * is a separator. This pattern may not match the empty string. 231 * @return a splitter, with default settings, that uses this pattern 232 * @throws PatternSyntaxException if {@code separatorPattern} is a malformed 233 * expression 234 * @throws IllegalArgumentException if {@code separatorPattern} matches the 235 * empty string 236 */ 237 public static Splitter onPattern(String separatorPattern) { 238 return on(Pattern.compile(separatorPattern)); 239 } 240 241 /** 242 * Returns a splitter that divides strings into pieces of the given length. 243 * For example, {@code Splitter.atEach(2).split("abcde")} returns an 244 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be 245 * smaller than {@code length} but will never be empty. 246 * 247 * @param length the desired length of pieces after splitting 248 * @return a splitter, with default settings, that can split into fixed sized 249 * pieces 250 */ 251 public static Splitter fixedLength(final int length) { 252 checkArgument(length > 0, "The length may not be less than 1"); 253 254 return new Splitter(new Strategy() { 255 /*@Override*/ public SplittingIterator iterator( 256 final Splitter splitter, CharSequence toSplit) { 257 return new SplittingIterator(splitter, toSplit) { 258 @Override public int separatorStart(int start) { 259 int nextChunkStart = start + length; 260 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1); 261 } 262 263 @Override public int separatorEnd(int separatorPosition) { 264 return separatorPosition; 265 } 266 }; 267 } 268 }); 269 } 270 271 /** 272 * Returns a splitter that behaves equivalently to {@code this} splitter, but 273 * automatically omits empty strings from the results. For example, {@code 274 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an 275 * iterable containing only {@code ["a", "b", "c"]}. 276 * 277 * <p>If either {@code trimResults} option is also specified when creating a 278 * splitter, that splitter always trims results first before checking for 279 * emptiness. So, for example, {@code 280 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns 281 * an empty iterable. 282 * 283 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)} 284 * to return an empty iterable, but when using this option, it can (if the 285 * input sequence consists of nothing but separators). 286 * 287 * @return a splitter with the desired configuration 288 */ 289 public Splitter omitEmptyStrings() { 290 return new Splitter(strategy, true, trimmer); 291 } 292 293 /** 294 * Returns a splitter that behaves equivalently to {@code this} splitter, but 295 * automatically removes leading and trailing {@linkplain 296 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent 297 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code 298 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable 299 * containing {@code ["a", "b", "c"]}. 300 * 301 * @return a splitter with the desired configuration 302 */ 303 public Splitter trimResults() { 304 return trimResults(CharMatcher.WHITESPACE); 305 } 306 307 /** 308 * Returns a splitter that behaves equivalently to {@code this} splitter, but 309 * removes all leading or trailing characters matching the given {@code 310 * CharMatcher} from each returned substring. For example, {@code 311 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")} 312 * returns an iterable containing {@code ["a ", "b_ ", "c"]}. 313 * 314 * @param trimmer a {@link CharMatcher} that determines whether a character 315 * should be removed from the beginning/end of a subsequence 316 * @return a splitter with the desired configuration 317 */ 318 public Splitter trimResults(CharMatcher trimmer) { 319 checkNotNull(trimmer); 320 return new Splitter(strategy, omitEmptyStrings, trimmer); 321 } 322 323 /** 324 * Splits the {@link CharSequence} passed in parameter. 325 * 326 * @param sequence the sequence of characters to split 327 * @return an iteration over the segments split from the parameter. 328 */ 329 public Iterable<String> split(final CharSequence sequence) { 330 checkNotNull(sequence); 331 332 return new Iterable<String>() { 333 /*@Override*/ public Iterator<String> iterator() { 334 return strategy.iterator(Splitter.this, sequence); 335 } 336 }; 337 } 338 339 private interface Strategy { 340 Iterator<String> iterator(Splitter splitter, CharSequence toSplit); 341 } 342 343 private abstract static class SplittingIterator 344 extends AbstractIterator<String> { 345 final CharSequence toSplit; 346 final CharMatcher trimmer; 347 final boolean omitEmptyStrings; 348 349 /** 350 * Returns the first index in {@code toSplit} at or after {@code start} 351 * that contains the separator. 352 */ 353 abstract int separatorStart(int start); 354 355 /** 356 * Returns the first index in {@code toSplit} after {@code 357 * separatorPosition} that does not contain a separator. This method is only 358 * invoked after a call to {@code separatorStart}. 359 */ 360 abstract int separatorEnd(int separatorPosition); 361 362 int offset = 0; 363 364 protected SplittingIterator(Splitter splitter, CharSequence toSplit) { 365 this.trimmer = splitter.trimmer; 366 this.omitEmptyStrings = splitter.omitEmptyStrings; 367 this.toSplit = toSplit; 368 } 369 370 @Override protected String computeNext() { 371 while (offset != -1) { 372 int start = offset; 373 int end; 374 375 int separatorPosition = separatorStart(offset); 376 if (separatorPosition == -1) { 377 end = toSplit.length(); 378 offset = -1; 379 } else { 380 end = separatorPosition; 381 offset = separatorEnd(separatorPosition); 382 } 383 384 while (start < end && trimmer.matches(toSplit.charAt(start))) { 385 start++; 386 } 387 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 388 end--; 389 } 390 391 if (omitEmptyStrings && start == end) { 392 continue; 393 } 394 395 return toSplit.subSequence(start, end).toString(); 396 } 397 return endOfData(); 398 } 399 } 400 401 /* 402 * Copied from common.collect.AbstractIterator. TODO: un-fork once these 403 * packages have been combined into a single library. 404 */ 405 private static abstract class AbstractIterator<T> implements Iterator<T> { 406 State state = State.NOT_READY; 407 408 enum State { 409 READY, NOT_READY, DONE, FAILED, 410 } 411 412 T next; 413 414 protected abstract T computeNext(); 415 416 protected final T endOfData() { 417 state = State.DONE; 418 return null; 419 } 420 421 public final boolean hasNext() { 422 checkState(state != State.FAILED); 423 switch (state) { 424 case DONE: 425 return false; 426 case READY: 427 return true; 428 default: 429 } 430 return tryToComputeNext(); 431 } 432 433 boolean tryToComputeNext() { 434 state = State.FAILED; // temporary pessimism 435 next = computeNext(); 436 if (state != State.DONE) { 437 state = State.READY; 438 return true; 439 } 440 return false; 441 } 442 443 public final T next() { 444 if (!hasNext()) { 445 throw new NoSuchElementException(); 446 } 447 state = State.NOT_READY; 448 return next; 449 } 450 451 /*@Override*/ public void remove() { 452 throw new UnsupportedOperationException(); 453 } 454 } 455} 456