/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;

import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;

import javax.annotation.CheckReturnValue;

/**
 * An object that divides strings (or other instances of {@code CharSequence})
 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
 * which can be expressed as a single character, literal string, or
 * {@code CharMatcher}, or by using a fixed substring length. This class
 * provides the complementary functionality to {@link Joiner}.
 *
 * <p>Here is the most basic example of {@code Splitter} usage: <pre>   {@code
 *
 *   Splitter.on(',').split("foo,bar")}</pre>
 *
 * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
 * and {@code "bar"}, in that order.
 *
 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre>   {@code
 *
 *   Splitter.on(',').split("foo,,bar, quux")}</pre>
 *
 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}.
 * Notice that the splitter does not assume that you want empty strings removed,
 * or that you wish to trim whitespace. If you want features like these, simply
 * ask for them: <pre> {@code
 *
 *   private static final Splitter MY_SPLITTER = Splitter.on(',')
 *       .trimResults()
 *       .omitEmptyStrings();}</pre>
 *
 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable
 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
 * the configuration methods are called is never significant; for instance,
 * trimming is always applied first before checking for an empty result,
 * regardless of the order in which the {@link #trimResults()} and
 * {@link #omitEmptyStrings()} methods were invoked.
 *
 * <p><b>Warning: splitter instances are always immutable</b>; a configuration
 * method such as {@code omitEmptyStrings} has no effect on the instance it
 * is invoked on! You must store and use the new splitter instance returned by
 * the method. This makes splitters thread-safe, and safe to store as {@code
 * static final} constants (as illustrated above). <pre>   {@code
 *
 *   // Bad! Do not do this!
 *   Splitter splitter = Splitter.on('/');
 *   splitter.trimResults(); // does nothing!
 *   return splitter.split("wrong / wrong / wrong");}</pre>
 *
 * The separator recognized by the splitter does not have to be a single
 * literal character as in the examples above. See the methods {@link
 * #on(String)}, {@link #on(CharMatcher)} and {@link #fixedLength(int)} for
 * examples of other ways to specify separators.
 *
 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
 * similar JDK methods; for instance, it does not silently discard trailing
 * separators, as does {@link String#split(String)}, nor does it have a default
 * behavior of using five particular whitespace characters as separators, like
 * {@link java.util.StringTokenizer}.
 *
 * @author Julien Silland
 * @author Jesse Wilson
 * @author Kevin Bourrillion
 * @author Louis Wasserman
 * @since 1.0
 */
@GwtCompatible(emulated = true)
public final class Splitter {
  private final CharMatcher trimmer;
  private final boolean omitEmptyStrings;
  private final Strategy strategy;
  private final int limit;

  /**
   * Creates a splitter with default settings: no trimming, empty strings
   * retained, and no effective limit on the number of pieces.
   */
  private Splitter(Strategy strategy) {
    this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
  }

  private Splitter(Strategy strategy, boolean omitEmptyStrings,
      CharMatcher trimmer, int limit) {
    this.strategy = strategy;
    this.omitEmptyStrings = omitEmptyStrings;
    this.trimmer = trimmer;
    this.limit = limit;
  }

  /**
   * Returns a splitter that uses the given single-character separator. For
   * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
   * containing {@code ["foo", "", "bar"]}.
   *
   * @param separator the character to recognize as a separator
   * @return a splitter, with default settings, that recognizes that separator
   */
  public static Splitter on(char separator) {
    return on(CharMatcher.is(separator));
  }

  /**
   * Returns a splitter that considers any single character matched by the
   * given {@code CharMatcher} to be a separator. For example, {@code
   * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
   * iterable containing {@code ["foo", "", "bar", "quux"]}.
   *
   * @param separatorMatcher a {@link CharMatcher} that determines whether a
   *     character is a separator
   * @return a splitter, with default settings, that uses this matcher
   * @throws NullPointerException if {@code separatorMatcher} is null
   */
  public static Splitter on(final CharMatcher separatorMatcher) {
    checkNotNull(separatorMatcher);

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          Splitter splitter, final CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override int separatorStart(int start) {
            return separatorMatcher.indexIn(toSplit, start);
          }

          @Override int separatorEnd(int separatorPosition) {
            // A matcher-based separator is always exactly one character wide.
            return separatorPosition + 1;
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that uses the given fixed string as a separator. For
   * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
   * iterable containing {@code ["foo", "bar", "baz,qux"]}.
   *
   * @param separator the literal, nonempty string to recognize as a separator
   * @return a splitter, with default settings, that recognizes that separator
   * @throws IllegalArgumentException if {@code separator} is the empty string
   */
  public static Splitter on(final String separator) {
    checkArgument(separator.length() != 0,
        "The separator may not be the empty string.");

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          Splitter splitter, CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override public int separatorStart(int start) {
            int delimiterLength = separator.length();

            // Naive substring search: try every candidate position p at which
            // the whole separator could still fit, and bail out to the next
            // candidate on the first mismatching character.
            positions:
            for (int p = start, last = toSplit.length() - delimiterLength;
                p <= last; p++) {
              for (int i = 0; i < delimiterLength; i++) {
                if (toSplit.charAt(i + p) != separator.charAt(i)) {
                  continue positions;
                }
              }
              return p;
            }
            return -1;
          }

          @Override public int separatorEnd(int separatorPosition) {
            return separatorPosition + separator.length();
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that divides strings into pieces of the given length.
   * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
   * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
   * smaller than {@code length} but will never be empty.
   *
   * @param length the desired length of pieces after splitting
   * @return a splitter, with default settings, that can split into fixed sized
   *     pieces
   * @throws IllegalArgumentException if {@code length} is less than 1
   */
  public static Splitter fixedLength(final int length) {
    checkArgument(length > 0, "The length may not be less than 1");

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          final Splitter splitter, CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override public int separatorStart(int start) {
            // Equivalent to "start + length < toSplit.length()", but phrased
            // as a subtraction so that a very large length cannot overflow
            // int and produce a bogus negative position. start never exceeds
            // toSplit.length() here, so the subtraction itself is safe.
            return (length < toSplit.length() - start) ? start + length : -1;
          }

          @Override public int separatorEnd(int separatorPosition) {
            // The "separator" between fixed-length chunks is zero-width.
            return separatorPosition;
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * automatically omits empty strings from the results. For example, {@code
   * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
   * iterable containing only {@code ["a", "b", "c"]}.
   *
   * <p>If either {@code trimResults} option is also specified when creating a
   * splitter, that splitter always trims results first before checking for
   * emptiness. So, for example, {@code
   * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
   * an empty iterable.
   *
   * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
   * to return an empty iterable, but when using this option, it can (if the
   * input sequence consists of nothing but separators).
   *
   * @return a splitter with the desired configuration
   */
  @CheckReturnValue
  public Splitter omitEmptyStrings() {
    return new Splitter(strategy, true, trimmer, limit);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter but
   * stops splitting after it reaches the limit.
   * The limit defines the maximum number of items returned by the iterator.
   *
   * <p>For example,
   * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
   * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the
   * omitted strings do not count. Hence,
   * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
   * returns an iterable containing {@code ["a", "b", "c,d"]}.
   * When trim is requested, all entries, including the last, are trimmed.
   * Hence
   * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
   * results in {@code ["a", "b", "c , d"]}.
   *
   * @param limit the maximum number of items returned
   * @return a splitter with the desired configuration
   * @throws IllegalArgumentException if {@code limit} is not greater than zero
   * @since 9.0
   */
  @CheckReturnValue
  public Splitter limit(int limit) {
    checkArgument(limit > 0, "must be greater than zero: %s", limit);
    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * automatically removes leading and trailing {@linkplain
   * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
   * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
   * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
   * containing {@code ["a", "b", "c"]}.
   *
   * @return a splitter with the desired configuration
   */
  @CheckReturnValue
  public Splitter trimResults() {
    return trimResults(CharMatcher.WHITESPACE);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * removes all leading or trailing characters matching the given {@code
   * CharMatcher} from each returned substring. For example, {@code
   * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
   * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
   *
   * @param trimmer a {@link CharMatcher} that determines whether a character
   *     should be removed from the beginning/end of a subsequence
   * @return a splitter with the desired configuration
   * @throws NullPointerException if {@code trimmer} is null
   */
  // TODO(kevinb): throw if a trimmer was already specified!
  @CheckReturnValue
  public Splitter trimResults(CharMatcher trimmer) {
    checkNotNull(trimmer);
    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
  }

  /**
   * Splits {@code sequence} into string components and makes them available
   * through an {@link Iterable}, whose iterators may be lazily evaluated.
   *
   * @param sequence the sequence of characters to split
   * @return an iteration over the segments split from the parameter.
   * @throws NullPointerException if {@code sequence} is null
   */
  public Iterable<String> split(final CharSequence sequence) {
    checkNotNull(sequence);

    return new Iterable<String>() {
      @Override public Iterator<String> iterator() {
        return spliterator(sequence);
      }
    };
  }

  /**
   * Creates a fresh iterator over the pieces of {@code sequence}, using this
   * splitter's strategy and configuration.
   */
  private Iterator<String> spliterator(CharSequence sequence) {
    return strategy.iterator(this, sequence);
  }

  /**
   * Returns a {@code MapSplitter} which splits entries based on this splitter,
   * and splits entries into keys and values using the specified separator.
   *
   * @since 10.0
   */
  @CheckReturnValue
  @Beta
  public MapSplitter withKeyValueSeparator(String separator) {
    return withKeyValueSeparator(on(separator));
  }

  /**
   * Returns a {@code MapSplitter} which splits entries based on this splitter,
   * and splits entries into keys and values using the specified key-value
   * splitter.
   *
   * @since 10.0
   */
  @CheckReturnValue
  @Beta
  public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
    return new MapSplitter(this, keyValueSplitter);
  }

  /**
   * An object that splits strings into maps as {@code Splitter} splits
   * iterables and lists. Like {@code Splitter}, it is thread-safe and
   * immutable.
   *
   * @since 10.0
   */
  @Beta
  public static final class MapSplitter {
    private static final String INVALID_ENTRY_MESSAGE =
        "Chunk [%s] is not a valid entry";
    private final Splitter outerSplitter;
    private final Splitter entrySplitter;

    private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
      this.outerSplitter = outerSplitter; // only "this" is passed
      this.entrySplitter = checkNotNull(entrySplitter);
    }

    /**
     * Splits {@code sequence} into substrings, splits each substring into
     * an entry, and returns an unmodifiable map with each of the entries. For
     * example, <code>
     * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
     * .split("a=>b ; c=>b")
     * </code> will return a mapping from {@code "a"} to {@code "b"} and
     * {@code "c"} to {@code "b"}.
     *
     * <p>The returned map preserves the order of the entries from
     * {@code sequence}.
     *
     * @throws IllegalArgumentException if the specified sequence does not split
     *     into valid map entries, or if there are duplicate keys
     */
    public Map<String, String> split(CharSequence sequence) {
      Map<String, String> map = new LinkedHashMap<String, String>();
      for (String entry : outerSplitter.split(sequence)) {
        Iterator<String> entryFields = entrySplitter.spliterator(entry);

        // Each entry must split into exactly two fields: a key and a value.
        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
        String key = entryFields.next();
        checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);

        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
        String value = entryFields.next();
        map.put(key, value);

        checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
      }
      return Collections.unmodifiableMap(map);
    }
  }

  /**
   * Factory for the iterator that locates separators in a given input; each
   * static factory method of {@code Splitter} supplies its own implementation.
   */
  private interface Strategy {
    Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
  }

  /**
   * Skeleton iterator that walks {@code toSplit} from left to right, applying
   * the splitter's trimming, empty-string omission and limit settings, and
   * delegating separator detection to its subclass.
   */
  private abstract static class SplittingIterator
      extends AbstractIterator<String> {
    final CharSequence toSplit;
    final CharMatcher trimmer;
    final boolean omitEmptyStrings;

    /**
     * Returns the first index in {@code toSplit} at or after {@code start}
     * that contains the separator, or {@code -1} if no separator remains.
     */
    abstract int separatorStart(int start);

    /**
     * Returns the first index in {@code toSplit} after {@code
     * separatorPosition} that does not contain a separator. This method is only
     * invoked after a call to {@code separatorStart}.
     */
    abstract int separatorEnd(int separatorPosition);

    /** Index at which the next piece starts, or -1 once input is exhausted. */
    int offset = 0;

    /** Number of pieces that may still be returned. */
    int limit;

    protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
      this.trimmer = splitter.trimmer;
      this.omitEmptyStrings = splitter.omitEmptyStrings;
      this.limit = splitter.limit;
      this.toSplit = toSplit;
    }

    /**
     * Computes the next piece: finds the next separator, trims the candidate
     * piece, skips it if it is empty and empty strings are omitted, and
     * otherwise returns it, absorbing the remainder of the input when the
     * limit has been reached.
     */
    @Override protected String computeNext() {
      while (offset != -1) {
        int start = offset;
        int end;

        int separatorPosition = separatorStart(offset);
        if (separatorPosition == -1) {
          // No more separators: the final piece runs to the end of the input.
          end = toSplit.length();
          offset = -1;
        } else {
          end = separatorPosition;
          offset = separatorEnd(separatorPosition);
        }

        while (start < end && trimmer.matches(toSplit.charAt(start))) {
          start++;
        }
        while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
          end--;
        }

        if (omitEmptyStrings && start == end) {
          continue;
        }

        if (limit == 1) {
          // The limit has been reached, return the rest of the string as the
          // final item.  This is tested after empty string removal so that
          // empty strings do not count towards the limit.
          end = toSplit.length();
          offset = -1;
          // Since we may have changed the end, we need to trim it again.
          while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
            end--;
          }
        } else {
          limit--;
        }

        return toSplit.subSequence(start, end).toString();
      }
      return endOfData();
    }
  }
}