/*
 * Copyright (C) 2014 The Android Open Source Project
 * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.util.regex;

import dalvik.annotation.optimization.ReachabilitySensitive;
import libcore.util.NativeAllocationRegistry;

/**
 * An engine that performs match operations on a
 * {@linkplain java.lang.CharSequence character sequence} by interpreting a
 * {@link Pattern}.
 *
 * <p> A matcher is created from a pattern by invoking the pattern's {@link
 * Pattern#matcher matcher} method.  Once created, a matcher can be used to
 * perform three different kinds of match operations:
 *
 * <ul>
 *
 *   <li><p> The {@link #matches matches} method attempts to match the entire
 *   input sequence against the pattern.
</p></li>
 *
 *   <li><p> The {@link #lookingAt lookingAt} method attempts to match the
 *   input sequence, starting at the beginning, against the pattern.  </p></li>
 *
 *   <li><p> The {@link #find find} method scans the input sequence looking for
 *   the next subsequence that matches the pattern.  </p></li>
 *
 * </ul>
 *
 * <p> Each of these methods returns a boolean indicating success or failure.
 * More information about a successful match can be obtained by querying the
 * state of the matcher.
 *
 * <p> A matcher finds matches in a subset of its input called the
 * <i>region</i>. By default, the region contains all of the matcher's input.
 * The region can be modified via the {@link #region region} method and queried
 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
 * methods. The way that the region boundaries interact with some pattern
 * constructs can be changed. See {@link #useAnchoringBounds
 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
 * for more details.
 *
 * <p> This class also defines methods for replacing matched subsequences with
 * new strings whose contents can, if desired, be computed from the match
 * result.  The {@link #appendReplacement appendReplacement} and {@link
 * #appendTail appendTail} methods can be used in tandem in order to collect
 * the result into an existing string buffer, or the more convenient {@link
 * #replaceAll replaceAll} method can be used to create a string in which every
 * matching subsequence in the input sequence is replaced.
 *
 * <p> The explicit state of a matcher includes the start and end indices of
 * the most recent successful match.  It also includes the start and end
 * indices of the input subsequence captured by each <a
 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
 * count of such subsequences.
As a convenience, methods are also provided for 79 * returning these captured subsequences in string form. 80 * 81 * <p> The explicit state of a matcher is initially undefined; attempting to 82 * query any part of it before a successful match will cause an {@link 83 * IllegalStateException} to be thrown. The explicit state of a matcher is 84 * recomputed by every match operation. 85 * 86 * <p> The implicit state of a matcher includes the input character sequence as 87 * well as the <i>append position</i>, which is initially zero and is updated 88 * by the {@link #appendReplacement appendReplacement} method. 89 * 90 * <p> A matcher may be reset explicitly by invoking its {@link #reset()} 91 * method or, if a new input sequence is desired, its {@link 92 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a 93 * matcher discards its explicit state information and sets the append position 94 * to zero. 95 * 96 * <p> Instances of this class are not safe for use by multiple concurrent 97 * threads. </p> 98 * 99 * 100 * @author Mike McCloskey 101 * @author Mark Reinhold 102 * @author JSR-51 Expert Group 103 * @since 1.4 104 * @spec JSR-51 105 */ 106 107public final class Matcher implements MatchResult { 108 /** 109 * The Pattern object that created this Matcher. 110 */ 111 // Patterns also contain cleanup code and a ReachabilitySensitive field. 112 // This ensures that "this" and pattern remain reachable while we're using pattern.address 113 // directly. 114 @ReachabilitySensitive 115 private Pattern pattern; 116 117 /** 118 * The address of the native peer. 119 * Uses of this must be manually synchronized to avoid native crashes. 120 */ 121 @ReachabilitySensitive 122 private long address; 123 124 /** 125 * If non-null, a Runnable that can be used to explicitly deallocate address. 
126 */ 127 private Runnable nativeFinalizer; 128 129 private static final NativeAllocationRegistry registry = new NativeAllocationRegistry( 130 Matcher.class.getClassLoader(), getNativeFinalizer(), nativeSize()); 131 132 /** 133 * Holds the original CharSequence for use in {@link #reset}. {@link #input} is used during 134 * matching. Note that CharSequence is mutable while String is not, so reset can cause the input 135 * to match to change. 136 */ 137 private CharSequence originalInput; 138 139 /** 140 * Holds the input text. 141 */ 142 private String input; 143 144 /** 145 * Holds the start of the region, or 0 if the matching should start at the 146 * beginning of the text. 147 */ 148 private int regionStart; 149 150 /** 151 * Holds the end of the region, or input.length() if the matching should 152 * go until the end of the input. 153 */ 154 private int regionEnd; 155 156 /** 157 * Holds the position where the next append operation will take place. 158 */ 159 private int appendPos; 160 161 /** 162 * Reflects whether a match has been found during the most recent find 163 * operation. 164 */ 165 private boolean matchFound; 166 167 /** 168 * Holds the offsets for the most recent match. 169 */ 170 private int[] matchOffsets; 171 172 /** 173 * Reflects whether the bounds of the region are anchoring. 174 */ 175 private boolean anchoringBounds = true; 176 177 /** 178 * Reflects whether the bounds of the region are transparent. 179 */ 180 private boolean transparentBounds; 181 182 /** 183 * All matchers have the state used by Pattern during a match. 184 */ 185 Matcher(Pattern parent, CharSequence text) { 186 usePattern(parent); 187 reset(text); 188 } 189 190 /** 191 * Returns the pattern that is interpreted by this matcher. 192 * 193 * @return The pattern for which this matcher was created 194 */ 195 public Pattern pattern() { 196 return pattern; 197 } 198 199 /** 200 * Returns the match state of this matcher as a {@link MatchResult}. 
201 * The result is unaffected by subsequent operations performed upon this 202 * matcher. 203 * 204 * @return a <code>MatchResult</code> with the state of this matcher 205 * @since 1.5 206 */ 207 public MatchResult toMatchResult() { 208 ensureMatch(); 209 return new OffsetBasedMatchResult(input, matchOffsets); 210 } 211 212 /** 213 * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to 214 * find matches with. 215 * 216 * <p> This method causes this matcher to lose information 217 * about the groups of the last match that occurred. The 218 * matcher's position in the input is maintained and its 219 * last append position is unaffected.</p> 220 * 221 * @param newPattern 222 * The new pattern used by this matcher 223 * @return This matcher 224 * @throws IllegalArgumentException 225 * If newPattern is <tt>null</tt> 226 * @since 1.5 227 */ 228 public Matcher usePattern(Pattern newPattern) { 229 if (newPattern == null) { 230 throw new IllegalArgumentException("newPattern == null"); 231 } 232 233 this.pattern = newPattern; 234 235 synchronized (this) { 236 if (nativeFinalizer != null) { 237 nativeFinalizer.run(); 238 address = 0; // In case openImpl throws. 239 nativeFinalizer = null; 240 } 241 address = openImpl(pattern.address); 242 nativeFinalizer = registry.registerNativeAllocation(this, address); 243 } 244 245 if (input != null) { 246 resetForInput(); 247 } 248 249 matchOffsets = new int[(groupCount() + 1) * 2]; 250 matchFound = false; 251 return this; 252 } 253 254 /** 255 * Returns the offset after the last character matched. </p> 256 * 257 * @return The offset after the last character matched 258 * 259 * @throws IllegalStateException 260 * If no match has yet been attempted, 261 * or if the previous match operation failed 262 */ 263 public int end() { 264 return end(0); 265 } 266 267 /** 268 * Returns the offset after the last character of the subsequence 269 * captured by the given group during the previous match operation. 
270 * 271 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 272 * to right, starting at one. Group zero denotes the entire pattern, so 273 * the expression <i>m.</i><tt>end(0)</tt> is equivalent to 274 * <i>m.</i><tt>end()</tt>. </p> 275 * 276 * @param group 277 * The index of a capturing group in this matcher's pattern 278 * 279 * @return The offset after the last character captured by the group, 280 * or <tt>-1</tt> if the match was successful 281 * but the group itself did not match anything 282 * 283 * @throws IllegalStateException 284 * If no match has yet been attempted, 285 * or if the previous match operation failed 286 * 287 * @throws IndexOutOfBoundsException 288 * If there is no capturing group in the pattern 289 * with the given index 290 */ 291 public int end(int group) { 292 ensureMatch(); 293 return matchOffsets[(group * 2) + 1]; 294 } 295 296 /** 297 * Returns the offset after the last character of the subsequence 298 * captured by the given <a href="Pattern.html#groupname">named-capturing 299 * group</a> during the previous match operation. 300 * 301 * @param name 302 * The name of a named-capturing group in this matcher's pattern 303 * 304 * @return The offset after the last character captured by the group, 305 * or {@code -1} if the match was successful 306 * but the group itself did not match anything 307 * 308 * @throws IllegalStateException 309 * If no match has yet been attempted, 310 * or if the previous match operation failed 311 * 312 * @throws IllegalArgumentException 313 * If there is no capturing group in the pattern 314 * with the given name 315 * @since 1.8 316 */ 317 public int end(String name) { 318 ensureMatch(); 319 return matchOffsets[getMatchedGroupIndex(pattern.address, name) * 2 + 1]; 320 } 321 322 323 /** 324 * Returns the input subsequence matched by the previous match. 
325 * 326 * <p> For a matcher <i>m</i> with input sequence <i>s</i>, 327 * the expressions <i>m.</i><tt>group()</tt> and 328 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt> 329 * are equivalent. </p> 330 * 331 * <p> Note that some patterns, for example <tt>a*</tt>, match the empty 332 * string. This method will return the empty string when the pattern 333 * successfully matches the empty string in the input. </p> 334 * 335 * @return The (possibly empty) subsequence matched by the previous match, 336 * in string form 337 * 338 * @throws IllegalStateException 339 * If no match has yet been attempted, 340 * or if the previous match operation failed 341 */ 342 public String group() { 343 return group(0); 344 } 345 346 /** 347 * Returns the input subsequence captured by the given group during the 348 * previous match operation. 349 * 350 * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index 351 * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and 352 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt> 353 * are equivalent. </p> 354 * 355 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 356 * to right, starting at one. Group zero denotes the entire pattern, so 357 * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>. 358 * </p> 359 * 360 * <p> If the match was successful but the group specified failed to match 361 * any part of the input sequence, then <tt>null</tt> is returned. Note 362 * that some groups, for example <tt>(a*)</tt>, match the empty string. 363 * This method will return the empty string when such a group successfully 364 * matches the empty string in the input. 
</p> 365 * 366 * @param group 367 * The index of a capturing group in this matcher's pattern 368 * 369 * @return The (possibly empty) subsequence captured by the group 370 * during the previous match, or <tt>null</tt> if the group 371 * failed to match part of the input 372 * 373 * @throws IllegalStateException 374 * If no match has yet been attempted, 375 * or if the previous match operation failed 376 * 377 * @throws IndexOutOfBoundsException 378 * If there is no capturing group in the pattern 379 * with the given index 380 */ 381 public String group(int group) { 382 ensureMatch(); 383 int from = matchOffsets[group * 2]; 384 int to = matchOffsets[(group * 2) + 1]; 385 if (from == -1 || to == -1) { 386 return null; 387 } else { 388 return input.substring(from, to); 389 } 390 } 391 392 /** 393 * Returns the input subsequence captured by the given 394 * <a href="Pattern.html#groupname">named-capturing group</a> during the previous 395 * match operation. 396 * 397 * <p> If the match was successful but the group specified failed to match 398 * any part of the input sequence, then <tt>null</tt> is returned. Note 399 * that some groups, for example <tt>(a*)</tt>, match the empty string. 400 * This method will return the empty string when such a group successfully 401 * matches the empty string in the input. 
</p> 402 * 403 * @param name 404 * The name of a named-capturing group in this matcher's pattern 405 * 406 * @return The (possibly empty) subsequence captured by the named group 407 * during the previous match, or <tt>null</tt> if the group 408 * failed to match part of the input 409 * 410 * @throws IllegalStateException 411 * If no match has yet been attempted, 412 * or if the previous match operation failed 413 * 414 * @throws IllegalArgumentException 415 * If there is no capturing group in the pattern 416 * with the given name 417 * @since 1.7 418 */ 419 public String group(String name) { 420 ensureMatch(); 421 int group = getMatchedGroupIndex(pattern.address, name); 422 int from = matchOffsets[group * 2]; 423 int to = matchOffsets[(group * 2) + 1]; 424 if (from == -1 || to == -1) { 425 return null; 426 } else { 427 return input.substring(from, to); 428 } 429 } 430 431 /** 432 * Returns the number of capturing groups in this matcher's pattern. 433 * 434 * <p> Group zero denotes the entire pattern by convention. It is not 435 * included in this count. 436 * 437 * <p> Any non-negative integer smaller than or equal to the value 438 * returned by this method is guaranteed to be a valid group index for 439 * this matcher. </p> 440 * 441 * @return The number of capturing groups in this matcher's pattern 442 */ 443 public int groupCount() { 444 synchronized (this) { 445 return groupCountImpl(address); 446 } 447 } 448 449 /** 450 * Attempts to match the entire region against the pattern. 451 * 452 * <p> If the match succeeds then more information can be obtained via the 453 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. 
</p> 454 * 455 * @return <tt>true</tt> if, and only if, the entire region sequence 456 * matches this matcher's pattern 457 */ 458 public boolean matches() { 459 synchronized (this) { 460 matchFound = matchesImpl(address, matchOffsets); 461 } 462 return matchFound; 463 } 464 465 /** 466 * Attempts to find the next subsequence of the input sequence that matches 467 * the pattern. 468 * 469 * <p> This method starts at the beginning of this matcher's region, or, if 470 * a previous invocation of the method was successful and the matcher has 471 * not since been reset, at the first character not matched by the previous 472 * match. 473 * 474 * <p> If the match succeeds then more information can be obtained via the 475 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 476 * 477 * @return <tt>true</tt> if, and only if, a subsequence of the input 478 * sequence matches this matcher's pattern 479 */ 480 public boolean find() { 481 synchronized (this) { 482 matchFound = findNextImpl(address, matchOffsets); 483 } 484 return matchFound; 485 } 486 487 /** 488 * Resets this matcher and then attempts to find the next subsequence of 489 * the input sequence that matches the pattern, starting at the specified 490 * index. 491 * 492 * <p> If the match succeeds then more information can be obtained via the 493 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent 494 * invocations of the {@link #find()} method will start at the first 495 * character not matched by this match. </p> 496 * 497 * @throws IndexOutOfBoundsException 498 * If start is less than zero or if start is greater than the 499 * length of the input sequence. 
500 * 501 * @return <tt>true</tt> if, and only if, a subsequence of the input 502 * sequence starting at the given index matches this matcher's 503 * pattern 504 */ 505 public boolean find(int start) { 506 reset(); 507 if (start < 0 || start > input.length()) { 508 throw new IndexOutOfBoundsException("start=" + start + "; length=" + input.length()); 509 } 510 511 synchronized (this) { 512 matchFound = findImpl(address, start, matchOffsets); 513 } 514 return matchFound; 515 } 516 517 /** 518 * Attempts to match the input sequence, starting at the beginning of the 519 * region, against the pattern. 520 * 521 * <p> Like the {@link #matches matches} method, this method always starts 522 * at the beginning of the region; unlike that method, it does not 523 * require that the entire region be matched. 524 * 525 * <p> If the match succeeds then more information can be obtained via the 526 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 527 * 528 * @return <tt>true</tt> if, and only if, a prefix of the input 529 * sequence matches this matcher's pattern 530 */ 531 public boolean lookingAt() { 532 synchronized (this) { 533 matchFound = lookingAtImpl(address, matchOffsets); 534 } 535 return matchFound; 536 } 537 538 /** 539 * Returns a literal replacement <code>String</code> for the specified 540 * <code>String</code>. 541 * 542 * This method produces a <code>String</code> that will work 543 * as a literal replacement <code>s</code> in the 544 * <code>appendReplacement</code> method of the {@link Matcher} class. 545 * The <code>String</code> produced will match the sequence of characters 546 * in <code>s</code> treated as a literal sequence. Slashes ('\') and 547 * dollar signs ('$') will be given no special meaning. 
548 * 549 * @param s The string to be literalized 550 * @return A literal string replacement 551 * @since 1.5 552 */ 553 public static String quoteReplacement(String s) { 554 if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1)) 555 return s; 556 StringBuilder sb = new StringBuilder(); 557 for (int i=0; i<s.length(); i++) { 558 char c = s.charAt(i); 559 if (c == '\\' || c == '$') { 560 sb.append('\\'); 561 } 562 sb.append(c); 563 } 564 return sb.toString(); 565 } 566 567 /** 568 * Implements a non-terminal append-and-replace step. 569 * 570 * <p> This method performs the following actions: </p> 571 * 572 * <ol> 573 * 574 * <li><p> It reads characters from the input sequence, starting at the 575 * append position, and appends them to the given string buffer. It 576 * stops after reading the last character preceding the previous match, 577 * that is, the character at index {@link 578 * #start()} <tt>-</tt> <tt>1</tt>. </p></li> 579 * 580 * <li><p> It appends the given replacement string to the string buffer. 581 * </p></li> 582 * 583 * <li><p> It sets the append position of this matcher to the index of 584 * the last character matched, plus one, that is, to {@link #end()}. 585 * </p></li> 586 * 587 * </ol> 588 * 589 * <p> The replacement string may contain references to subsequences 590 * captured during the previous match: Each occurrence of 591 * <tt>$</tt><i>g</i> will be replaced by the result of evaluating the corresponding 592 * {@link #group(int) group(g)</tt>} respectively. For <tt>$</tt><i>g</i><tt></tt>, 593 * the first number after the <tt>$</tt> is always treated as part of 594 * the group reference. Subsequent numbers are incorporated into g if 595 * they would form a legal group reference. Only the numerals '0' 596 * through '9' are considered as potential components of the group 597 * reference. 
If the second group matched the string <tt>"foo"</tt>, for 598 * example, then passing the replacement string <tt>"$2bar"</tt> would 599 * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar 600 * sign (<tt>$</tt>) may be included as a literal in the replacement 601 * string by preceding it with a backslash (<tt>\$</tt>). 602 * 603 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 604 * the replacement string may cause the results to be different than if it 605 * were being treated as a literal replacement string. Dollar signs may be 606 * treated as references to captured subsequences as described above, and 607 * backslashes are used to escape literal characters in the replacement 608 * string. 609 * 610 * <p> This method is intended to be used in a loop together with the 611 * {@link #appendTail appendTail} and {@link #find find} methods. The 612 * following code, for example, writes <tt>one dog two dogs in the 613 * yard</tt> to the standard-output stream: </p> 614 * 615 * <blockquote><pre> 616 * Pattern p = Pattern.compile("cat"); 617 * Matcher m = p.matcher("one cat two cats in the yard"); 618 * StringBuffer sb = new StringBuffer(); 619 * while (m.find()) { 620 * m.appendReplacement(sb, "dog"); 621 * } 622 * m.appendTail(sb); 623 * System.out.println(sb.toString());</pre></blockquote> 624 * 625 * @param sb 626 * The target string buffer 627 * 628 * @param replacement 629 * The replacement string 630 * 631 * @return This matcher 632 * 633 * @throws IllegalStateException 634 * If no match has yet been attempted, 635 * or if the previous match operation failed 636 * 637 * @throws IllegalArgumentException 638 * If the replacement string refers to a named-capturing 639 * group that does not exist in the pattern 640 * 641 * @throws IndexOutOfBoundsException 642 * If the replacement string refers to a capturing group 643 * that does not exist in the pattern 644 */ 645 public Matcher appendReplacement(StringBuffer sb, String 
replacement) { 646 sb.append(input.substring(appendPos, start())); 647 appendEvaluated(sb, replacement); 648 appendPos = end(); 649 650 return this; 651 } 652 653 /** 654 * Internal helper method to append a given string to a given string buffer. 655 * If the string contains any references to groups, these are replaced by 656 * the corresponding group's contents. 657 * 658 * @param buffer the string buffer. 659 * @param s the string to append. 660 */ 661 private void appendEvaluated(StringBuffer buffer, String s) { 662 boolean escape = false; 663 boolean dollar = false; 664 boolean escapeNamedGroup = false; 665 int escapeNamedGroupStart = -1; 666 667 for (int i = 0; i < s.length(); i++) { 668 char c = s.charAt(i); 669 if (c == '\\' && !escape) { 670 escape = true; 671 } else if (c == '$' && !escape) { 672 dollar = true; 673 } else if (c >= '0' && c <= '9' && dollar) { 674 buffer.append(group(c - '0')); 675 dollar = false; 676 } else if (c == '{' && dollar) { 677 escapeNamedGroup = true; 678 escapeNamedGroupStart = i; 679 } else if (c == '}' && dollar && escapeNamedGroup) { 680 String namedGroupName = 681 s.substring(escapeNamedGroupStart + 1, i); 682 buffer.append(group(namedGroupName)); 683 dollar = false; 684 escapeNamedGroup = false; 685 } else if (c != '}' && dollar && escapeNamedGroup) { 686 continue; 687 } else { 688 buffer.append(c); 689 dollar = false; 690 escape = false; 691 escapeNamedGroup = false; 692 } 693 } 694 695 if (escapeNamedGroup) { 696 throw new IllegalArgumentException("Missing ending brace '}' from replacement string"); 697 } 698 699 if (escape) { 700 throw new ArrayIndexOutOfBoundsException(s.length()); 701 } 702 } 703 704 705 /** 706 * Implements a terminal append-and-replace step. 707 * 708 * <p> This method reads characters from the input sequence, starting at 709 * the append position, and appends them to the given string buffer. 
It is 710 * intended to be invoked after one or more invocations of the {@link 711 * #appendReplacement appendReplacement} method in order to copy the 712 * remainder of the input sequence. </p> 713 * 714 * @param sb 715 * The target string buffer 716 * 717 * @return The target string buffer 718 */ 719 public StringBuffer appendTail(StringBuffer sb) { 720 if (appendPos < regionEnd) { 721 sb.append(input.substring(appendPos, regionEnd)); 722 } 723 return sb; 724 } 725 726 /** 727 * Replaces every subsequence of the input sequence that matches the 728 * pattern with the given replacement string. 729 * 730 * <p> This method first resets this matcher. It then scans the input 731 * sequence looking for matches of the pattern. Characters that are not 732 * part of any match are appended directly to the result string; each match 733 * is replaced in the result by the replacement string. The replacement 734 * string may contain references to captured subsequences as in the {@link 735 * #appendReplacement appendReplacement} method. 736 * 737 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 738 * the replacement string may cause the results to be different than if it 739 * were being treated as a literal replacement string. Dollar signs may be 740 * treated as references to captured subsequences as described above, and 741 * backslashes are used to escape literal characters in the replacement 742 * string. 743 * 744 * <p> Given the regular expression <tt>a*b</tt>, the input 745 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string 746 * <tt>"-"</tt>, an invocation of this method on a matcher for that 747 * expression would yield the string <tt>"-foo-foo-foo-"</tt>. 748 * 749 * <p> Invoking this method changes this matcher's state. If the matcher 750 * is to be used in further matching operations then it should first be 751 * reset. 
</p> 752 * 753 * @param replacement 754 * The replacement string 755 * 756 * @return The string constructed by replacing each matching subsequence 757 * by the replacement string, substituting captured subsequences 758 * as needed 759 */ 760 public String replaceAll(String replacement) { 761 reset(); 762 StringBuffer buffer = new StringBuffer(input.length()); 763 while (find()) { 764 appendReplacement(buffer, replacement); 765 } 766 return appendTail(buffer).toString(); 767 } 768 769 /** 770 * Replaces the first subsequence of the input sequence that matches the 771 * pattern with the given replacement string. 772 * 773 * <p> This method first resets this matcher. It then scans the input 774 * sequence looking for a match of the pattern. Characters that are not 775 * part of the match are appended directly to the result string; the match 776 * is replaced in the result by the replacement string. The replacement 777 * string may contain references to captured subsequences as in the {@link 778 * #appendReplacement appendReplacement} method. 779 * 780 * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 781 * the replacement string may cause the results to be different than if it 782 * were being treated as a literal replacement string. Dollar signs may be 783 * treated as references to captured subsequences as described above, and 784 * backslashes are used to escape literal characters in the replacement 785 * string. 786 * 787 * <p> Given the regular expression <tt>dog</tt>, the input 788 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string 789 * <tt>"cat"</tt>, an invocation of this method on a matcher for that 790 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p> 791 * 792 * <p> Invoking this method changes this matcher's state. If the matcher 793 * is to be used in further matching operations then it should first be 794 * reset. 
</p> 795 * 796 * @param replacement 797 * The replacement string 798 * @return The string constructed by replacing the first matching 799 * subsequence by the replacement string, substituting captured 800 * subsequences as needed 801 */ 802 public String replaceFirst(String replacement) { 803 reset(); 804 StringBuffer buffer = new StringBuffer(input.length()); 805 if (find()) { 806 appendReplacement(buffer, replacement); 807 } 808 return appendTail(buffer).toString(); 809 } 810 811 /** 812 * Sets the limits of this matcher's region. The region is the part of the 813 * input sequence that will be searched to find a match. Invoking this 814 * method resets the matcher, and then sets the region to start at the 815 * index specified by the <code>start</code> parameter and end at the 816 * index specified by the <code>end</code> parameter. 817 * 818 * <p>Depending on the transparency and anchoring being used (see 819 * {@link #useTransparentBounds useTransparentBounds} and 820 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such 821 * as anchors may behave differently at or around the boundaries of the 822 * region. 823 * 824 * @param start 825 * The index to start searching at (inclusive) 826 * @param end 827 * The index to end searching at (exclusive) 828 * @throws IndexOutOfBoundsException 829 * If start or end is less than zero, if 830 * start is greater than the length of the input sequence, if 831 * end is greater than the length of the input sequence, or if 832 * start is greater than end. 833 * @return this matcher 834 * @since 1.5 835 */ 836 public Matcher region(int start, int end) { 837 return reset(originalInput, start, end); 838 } 839 840 /** 841 * Reports the start index of this matcher's region. The 842 * searches this matcher conducts are limited to finding matches 843 * within {@link #regionStart regionStart} (inclusive) and 844 * {@link #regionEnd regionEnd} (exclusive). 
845 * 846 * @return The starting point of this matcher's region 847 * @since 1.5 848 */ 849 public int regionStart() { 850 return regionStart; 851 } 852 853 /** 854 * Reports the end index (exclusive) of this matcher's region. 855 * The searches this matcher conducts are limited to finding matches 856 * within {@link #regionStart regionStart} (inclusive) and 857 * {@link #regionEnd regionEnd} (exclusive). 858 * 859 * @return the ending point of this matcher's region 860 * @since 1.5 861 */ 862 public int regionEnd() { 863 return regionEnd; 864 } 865 866 /** 867 * Queries the transparency of region bounds for this matcher. 868 * 869 * <p> This method returns <tt>true</tt> if this matcher uses 870 * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i> 871 * bounds. 872 * 873 * <p> See {@link #useTransparentBounds useTransparentBounds} for a 874 * description of transparent and opaque bounds. 875 * 876 * <p> By default, a matcher uses opaque region boundaries. 877 * 878 * @return <tt>true</tt> iff this matcher is using transparent bounds, 879 * <tt>false</tt> otherwise. 880 * @see java.util.regex.Matcher#useTransparentBounds(boolean) 881 * @since 1.5 882 */ 883 public boolean hasTransparentBounds() { 884 return transparentBounds; 885 } 886 887 /** 888 * Sets the transparency of region bounds for this matcher. 889 * 890 * <p> Invoking this method with an argument of <tt>true</tt> will set this 891 * matcher to use <i>transparent</i> bounds. If the boolean 892 * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used. 893 * 894 * <p> Using transparent bounds, the boundaries of this 895 * matcher's region are transparent to lookahead, lookbehind, 896 * and boundary matching constructs. Those constructs can see beyond the 897 * boundaries of the region to see if a match is appropriate. 
898 * 899 * <p> Using opaque bounds, the boundaries of this matcher's 900 * region are opaque to lookahead, lookbehind, and boundary matching 901 * constructs that may try to see beyond them. Those constructs cannot 902 * look past the boundaries so they will fail to match anything outside 903 * of the region. 904 * 905 * <p> By default, a matcher uses opaque bounds. 906 * 907 * @param value a boolean indicating whether to use opaque or transparent 908 * regions 909 * @return this matcher 910 * @see java.util.regex.Matcher#hasTransparentBounds 911 * @since 1.5 912 */ 913 public Matcher useTransparentBounds(boolean value) { 914 synchronized (this) { 915 transparentBounds = value; 916 useTransparentBoundsImpl(address, value); 917 } 918 return this; 919 } 920 921 /** 922 * Queries the anchoring of region bounds for this matcher. 923 * 924 * <p> This method returns <tt>true</tt> if this matcher uses 925 * <i>anchoring</i> bounds, <tt>false</tt> otherwise. 926 * 927 * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a 928 * description of anchoring bounds. 929 * 930 * <p> By default, a matcher uses anchoring region boundaries. 931 * 932 * @return <tt>true</tt> iff this matcher is using anchoring bounds, 933 * <tt>false</tt> otherwise. 934 * @see java.util.regex.Matcher#useAnchoringBounds(boolean) 935 * @since 1.5 936 */ 937 public boolean hasAnchoringBounds() { 938 return anchoringBounds; 939 } 940 941 /** 942 * Sets the anchoring of region bounds for this matcher. 943 * 944 * <p> Invoking this method with an argument of <tt>true</tt> will set this 945 * matcher to use <i>anchoring</i> bounds. If the boolean 946 * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be 947 * used. 948 * 949 * <p> Using anchoring bounds, the boundaries of this 950 * matcher's region match anchors such as ^ and $. 951 * 952 * <p> Without anchoring bounds, the boundaries of this 953 * matcher's region will not match anchors such as ^ and $. 
954 * 955 * <p> By default, a matcher uses anchoring region boundaries. 956 * 957 * @param value a boolean indicating whether or not to use anchoring bounds. 958 * @return this matcher 959 * @see java.util.regex.Matcher#hasAnchoringBounds 960 * @since 1.5 961 */ 962 public Matcher useAnchoringBounds(boolean value) { 963 synchronized (this) { 964 anchoringBounds = value; 965 useAnchoringBoundsImpl(address, value); 966 } 967 return this; 968 } 969 970 /** 971 * <p>Returns the string representation of this matcher. The 972 * string representation of a <code>Matcher</code> contains information 973 * that may be useful for debugging. The exact format is unspecified. 974 * 975 * @return The string representation of this matcher 976 * @since 1.5 977 */ 978 public String toString() { 979 StringBuilder sb = new StringBuilder(); 980 sb.append("java.util.regex.Matcher"); 981 sb.append("[pattern=" + pattern()); 982 sb.append(" region="); 983 sb.append(regionStart() + "," + regionEnd()); 984 sb.append(" lastmatch="); 985 if (matchFound && (group() != null)) { 986 sb.append(group()); 987 } 988 sb.append("]"); 989 return sb.toString(); 990 } 991 992 /** 993 * <p>Returns true if the end of input was hit by the search engine in 994 * the last match operation performed by this matcher. 995 * 996 * <p>When this method returns true, then it is possible that more input 997 * would have changed the result of the last search. 998 * 999 * @return true iff the end of input was hit in the last match; false 1000 * otherwise 1001 * @since 1.5 1002 */ 1003 public boolean hitEnd() { 1004 synchronized (this) { 1005 return hitEndImpl(address); 1006 } 1007 } 1008 1009 1010 /** 1011 * <p>Returns true if more input could change a positive match into a 1012 * negative one. 1013 * 1014 * <p>If this method returns true, and a match was found, then more 1015 * input could cause the match to be lost. 
If this method returns false 1016 * and a match was found, then more input might change the match but the 1017 * match won't be lost. If a match was not found, then requireEnd has no 1018 * meaning. 1019 * 1020 * @return true iff more input could change a positive match into a 1021 * negative one. 1022 * @since 1.5 1023 */ 1024 public boolean requireEnd() { 1025 synchronized (this) { 1026 return requireEndImpl(address); 1027 } 1028 } 1029 1030 /** 1031 * Resets this matcher. 1032 * 1033 * <p> Resetting a matcher discards all of its explicit state information 1034 * and sets its append position to zero. The matcher's region is set to the 1035 * default region, which is its entire character sequence. The anchoring 1036 * and transparency of this matcher's region boundaries are unaffected. 1037 * 1038 * @return This matcher 1039 */ 1040 public Matcher reset() { 1041 return reset(originalInput, 0, originalInput.length()); 1042 } 1043 1044 /** 1045 * Resets this matcher with a new input sequence. 1046 * 1047 * <p> Resetting a matcher discards all of its explicit state information 1048 * and sets its append position to zero. The matcher's region is set to 1049 * the default region, which is its entire character sequence. The 1050 * anchoring and transparency of this matcher's region boundaries are 1051 * unaffected. 1052 * 1053 * @param input 1054 * The new input character sequence 1055 * 1056 * @return This matcher 1057 */ 1058 public Matcher reset(CharSequence input) { 1059 return reset(input, 0, input.length()); 1060 } 1061 1062 /** 1063 * Resets the Matcher. A new input sequence and a new region can be 1064 * specified. Results of a previous find get lost. The next attempt to find 1065 * an occurrence of the Pattern in the string will start at the beginning of 1066 * the region. This is the internal version of reset() to which the several 1067 * public versions delegate. 1068 * 1069 * @param input 1070 * the input sequence. 
1071 * @param start 1072 * the start of the region. 1073 * @param end 1074 * the end of the region. 1075 * 1076 * @return the matcher itself. 1077 */ 1078 private Matcher reset(CharSequence input, int start, int end) { 1079 if (input == null) { 1080 throw new IllegalArgumentException("input == null"); 1081 } 1082 1083 if (start < 0 || end < 0 || start > input.length() || end > input.length() || start > end) { 1084 throw new IndexOutOfBoundsException(); 1085 } 1086 1087 this.originalInput = input; 1088 this.input = input.toString(); 1089 this.regionStart = start; 1090 this.regionEnd = end; 1091 resetForInput(); 1092 1093 matchFound = false; 1094 appendPos = 0; 1095 1096 return this; 1097 } 1098 1099 private void resetForInput() { 1100 synchronized (this) { 1101 setInputImpl(address, input, regionStart, regionEnd); 1102 useAnchoringBoundsImpl(address, anchoringBounds); 1103 useTransparentBoundsImpl(address, transparentBounds); 1104 } 1105 } 1106 1107 /** 1108 * Makes sure that a successful match has been made. Is invoked internally 1109 * from various places in the class. 1110 * 1111 * @throws IllegalStateException 1112 * if no successful match has been made. 1113 */ 1114 private void ensureMatch() { 1115 if (!matchFound) { 1116 throw new IllegalStateException("No successful match so far"); 1117 } 1118 } 1119 1120 /** 1121 * Returns the start index of the previous match. </p> 1122 * 1123 * @return The index of the first character matched 1124 * 1125 * @throws IllegalStateException 1126 * If no match has yet been attempted, 1127 * or if the previous match operation failed 1128 */ 1129 public int start() { 1130 return start(0); 1131 } 1132 1133 /** 1134 * Returns the start index of the subsequence captured by the given group 1135 * during the previous match operation. 1136 * 1137 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 1138 * to right, starting at one. 
Group zero denotes the entire pattern, so 1139 * the expression <i>m.</i><tt>start(0)</tt> is equivalent to 1140 * <i>m.</i><tt>start()</tt>. </p> 1141 * 1142 * @param group 1143 * The index of a capturing group in this matcher's pattern 1144 * 1145 * @return The index of the first character captured by the group, 1146 * or <tt>-1</tt> if the match was successful but the group 1147 * itself did not match anything 1148 * 1149 * @throws IllegalStateException 1150 * If no match has yet been attempted, 1151 * or if the previous match operation failed 1152 * 1153 * @throws IndexOutOfBoundsException 1154 * If there is no capturing group in the pattern 1155 * with the given index 1156 */ 1157 public int start(int group) throws IllegalStateException { 1158 ensureMatch(); 1159 return matchOffsets[group * 2]; 1160 } 1161 1162 1163 /** 1164 * Returns the start index of the subsequence captured by the given 1165 * <a href="Pattern.html#groupname">named-capturing group</a> during the 1166 * previous match operation. 
1167 * 1168 * @param name 1169 * The name of a named-capturing group in this matcher's pattern 1170 * 1171 * @return The index of the first character captured by the group, 1172 * or {@code -1} if the match was successful but the group 1173 * itself did not match anything 1174 * 1175 * @throws IllegalStateException 1176 * If no match has yet been attempted, 1177 * or if the previous match operation failed 1178 * 1179 * @throws IllegalArgumentException 1180 * If there is no capturing group in the pattern 1181 * with the given name 1182 * @since 1.8 1183 */ 1184 public int start(String name) { 1185 ensureMatch(); 1186 return matchOffsets[getMatchedGroupIndex(pattern.address, name) * 2]; 1187 } 1188 1189 private static int getMatchedGroupIndex(long patternAddr, String name) { 1190 int result = getMatchedGroupIndex0(patternAddr, name); 1191 if (result < 0) { 1192 throw new IllegalArgumentException("No capturing group in the pattern " + 1193 "with the name " + name); 1194 } 1195 return result; 1196 } 1197 1198 private static native int getMatchedGroupIndex0(long patternAddr, String name); 1199 private static native boolean findImpl(long addr, int startIndex, int[] offsets); 1200 private static native boolean findNextImpl(long addr, int[] offsets); 1201 private static native long getNativeFinalizer(); 1202 private static native int groupCountImpl(long addr); 1203 private static native boolean hitEndImpl(long addr); 1204 private static native boolean lookingAtImpl(long addr, int[] offsets); 1205 private static native boolean matchesImpl(long addr, int[] offsets); 1206 private static native int nativeSize(); 1207 private static native long openImpl(long patternAddr); 1208 private static native boolean requireEndImpl(long addr); 1209 private static native void setInputImpl(long addr, String s, int start, int end); 1210 private static native void useAnchoringBoundsImpl(long addr, boolean value); 1211 private static native void useTransparentBoundsImpl(long addr, boolean 
value); 1212 1213 /** 1214 * A trivial match result implementation that's based on an array of integers 1215 * representing match offsets. The array is of the form 1216 * {@code { start1, end1, start2, end2 ....}) where each consecutive pair of elements represents 1217 * the start and end of a match respectively. 1218 */ 1219 static final class OffsetBasedMatchResult implements MatchResult { 1220 private final String input; 1221 private final int[] offsets; 1222 1223 OffsetBasedMatchResult(String input, int[] offsets) { 1224 this.input = input; 1225 this.offsets = offsets.clone(); 1226 } 1227 1228 @Override 1229 public int start() { 1230 return start(0); 1231 } 1232 1233 @Override 1234 public int start(int group) { 1235 return offsets[2 * group]; 1236 } 1237 1238 @Override 1239 public int end() { 1240 return end(0); 1241 } 1242 1243 @Override 1244 public int end(int group) { 1245 return offsets[2 * group + 1]; 1246 } 1247 1248 @Override 1249 public String group() { 1250 return group(0); 1251 } 1252 1253 @Override 1254 public String group(int group) { 1255 final int start = start(group); 1256 final int end = end(group); 1257 if (start == -1 || end == -1) { 1258 return null; 1259 } 1260 1261 return input.substring(start, end); 1262 } 1263 1264 @Override 1265 public int groupCount() { 1266 return (offsets.length / 2) - 1; 1267 } 1268 } 1269} 1270