1/* Licensed to the Apache Software Foundation (ASF) under one or more 2 * contributor license agreements. See the NOTICE file distributed with 3 * this work for additional information regarding copyright ownership. 4 * The ASF licenses this file to You under the Apache License, Version 2.0 5 * (the "License"); you may not use this file except in compliance with 6 * the License. You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package org.apache.harmony.regex.tests.java.util.regex; 18 19import java.util.regex.Matcher; 20import java.util.regex.Pattern; 21import java.util.regex.PatternSyntaxException; 22 23import junit.framework.TestCase; 24 25/** 26 * Tests simple Pattern compilation and Matcher methods 27 * 28 */ 29public class Pattern2Test extends TestCase { 30 31 public void testSimpleMatch() throws PatternSyntaxException { 32 Pattern p = Pattern.compile("foo.*"); 33 34 Matcher m1 = p.matcher("foo123"); 35 assertTrue(m1.matches()); 36 assertTrue(m1.find(0)); 37 assertTrue(m1.lookingAt()); 38 39 Matcher m2 = p.matcher("fox"); 40 assertFalse(m2.matches()); 41 assertFalse(m2.find(0)); 42 assertFalse(m2.lookingAt()); 43 44 assertTrue(Pattern.matches("foo.*", "foo123")); 45 assertFalse(Pattern.matches("foo.*", "fox")); 46 47 assertFalse(Pattern.matches("bar", "foobar")); 48 49 assertTrue(Pattern.matches("", "")); 50 } 51 public void testCursors() { 52 Pattern p; 53 Matcher m; 54 55 try { 56 p = Pattern.compile("foo"); 57 58 m = p.matcher("foobar"); 59 assertTrue(m.find()); 60 assertEquals(0, m.start()); 61 assertEquals(3, m.end()); 62 assertFalse(m.find()); 63 64 // Note: also testing reset here 65 m.reset(); 66 assertTrue(m.find()); 67 assertEquals(0, m.start()); 68 assertEquals(3, m.end()); 69 assertFalse(m.find()); 70 71 m.reset("barfoobar"); 72 assertTrue(m.find()); 73 assertEquals(3, m.start()); 74 assertEquals(6, m.end()); 75 assertFalse(m.find()); 76 77 m.reset("barfoo"); 78 assertTrue(m.find()); 79 assertEquals(3, m.start()); 80 assertEquals(6, m.end()); 81 assertFalse(m.find()); 82 83 m.reset("foobarfoobarfoo"); 84 assertTrue(m.find()); 85 assertEquals(0, m.start()); 86 assertEquals(3, m.end()); 87 assertTrue(m.find()); 88 assertEquals(6, m.start()); 89 assertEquals(9, m.end()); 90 assertTrue(m.find()); 91 assertEquals(12, m.start()); 92 assertEquals(15, m.end()); 93 assertFalse(m.find()); 94 assertTrue(m.find(0)); 95 assertEquals(0, m.start()); 96 assertEquals(3, m.end()); 97 assertTrue(m.find(4)); 98 assertEquals(6, m.start()); 99 assertEquals(9, m.end()); 100 } catch (PatternSyntaxException e) { 101 System.out.println(e.getMessage()); 102 fail(); 103 } 104 } 105 public void testGroups() throws PatternSyntaxException { 106 Pattern p; 107 Matcher m; 108 109 p = Pattern.compile("(p[0-9]*)#?(q[0-9]*)"); 110 111 m = p.matcher("p1#q3p2q42p5p71p63#q888"); 112 assertTrue(m.find()); 113 assertEquals(0, m.start()); 114 assertEquals(5, m.end()); 115 assertEquals(2, m.groupCount()); 116 assertEquals(0, m.start(0)); 117 assertEquals(5, m.end(0)); 118 assertEquals(0, m.start(1)); 119 assertEquals(2, m.end(1)); 120 assertEquals(3, m.start(2)); 121 assertEquals(5, m.end(2)); 122 assertEquals("p1#q3", m.group()); 123 assertEquals("p1#q3", m.group(0)); 124 assertEquals("p1", m.group(1)); 125 assertEquals("q3", m.group(2)); 126 127 assertTrue(m.find()); 128 assertEquals(5, m.start()); 129 assertEquals(10, m.end()); 130 assertEquals(2, m.groupCount()); 131 assertEquals(10, m.end(0)); 132 assertEquals(5, m.start(1)); 133 assertEquals(7, m.end(1)); 134 assertEquals(7, m.start(2)); 135 assertEquals(10, m.end(2)); 136 assertEquals("p2q42", m.group()); 137 assertEquals("p2q42", m.group(0)); 138 assertEquals("p2", m.group(1)); 139 assertEquals("q42", m.group(2)); 140 141 assertTrue(m.find()); 142 assertEquals(15, m.start()); 143 assertEquals(23, m.end()); 144 assertEquals(2, m.groupCount()); 145 assertEquals(15, m.start(0)); 146 assertEquals(23, m.end(0)); 147 assertEquals(15, m.start(1)); 148 assertEquals(18, m.end(1)); 149 assertEquals(19, m.start(2)); 150 assertEquals(23, m.end(2)); 151 assertEquals("p63#q888", m.group()); 152 assertEquals("p63#q888", m.group(0)); 153 assertEquals("p63", m.group(1)); 154 assertEquals("q888", m.group(2)); 155 assertFalse(m.find()); 156 } 157 158 public void testReplace() throws PatternSyntaxException { 159 Pattern p; 160 Matcher m; 161 162 // Note: examples from book, 163 // Hitchens, Ron, 2002, "Java NIO", O'Reilly, page 171 164 p = Pattern.compile("a*b"); 165 166 m = p.matcher("aabfooaabfooabfoob"); 167 assertTrue(m.replaceAll("-").equals("-foo-foo-foo-")); 168 assertTrue(m.replaceFirst("-").equals("-fooaabfooabfoob")); 169 170 /* 171 * p = Pattern.compile ("\\p{Blank}"); 172 * 173 * m = p.matcher ("fee fie foe fum"); assertTrue 174 * (m.replaceFirst("-").equals ("fee-fie foe fum")); assertTrue 175 * (m.replaceAll("-").equals ("fee-fie-foe-fum")); 176 */ 177 178 p = Pattern.compile("([bB])yte"); 179 180 m = p.matcher("Byte for byte"); 181 assertTrue(m.replaceFirst("$1ite").equals("Bite for byte")); 182 assertTrue(m.replaceAll("$1ite").equals("Bite for bite")); 183 184 p = Pattern.compile("\\d\\d\\d\\d([- ])"); 185 186 m = p.matcher("card #1234-5678-1234"); 187 assertTrue(m.replaceFirst("xxxx$1").equals("card #xxxx-5678-1234")); 188 assertTrue(m.replaceAll("xxxx$1").equals("card #xxxx-xxxx-1234")); 189 190 p = Pattern.compile("(up|left)( *)(right|down)"); 191 192 m = p.matcher("left right, up down"); 193 assertTrue(m.replaceFirst("$3$2$1").equals("right left, up down")); 194 assertTrue(m.replaceAll("$3$2$1").equals("right left, down up")); 195 196 p = Pattern.compile("([CcPp][hl]e[ea]se)"); 197 198 m = p.matcher("I want cheese. Please."); 199 assertTrue(m.replaceFirst("<b> $1 </b>").equals( 200 "I want <b> cheese </b>. Please.")); 201 assertTrue(m.replaceAll("<b> $1 </b>").equals( 202 "I want <b> cheese </b>. <b> Please </b>.")); 203 } 204 205 public void testEscapes() throws PatternSyntaxException { 206 Pattern p; 207 Matcher m; 208 209 // Test \\ sequence 210 p = Pattern.compile("([a-z]+)\\\\([a-z]+);"); 211 m = p.matcher("fred\\ginger;abbott\\costello;jekell\\hyde;"); 212 assertTrue(m.find()); 213 assertEquals("fred", m.group(1)); 214 assertEquals("ginger", m.group(2)); 215 assertTrue(m.find()); 216 assertEquals("abbott", m.group(1)); 217 assertEquals("costello", m.group(2)); 218 assertTrue(m.find()); 219 assertEquals("jekell", m.group(1)); 220 assertEquals("hyde", m.group(2)); 221 assertFalse(m.find()); 222 223 // Test \n, \t, \r, \f, \e, \a sequences 224 p = Pattern.compile("([a-z]+)[\\n\\t\\r\\f\\e\\a]+([a-z]+)"); 225 m = p.matcher("aa\nbb;cc\u0009\rdd;ee\u000C\u001Bff;gg\n\u0007hh"); 226 assertTrue(m.find()); 227 assertEquals("aa", m.group(1)); 228 assertEquals("bb", m.group(2)); 229 assertTrue(m.find()); 230 assertEquals("cc", m.group(1)); 231 assertEquals("dd", m.group(2)); 232 assertTrue(m.find()); 233 assertEquals("ee", m.group(1)); 234 assertEquals("ff", m.group(2)); 235 assertTrue(m.find()); 236 assertEquals("gg", m.group(1)); 237 assertEquals("hh", m.group(2)); 238 assertFalse(m.find()); 239 240 // Test \\u and \\x sequences 241/* p = Pattern.compile("([0-9]+)[\\u0020:\\x21];"); 242 m = p.matcher("11:;22 ;33-;44!;"); 243 assertTrue(m.find()); 244 assertEquals("11", m.group(1)); 245 assertTrue(m.find()); 246 assertEquals("22", m.group(1)); 247 assertTrue(m.find()); 248 assertEquals("44", m.group(1)); 249 assertFalse(m.find()); 250*/ 251 // Test invalid unicode sequences 252/* try { 253 p = Pattern.compile("\\u"); 254 fail("PatternSyntaxException expected"); 255 } catch (PatternSyntaxException e) { 256 } 257 258 try { 259 p = Pattern.compile("\\u;"); 260 fail("PatternSyntaxException expected"); 261 } catch (PatternSyntaxException e) { 262 } 263 264 try { 265 p = Pattern.compile("\\u002"); 266 fail("PatternSyntaxException expected"); 267 } catch (PatternSyntaxException e) { 268 } 269 270 try { 271 p = Pattern.compile("\\u002;"); 272 fail("PatternSyntaxException expected"); 273 } catch (PatternSyntaxException e) { 274 } 275 276 // Test invalid hex sequences 277 try { 278 p = Pattern.compile("\\x"); 279 fail("PatternSyntaxException expected"); 280 } catch (PatternSyntaxException e) { 281 } 282 283 try { 284 p = Pattern.compile("\\x;"); 285 fail("PatternSyntaxException expected"); 286 } catch (PatternSyntaxException e) { 287 } 288 289 try { 290 p = Pattern.compile("\\xa"); 291 fail("PatternSyntaxException expected"); 292 } catch (PatternSyntaxException e) { 293 } 294 295 try { 296 p = Pattern.compile("\\xa;"); 297 fail("PatternSyntaxException expected"); 298 } catch (PatternSyntaxException e) { 299 } 300*/ 301 // Test \0 (octal) sequences (1, 2 and 3 digit) 302 p = Pattern.compile("([0-9]+)[\\07\\040\\0160];"); 303 m = p.matcher("11\u0007;22:;33 ;44p;"); 304 assertTrue(m.find()); 305 assertEquals("11", m.group(1)); 306 assertTrue(m.find()); 307 assertEquals("33", m.group(1)); 308 assertTrue(m.find()); 309 assertEquals("44", m.group(1)); 310 assertFalse(m.find()); 311 312 // Test invalid octal sequences 313 try { 314 p = Pattern.compile("\\08"); 315 fail("PatternSyntaxException expected"); 316 } catch (PatternSyntaxException e) { 317 } 318 319 //originally contributed test did not check the result 320 //TODO: check what RI does here 321// try { 322// p = Pattern.compile("\\0477"); 323// fail("PatternSyntaxException expected"); 324// } catch (PatternSyntaxException e) { 325// } 326 327 try { 328 p = Pattern.compile("\\0"); 329 fail("PatternSyntaxException expected"); 330 } catch (PatternSyntaxException e) { 331 } 332 333 try { 334 p = Pattern.compile("\\0;"); 335 fail("PatternSyntaxException expected"); 336 } catch (PatternSyntaxException e) { 337 } 338 339 340 // Test \c (control character) sequence 341 p = Pattern.compile("([0-9]+)[\\cA\\cB\\cC\\cD];"); 342 m = p.matcher("11\u0001;22:;33\u0002;44p;55\u0003;66\u0004;"); 343 assertTrue(m.find()); 344 assertEquals("11", m.group(1)); 345 assertTrue(m.find()); 346 assertEquals("33", m.group(1)); 347 assertTrue(m.find()); 348 assertEquals("55", m.group(1)); 349 assertTrue(m.find()); 350 assertEquals("66", m.group(1)); 351 assertFalse(m.find()); 352 353 // More thorough control escape test 354 // Ensure that each escape matches exactly the corresponding 355 // character 356 // code and no others (well, from 0-255 at least) 357 int i, j; 358 for (i = 0; i < 26; i++) { 359 p = Pattern.compile("\\c" + Character.toString((char) ('A' + i))); 360 int match_char = -1; 361 for (j = 0; j < 255; j++) { 362 m = p.matcher(Character.toString((char) j)); 363 if (m.matches()) { 364 assertEquals(-1, match_char); 365 match_char = j; 366 } 367 } 368 assertTrue(match_char == i + 1); 369 } 370 371 // Test invalid control escapes 372// BEGIN android-removed 373// ICU doesn't complain about illegal control sequences 374// try { 375// p = Pattern.compile("\\c"); 376// fail("PatternSyntaxException expected"); 377// } catch (PatternSyntaxException e) { 378// } 379// END android-removed 380 381 //originally contributed test did not check the result 382 //TODO: check what RI does here 383// try { 384// p = Pattern.compile("\\c;"); 385// fail("PatternSyntaxException expected"); 386// } catch (PatternSyntaxException e) { 387// } 388// 389// try { 390// p = Pattern.compile("\\ca;"); 391// fail("PatternSyntaxException expected"); 392// } catch (PatternSyntaxException e) { 393// } 394// 395// try { 396// p = Pattern.compile("\\c4;"); 397// fail("PatternSyntaxException expected"); 398// } catch (PatternSyntaxException e) { 399// } 400 } 401 public void testCharacterClasses() throws PatternSyntaxException { 402 Pattern p; 403 Matcher m; 404 405 // Test one character range 406 p = Pattern.compile("[p].*[l]"); 407 m = p.matcher("paul"); 408 assertTrue(m.matches()); 409 m = p.matcher("pool"); 410 assertTrue(m.matches()); 411 m = p.matcher("pong"); 412 assertFalse(m.matches()); 413 m = p.matcher("pl"); 414 assertTrue(m.matches()); 415 416 // Test two character range 417 p = Pattern.compile("[pm].*[lp]"); 418 m = p.matcher("prop"); 419 assertTrue(m.matches()); 420 m = p.matcher("mall"); 421 assertTrue(m.matches()); 422 m = p.matcher("pong"); 423 assertFalse(m.matches()); 424 m = p.matcher("pill"); 425 assertTrue(m.matches()); 426 427 // Test range including [ and ] 428 p = Pattern.compile("[<\\[].*[\\]>]"); 429 m = p.matcher("<foo>"); 430 assertTrue(m.matches()); 431 m = p.matcher("[bar]"); 432 assertTrue(m.matches()); 433 m = p.matcher("{foobar]"); 434 assertFalse(m.matches()); 435 m = p.matcher("<pill]"); 436 assertTrue(m.matches()); 437 438 // Test range using ^ 439 p = Pattern.compile("[^bc][a-z]+[tr]"); 440 m = p.matcher("pat"); 441 assertTrue(m.matches()); 442 m = p.matcher("liar"); 443 assertTrue(m.matches()); 444 m = p.matcher("car"); 445 assertFalse(m.matches()); 446 m = p.matcher("gnat"); 447 assertTrue(m.matches()); 448 449 // Test character range using - 450 p = Pattern.compile("[a-z]_+[a-zA-Z]-+[0-9p-z]"); 451 m = p.matcher("d__F-8"); 452 assertTrue(m.matches()); 453 m = p.matcher("c_a-q"); 454 assertTrue(m.matches()); 455 m = p.matcher("a__R-a"); 456 assertFalse(m.matches()); 457 m = p.matcher("r_____d-----5"); 458 assertTrue(m.matches()); 459 460 // Test range using unicode characters and unicode and hex escapes 461 p = Pattern.compile("[\\u1234-\\u2345]_+[a-z]-+[\u0001-\\x11]"); 462 m = p.matcher("\u2000_q-\u0007"); 463 assertTrue(m.matches()); 464 m = p.matcher("\u1234_z-\u0001"); 465 assertTrue(m.matches()); 466 m = p.matcher("r_p-q"); 467 assertFalse(m.matches()); 468 m = p.matcher("\u2345_____d-----\n"); 469 assertTrue(m.matches()); 470 471// BEGIN android-removed 472// The "---" collides with ICU's "--" operator and is likely to be a user error 473// anyway, so we simply comment this one out. 474// // Test ranges including the "-" character 475// p = Pattern.compile("[\\*-/]_+[---]!+[--AP]"); 476// m = p.matcher("-_-!!A"); 477// assertTrue(m.matches()); 478// m = p.matcher("\u002b_-!!!-"); 479// assertTrue(m.matches()); 480// m = p.matcher("!_-!@"); 481// assertFalse(m.matches()); 482// m = p.matcher(",______-!!!!!!!P"); 483// assertTrue(m.matches()); 484// END android-removed 485 486 // Test nested ranges 487 p = Pattern.compile("[pm[t]][a-z]+[[r]lp]"); 488 m = p.matcher("prop"); 489 assertTrue(m.matches()); 490 m = p.matcher("tsar"); 491 assertTrue(m.matches()); 492 m = p.matcher("pong"); 493 assertFalse(m.matches()); 494 m = p.matcher("moor"); 495 assertTrue(m.matches()); 496 497 // Test character class intersection with && 498 // TODO: figure out what x&&y or any class with a null intersection 499 // set (like [[a-c]&&[d-f]]) might mean. It doesn't mean "match 500 // nothing" and doesn't mean "match anything" so I'm stumped. 501 p = Pattern.compile("[[a-p]&&[g-z]]+-+[[a-z]&&q]-+[x&&[a-z]]-+"); 502 m = p.matcher("h--q--x--"); 503 assertTrue(m.matches()); 504 m = p.matcher("hog--q-x-"); 505 assertTrue(m.matches()); 506 m = p.matcher("ape--q-x-"); 507 assertFalse(m.matches()); 508 m = p.matcher("mop--q-x----"); 509 assertTrue(m.matches()); 510 511 // Test error cases with && 512// BEGIN android-removed 513// This is more of a bug, and ICU doesn't have this behavior. 514// p = Pattern.compile("[&&[xyz]]"); 515// m = p.matcher("&"); 516// // System.out.println(m.matches()); 517// m = p.matcher("x"); 518// // System.out.println(m.matches()); 519// m = p.matcher("y"); 520// // System.out.println(m.matches()); 521// END android-removed 522 p = Pattern.compile("[[xyz]&[axy]]"); 523 m = p.matcher("x"); 524 // System.out.println(m.matches()); 525 m = p.matcher("z"); 526 // System.out.println(m.matches()); 527 m = p.matcher("&"); 528 // System.out.println(m.matches()); 529 p = Pattern.compile("[abc[123]&&[345]def]"); 530 m = p.matcher("a"); 531 // System.out.println(m.matches()); 532 533// BEGIN android-removed 534// This is more of a bug, and ICU doesn't have this behavior. 535// p = Pattern.compile("[[xyz]&&]"); 536// END android-removed 537 p = Pattern.compile("[[abc]&]"); 538 539 try { 540 p = Pattern.compile("[[abc]&&"); 541 fail("PatternSyntaxException expected"); 542 } catch (PatternSyntaxException e) { 543 } 544 545 p = Pattern.compile("[[abc]\\&&[xyz]]"); 546 547 p = Pattern.compile("[[abc]&\\&[xyz]]"); 548 549 // Test 3-way intersection 550 p = Pattern.compile("[[a-p]&&[g-z]&&[d-k]]"); 551 m = p.matcher("g"); 552 assertTrue(m.matches()); 553 m = p.matcher("m"); 554 assertFalse(m.matches()); 555 556 // Test nested intersection 557 p = Pattern.compile("[[[a-p]&&[g-z]]&&[d-k]]"); 558 m = p.matcher("g"); 559 assertTrue(m.matches()); 560 m = p.matcher("m"); 561 assertFalse(m.matches()); 562 563 // Test character class subtraction with && and ^ 564 p = Pattern.compile("[[a-z]&&[^aeiou]][aeiou][[^xyz]&&[a-z]]"); 565 m = p.matcher("pop"); 566 assertTrue(m.matches()); 567 m = p.matcher("tag"); 568 assertTrue(m.matches()); 569 m = p.matcher("eat"); 570 assertFalse(m.matches()); 571 m = p.matcher("tax"); 572 assertFalse(m.matches()); 573 m = p.matcher("zip"); 574 assertTrue(m.matches()); 575 576 // Test . (DOT), with and without DOTALL 577 // Note: DOT not allowed in character classes 578 p = Pattern.compile(".+/x.z"); 579 m = p.matcher("!$/xyz"); 580 assertTrue(m.matches()); 581 m = p.matcher("%\n\r/x\nz"); 582 assertFalse(m.matches()); 583 p = Pattern.compile(".+/x.z", Pattern.DOTALL); 584 m = p.matcher("%\n\r/x\nz"); 585 assertTrue(m.matches()); 586 587 // Test \d (digit) 588 p = Pattern.compile("\\d+[a-z][\\dx]"); 589 m = p.matcher("42a6"); 590 assertTrue(m.matches()); 591 m = p.matcher("21zx"); 592 assertTrue(m.matches()); 593 m = p.matcher("ab6"); 594 assertFalse(m.matches()); 595 m = p.matcher("56912f9"); 596 assertTrue(m.matches()); 597 598 // Test \D (not a digit) 599 p = Pattern.compile("\\D+[a-z]-[\\D3]"); 600 m = p.matcher("za-p"); 601 assertTrue(m.matches()); 602 m = p.matcher("%!e-3"); 603 assertTrue(m.matches()); 604 m = p.matcher("9a-x"); 605 assertFalse(m.matches()); 606 m = p.matcher("\u1234pp\ny-3"); 607 assertTrue(m.matches()); 608 609 // Test \s (whitespace) 610 p = Pattern.compile("<[a-zA-Z]+\\s+[0-9]+[\\sx][^\\s]>"); 611 m = p.matcher("<cat \t1\fx>"); 612 assertTrue(m.matches()); 613 m = p.matcher("<cat \t1\f >"); 614 assertFalse(m.matches()); 615 m = p 616 .matcher("xyz <foo\n\r22 5> <pp \t\n\f\r \u000b41x\u1234><pp \nx7\rc> zzz"); 617 assertTrue(m.find()); 618 assertTrue(m.find()); 619 assertFalse(m.find()); 620 621 // Test \S (not whitespace) 622 p = Pattern.compile("<[a-z] \\S[0-9][\\S\n]+[^\\S]221>"); 623 m = p.matcher("<f $0**\n** 221>"); 624 assertTrue(m.matches()); 625 m = p.matcher("<x 441\t221>"); 626 assertTrue(m.matches()); 627 m = p.matcher("<z \t9\ng 221>"); 628 assertFalse(m.matches()); 629 m = p.matcher("<z 60\ngg\u1234\f221>"); 630 assertTrue(m.matches()); 631 p = Pattern.compile("<[a-z] \\S[0-9][\\S\n]+[^\\S]221[\\S&&[^abc]]>"); 632 m = p.matcher("<f $0**\n** 221x>"); 633 assertTrue(m.matches()); 634 m = p.matcher("<x 441\t221z>"); 635 assertTrue(m.matches()); 636 m = p.matcher("<x 441\t221 >"); 637 assertFalse(m.matches()); 638 m = p.matcher("<x 441\t221c>"); 639 assertFalse(m.matches()); 640 m = p.matcher("<z \t9\ng 221x>"); 641 assertFalse(m.matches()); 642 m = p.matcher("<z 60\ngg\u1234\f221\u0001>"); 643 assertTrue(m.matches()); 644 645 // Test \w (ascii word) 646 p = Pattern.compile("<\\w+\\s[0-9]+;[^\\w]\\w+/[\\w$]+;"); 647 m = p.matcher("<f1 99;!foo5/a$7;"); 648 assertTrue(m.matches()); 649 m = p.matcher("<f$ 99;!foo5/a$7;"); 650 assertFalse(m.matches()); 651 m = p 652 .matcher("<abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789 99;!foo5/a$7;"); 653 assertTrue(m.matches()); 654 655 // Test \W (not an ascii word) 656 p = Pattern.compile("<\\W\\w+\\s[0-9]+;[\\W_][^\\W]+\\s[0-9]+;"); 657 m = p.matcher("<$foo3\n99;_bar\t0;"); 658 assertTrue(m.matches()); 659 m = p.matcher("<hh 99;_g 0;"); 660 assertFalse(m.matches()); 661 m = p.matcher("<*xx\t00;^zz\f11;"); 662 assertTrue(m.matches()); 663 664 // Test x|y pattern 665 // TODO 666 } 667 public void testPOSIXGroups() throws PatternSyntaxException { 668 Pattern p; 669 Matcher m; 670 671 // Test POSIX groups using \p and \P (in the group and not in the group) 672 // Groups are Lower, Upper, ASCII, Alpha, Digit, XDigit, Alnum, Punct, 673 // Graph, Print, Blank, Space, Cntrl 674 // Test \p{Lower} 675 /* 676 * FIXME: Requires complex range processing 677 * p = Pattern.compile("<\\p{Lower}\\d\\P{Lower}:[\\p{Lower}Z]\\s[^\\P{Lower}]>"); 678 * m = p.matcher("<a4P:g x>"); assertTrue(m.matches()); m = 679 * p.matcher("<p4%:Z\tq>"); assertTrue(m.matches()); m = 680 * p.matcher("<A6#:e e>"); assertFalse(m.matches()); 681 */ 682 p = Pattern.compile("\\p{Lower}+"); 683 m = p.matcher("abcdefghijklmnopqrstuvwxyz"); 684 assertTrue(m.matches()); 685 686 // Invalid uses of \p{Lower} 687 try { 688 p = Pattern.compile("\\p"); 689 fail("PatternSyntaxException expected"); 690 } catch (PatternSyntaxException e) { 691 } 692 693 try { 694 p = Pattern.compile("\\p;"); 695 fail("PatternSyntaxException expected"); 696 } catch (PatternSyntaxException e) { 697 } 698 699 try { 700 p = Pattern.compile("\\p{"); 701 fail("PatternSyntaxException expected"); 702 } catch (PatternSyntaxException e) { 703 } 704 705 try { 706 p = Pattern.compile("\\p{;"); 707 fail("PatternSyntaxException expected"); 708 } catch (PatternSyntaxException e) { 709 } 710 711 try { 712 p = Pattern.compile("\\p{Lower"); 713 fail("PatternSyntaxException expected"); 714 } catch (PatternSyntaxException e) { 715 } 716 717 try { 718 p = Pattern.compile("\\p{Lower;"); 719 fail("PatternSyntaxException expected"); 720 } catch (PatternSyntaxException e) { 721 } 722 723 // Test \p{Upper} 724 /* 725 * FIXME: Requires complex range processing 726 * p = Pattern.compile("<\\p{Upper}\\d\\P{Upper}:[\\p{Upper}z]\\s[^\\P{Upper}]>"); 727 * m = p.matcher("<A4p:G X>"); assertTrue(m.matches()); m = 728 * p.matcher("<P4%:z\tQ>"); assertTrue(m.matches()); m = 729 * p.matcher("<a6#:E E>"); assertFalse(m.matches()); 730 */ 731 p = Pattern.compile("\\p{Upper}+"); 732 m = p.matcher("ABCDEFGHIJKLMNOPQRSTUVWXYZ"); 733 assertTrue(m.matches()); 734 735 // Invalid uses of \p{Upper} 736 try { 737 p = Pattern.compile("\\p{Upper"); 738 fail("PatternSyntaxException expected"); 739 } catch (PatternSyntaxException e) { 740 } 741 742 try { 743 p = Pattern.compile("\\p{Upper;"); 744 fail("PatternSyntaxException expected"); 745 } catch (PatternSyntaxException e) { 746 } 747 748 // Test \p{ASCII} 749 /* 750 * FIXME: Requires complex range processing p = Pattern.compile("<\\p{ASCII}\\d\\P{ASCII}:[\\p{ASCII}\u1234]\\s[^\\P{ASCII}]>"); 751 * m = p.matcher("<A4\u0080:G X>"); assertTrue(m.matches()); m = 752 * p.matcher("<P4\u00ff:\u1234\t\n>"); assertTrue(m.matches()); m = 753 * p.matcher("<\u00846#:E E>"); assertFalse(m.matches()) 754 */ 755 int i; 756 p = Pattern.compile("\\p{ASCII}"); 757 for (i = 0; i < 0x80; i++) { 758 m = p.matcher(Character.toString((char) i)); 759 assertTrue(m.matches()); 760 } 761 for (; i < 0xff; i++) { 762 m = p.matcher(Character.toString((char) i)); 763 assertFalse(m.matches()); 764 } 765 766 // Invalid uses of \p{ASCII} 767 try { 768 p = Pattern.compile("\\p{ASCII"); 769 fail("PatternSyntaxException expected"); 770 } catch (PatternSyntaxException e) { 771 } 772 773 try { 774 p = Pattern.compile("\\p{ASCII;"); 775 fail("PatternSyntaxException expected"); 776 } catch (PatternSyntaxException e) { 777 } 778 779 // Test \p{Alpha} 780 // TODO 781 782 // Test \p{Digit} 783 // TODO 784 785 // Test \p{XDigit} 786 // TODO 787 788 // Test \p{Alnum} 789 // TODO 790 791 // Test \p{Punct} 792 // TODO 793 794 // Test \p{Graph} 795 // TODO 796 797 // Test \p{Print} 798 // TODO 799 800 // Test \p{Blank} 801 // TODO 802 803 // Test \p{Space} 804 // TODO 805 806 // Test \p{Cntrl} 807 // TODO 808 } 809 public void testUnicodeCategories() throws PatternSyntaxException { 810 // Test Unicode categories using \p and \P 811 // One letter codes: L, M, N, P, S, Z, C 812 // Two letter codes: Lu, Nd, Sc, Sm, ... 813 // See java.lang.Character and Unicode standard for complete list 814 // TODO 815 // Test \p{L} 816 // TODO 817 818 // Test \p{N} 819 // TODO 820 821 // Test two letter codes: 822 // From unicode.org: 823 // Lu 824 // Ll 825 // Lt 826 // Lm 827 // Lo 828 // Mn 829 // Mc 830 // Me 831 // Nd 832 // Nl 833 // No 834 // Pc 835 // Pd 836 // Ps 837 // Pe 838 // Pi 839 // Pf 840 // Po 841 // Sm 842 // Sc 843 // Sk 844 // So 845 // Zs 846 // Zl 847 // Zp 848 // Cc 849 // Cf 850 // Cs 851 // Co 852 // Cn 853 854 // TODO add more tests per category 855 //{"Cc", "\u0000", "-\u0041"}, 856 testCategory("Cf", "\u202B"); 857 testCategory("Co", "\uE000"); 858 testCategory("Cs", "\uD800"); 859 testCategory("Ll", "a", "b", "x", "y", "z", "-A", "-Z"); 860 testCategory("Lm", "\u02B9"); 861 testCategory("Lu", "B", "C", "-c"); 862 testCategory("Lo", "\u05E2"); 863 testCategory("Lt", "\u01C5"); 864 testCategory("Mc", "\u0903"); 865 testCategory("Me", "\u0488"); 866 testCategory("Mn", "\u0300"); 867 testCategory("Nd", "\u0030"); 868 testCategory("Nl", "\u2164"); 869 testCategory("No", "\u0BF0"); 870 // testCategory("Pc", "\u30FB"); 871 testCategory("Pd", "\u2015"); 872 testCategory("Pe", "\u207E"); 873 testCategory("Po", "\u00B7"); 874 testCategory("Ps", "\u0F3C"); 875 testCategory("Sc", "\u20A0"); 876 testCategory("Sk", "\u00B8"); 877 testCategory("Sm", "\u002B"); 878 testCategory("So", "\u0B70"); 879 testCategory("Zl", "\u2028"); 880 // testCategory("Pi", "\u200C"); 881 testCategory("Zp", "\u2029"); 882 } 883 884 private void testCategory(String cat, String... matches) { 885 String pa = "{"+cat+"}"; 886 String pat = "\\p"+pa; 887 String npat = "\\P"+pa; 888 Pattern p = Pattern.compile(pat); 889 Pattern pn = Pattern.compile(npat); 890 for (int j = 0; j < matches.length; j++) { 891 String t = matches[j]; 892 boolean invert = t.startsWith("-"); 893 if (invert) { 894 // test negative case, expected to fail 895 t = t.substring(1); 896 assertFalse("expected '"+t+"' to not be matched " + 897 "by pattern '"+pat, p.matcher(t).matches()); 898 assertTrue("expected '"+t+"' to " + 899 "be matched by pattern '"+npat, pn.matcher(t).matches()); 900 } else { 901 assertTrue("expected '"+t+"' to be matched " + 902 "by pattern '"+pat, p.matcher(t).matches()); 903 assertFalse("expected '"+t+"' to " + 904 "not be matched by pattern '"+npat, pn.matcher(t).matches()); 905 } 906 } 907 } 908 909 public void testUnicodeBlocks() throws PatternSyntaxException { 910 Pattern p; 911 Matcher m; 912 int i, j; 913 914 // Test Unicode blocks using \p and \P 915 // FIXME: 916 // Note that LatinExtended-B and ArabicPresentations-B are unrecognized 917 // by the reference JDK. 918 for (i = 0; i < UBlocks.length; i++) { 919 /* 920 * p = Pattern.compile("\\p{"+UBlocks[i].name+"}"); 921 * 922 * if (UBlocks[i].low > 0) { m = 923 * p.matcher(Character.toString((char)(UBlocks[i].low-1))); 924 * assertFalse(m.matches()); } for (j=UBlocks[i].low; j <= 925 * UBlocks[i].high; j++) { m = 926 * p.matcher(Character.toString((char)j)); 927 * assertTrue(m.matches()); } if (UBlocks[i].high < 0xFFFF) { m = 928 * p.matcher(Character.toString((char)(UBlocks[i].high+1))); 929 * assertFalse(m.matches()); } 930 * 931 * p = Pattern.compile("\\P{"+UBlocks[i].name+"}"); 932 * 933 * if (UBlocks[i].low > 0) { m = 934 * p.matcher(Character.toString((char)(UBlocks[i].low-1))); 935 * assertTrue(m.matches()); } for (j=UBlocks[i].low; j < 936 * UBlocks[i].high; j++) { m = 937 * p.matcher(Character.toString((char)j)); 938 * assertFalse(m.matches()); } if (UBlocks[i].high < 0xFFFF) { m = 939 * p.matcher(Character.toString((char)(UBlocks[i].high+1))); 940 * assertTrue(m.matches()); } 941 */ 942 943 p = Pattern.compile("\\p{In" + UBlocks[i].name + "}"); 944// BEGIN android-changed 945// Added the name of the block under test to the assertion to get more output. 946 947 if (UBlocks[i].low > 0) { 948 m = p.matcher(Character.toString((char) (UBlocks[i].low - 1))); 949 assertFalse(UBlocks[i].name, m.matches()); 950 } 951 for (j = UBlocks[i].low; j <= UBlocks[i].high; j++) { 952 m = p.matcher(Character.toString((char) j)); 953 assertTrue(UBlocks[i].name, m.matches()); 954 } 955 if (UBlocks[i].high < 0xFFFF) { 956 m = p.matcher(Character.toString((char) (UBlocks[i].high + 1))); 957 assertFalse(UBlocks[i].name, m.matches()); 958 } 959 960 p = Pattern.compile("\\P{In" + UBlocks[i].name + "}"); 961 962 if (UBlocks[i].low > 0) { 963 m = p.matcher(Character.toString((char) (UBlocks[i].low - 1))); 964 assertTrue(UBlocks[i].name, m.matches()); 965 } 966 for (j = UBlocks[i].low; j < UBlocks[i].high; j++) { 967 m = p.matcher(Character.toString((char) j)); 968 assertFalse(UBlocks[i].name, m.matches()); 969 } 970 if (UBlocks[i].high < 0xFFFF) { 971 m = p.matcher(Character.toString((char) (UBlocks[i].high + 1))); 972 assertTrue(UBlocks[i].name, m.matches()); 973 } 974 975// END android-changed 976 } 977 } 978 public void testCapturingGroups() throws PatternSyntaxException { 979 Pattern p; 980 Matcher m; 981 982 // Test simple capturing groups 983 p = Pattern.compile("(a+)b"); 984 m = p.matcher("aaaaaaaab"); 985 assertTrue(m.matches()); 986 assertEquals(1, m.groupCount()); 987 assertEquals("aaaaaaaa", m.group(1)); 988 989 p = Pattern.compile("((an)+)((as)+)"); 990 m = p.matcher("ananas"); 991 assertTrue(m.matches()); 992 assertEquals(4, m.groupCount()); 993 assertEquals("ananas", m.group(0)); 994 assertEquals("anan", m.group(1)); 995 assertEquals("an", m.group(2)); 996 assertEquals("as", m.group(3)); 997 assertEquals("as", m.group(4)); 998 999 // Test grouping without capture (?:...) 1000 p = Pattern.compile("(?:(?:an)+)(as)"); 1001 m = p.matcher("ananas"); 1002 assertTrue(m.matches()); 1003 assertEquals(1, m.groupCount()); 1004 assertEquals("as", m.group(1)); 1005 try { 1006 m.group(2); 1007 fail("expected IndexOutOfBoundsException"); 1008 } catch (IndexOutOfBoundsException ioobe) { 1009 // expected 1010 } 1011 1012 // Test combination of grouping and capture 1013 // TODO 1014 1015 // Test \<num> sequence with capturing and non-capturing groups 1016 // TODO 1017 1018 // Test \<num> with <num> out of range 1019 p = Pattern.compile("((an)+)as\\1"); 1020 m = p.matcher("ananasanan"); 1021 assertTrue(m.matches()); 1022 1023 try { 1024 p = Pattern.compile("((an)+)as\\4"); 1025 fail("expected PatternSyntaxException"); 1026 } catch (PatternSyntaxException pse) { 1027 // expected 1028 } 1029 1030 } 1031 public void testRepeats() { 1032 Pattern p; 1033 Matcher m; 1034 1035 // Test ? 1036 p = Pattern.compile("(abc)?c"); 1037 m = p.matcher("abcc"); 1038 assertTrue(m.matches()); 1039 m = p.matcher("c"); 1040 assertTrue(m.matches()); 1041 m = p.matcher("cc"); 1042 assertFalse(m.matches()); 1043 m = p.matcher("abcabcc"); 1044 assertFalse(m.matches()); 1045 1046 // Test * 1047 p = Pattern.compile("(abc)*c"); 1048 m = p.matcher("abcc"); 1049 assertTrue(m.matches()); 1050 m = p.matcher("c"); 1051 assertTrue(m.matches()); 1052 m = p.matcher("cc"); 1053 assertFalse(m.matches()); 1054 m = p.matcher("abcabcc"); 1055 assertTrue(m.matches()); 1056 1057 // Test + 1058 p = Pattern.compile("(abc)+c"); 1059 m = p.matcher("abcc"); 1060 assertTrue(m.matches()); 1061 m = p.matcher("c"); 1062 assertFalse(m.matches()); 1063 m = p.matcher("cc"); 1064 assertFalse(m.matches()); 1065 m = p.matcher("abcabcc"); 1066 assertTrue(m.matches()); 1067 1068 // Test {<num>}, including 0, 1 and more 1069 p = Pattern.compile("(abc){0}c"); 1070 m = p.matcher("abcc"); 1071 assertFalse(m.matches()); 1072 m = p.matcher("c"); 1073 assertTrue(m.matches()); 1074 1075 p = Pattern.compile("(abc){1}c"); 1076 m = p.matcher("abcc"); 1077 assertTrue(m.matches()); 1078 m = p.matcher("c"); 1079 assertFalse(m.matches()); 1080 m = p.matcher("abcabcc"); 1081 assertFalse(m.matches()); 1082 1083 p = Pattern.compile("(abc){2}c"); 1084 m = p.matcher("abcc"); 1085 assertFalse(m.matches()); 1086 m = p.matcher("c"); 1087 assertFalse(m.matches()); 1088 m = p.matcher("cc"); 1089 assertFalse(m.matches()); 1090 m = p.matcher("abcabcc"); 1091 assertTrue(m.matches()); 1092 1093 // Test {<num>,}, including 0, 1 and more 1094 // TODO 1095 1096 // Test {<n1>,<n2>}, with n1 < n2, n1 = n2 and n1 > n2 (illegal?) 1097 // TODO 1098 } 1099 public void testAnchors() throws PatternSyntaxException { 1100 Pattern p; 1101 Matcher m; 1102 1103 // Test ^, default and MULTILINE 1104 p = Pattern.compile("^abc\\n^abc", Pattern.MULTILINE); 1105 m = p.matcher("abc\nabc"); 1106 assertTrue(m.matches()); 1107 1108 p = Pattern.compile("^abc\\n^abc"); 1109 m = p.matcher("abc\nabc"); 1110 assertFalse(m.matches()); 1111 1112 // Test $, default and MULTILINE 1113 // TODO 1114 1115 // Test \b (word boundary) 1116 // TODO 1117 1118 // Test \B (not a word boundary) 1119 // TODO 1120 1121 // Test \A (beginning of string) 1122 // TODO 1123 1124 // Test \Z (end of string) 1125 // TODO 1126 1127 // Test \z (end of string) 1128 // TODO 1129 1130 // Test \G 1131 // TODO 1132 1133 // Test positive lookahead using (?=...) 1134 // TODO 1135 1136 // Test negative lookahead using (?!...) 1137 // TODO 1138 1139 // Test positive lookbehind using (?<=...) 1140 // TODO 1141 1142 // Test negative lookbehind using (?<!...) 1143 // TODO 1144 } 1145 public void testMisc() throws PatternSyntaxException { 1146 Pattern p; 1147 Matcher m; 1148 1149 // Test (?>...) 1150 // TODO 1151 1152 // Test (?onflags-offflags) 1153 // Valid flags are i,m,d,s,u,x 1154 // TODO 1155 1156 // Test (?onflags-offflags:...) 1157 // TODO 1158 1159 // Test \Q, \E 1160 p = Pattern.compile("[a-z]+;\\Q[a-z]+;\\Q(foo.*);\\E[0-9]+"); 1161 m = p.matcher("abc;[a-z]+;\\Q(foo.*);411"); 1162 assertTrue(m.matches()); 1163 m = p.matcher("abc;def;foo42;555"); 1164 assertFalse(m.matches()); 1165 m = p.matcher("abc;\\Qdef;\\Qfoo99;\\E123"); 1166 assertFalse(m.matches()); 1167 1168 p = Pattern.compile("[a-z]+;(foo[0-9]-\\Q(...)\\E);[0-9]+"); 1169 m = p.matcher("abc;foo5-(...);123"); 1170 assertTrue(m.matches()); 1171 assertEquals("foo5-(...)", m.group(1)); 1172 m = p.matcher("abc;foo9-(xxx);789"); 1173 assertFalse(m.matches()); 1174 1175 p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\Q$-\\E]+);[0-9]+"); 1176 m = p.matcher("abc;bar0-def$-;123"); 1177 assertTrue(m.matches()); 1178 1179 // FIXME: 1180 // This should work the same as the pattern above but fails with the 1181 // the reference JDK 1182 p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\Q-$\\E]+);[0-9]+"); 1183 m = p.matcher("abc;bar0-def$-;123"); 1184 // assertTrue(m.matches()); 1185 1186 // FIXME: 1187 // This should work too .. it looks as if just about anything that 1188 // has more 1189 // than one character between \Q and \E is broken in the the reference JDK 1190 p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\Q[0-9]\\E]+);[0-9]+"); 1191 m = p.matcher("abc;bar0-def[99]-]0x[;123"); 1192 // assertTrue(m.matches()); 1193 1194 // This is the same as above but with explicit escapes .. and this 1195 // does work 1196 // on the the reference JDK 1197 p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\[0\\-9\\]]+);[0-9]+"); 1198 m = p.matcher("abc;bar0-def[99]-]0x[;123"); 1199 assertTrue(m.matches()); 1200 1201 // Test #<comment text> 1202 // TODO 1203 } 1204 public void testCompile1() throws PatternSyntaxException { 1205 Pattern pattern = Pattern 1206 .compile("[0-9A-Za-z][0-9A-Za-z\\x2e\\x3a\\x2d\\x5f]*"); 1207 String name = "iso-8859-1"; 1208 assertTrue(pattern.matcher(name).matches()); 1209 } 1210 public void testCompile2() throws PatternSyntaxException { 1211 String findString = "\\Qimport\\E"; 1212 1213 Pattern pattern = Pattern.compile(findString, 0); 1214 Matcher matcher = pattern.matcher(new String( 1215 "import a.A;\n\n import b.B;\nclass C {}")); 1216 1217 assertTrue(matcher.find(0)); 1218 } 1219 public void testCompile3() throws PatternSyntaxException { 1220 Pattern p; 1221 Matcher m; 1222 p = Pattern.compile("a$"); 1223 m = p.matcher("a\n"); 1224 assertTrue(m.find()); 1225 assertEquals("a", m.group()); 1226 assertFalse(m.find()); 1227 1228 p = Pattern.compile("(a$)"); 1229 m = p.matcher("a\n"); 1230 assertTrue(m.find()); 1231 assertEquals("a", m.group()); 1232 assertEquals("a", m.group(1)); 1233 assertFalse(m.find()); 1234 1235 p = Pattern.compile("^.*$", Pattern.MULTILINE); 1236 1237 m = p.matcher("a\n"); 1238 assertTrue(m.find()); 1239 // System.out.println("["+m.group()+"]"); 1240 assertEquals("a", m.group()); 1241 assertFalse(m.find()); 1242 1243 m = p.matcher("a\nb\n"); 1244 assertTrue(m.find()); 1245 // System.out.println("["+m.group()+"]"); 1246 assertEquals("a", m.group()); 1247 assertTrue(m.find()); 1248 // System.out.println("["+m.group()+"]"); 1249 assertEquals("b", m.group()); 1250 assertFalse(m.find()); 1251 1252 m = p.matcher("a\nb"); 1253 assertTrue(m.find()); 1254 // System.out.println("["+m.group()+"]"); 1255 assertEquals("a", m.group()); 1256 assertTrue(m.find()); 1257 assertEquals("b", m.group()); 1258 assertFalse(m.find()); 1259 1260 m = p.matcher("\naa\r\nbb\rcc\n\n"); 1261 assertTrue(m.find()); 1262 // System.out.println("["+m.group()+"]"); 1263 assertTrue(m.group().equals("")); 1264 assertTrue(m.find()); 1265 // System.out.println("["+m.group()+"]"); 1266 assertEquals("aa", m.group()); 1267 assertTrue(m.find()); 1268 // System.out.println("["+m.group()+"]"); 1269 assertEquals("bb", m.group()); 1270 assertTrue(m.find()); 1271 // System.out.println("["+m.group()+"]"); 1272 assertEquals("cc", m.group()); 1273 assertTrue(m.find()); 1274 // System.out.println("["+m.group()+"]"); 1275 assertTrue(m.group().equals("")); 1276 assertFalse(m.find()); 1277 1278 m = p.matcher("a"); 1279 assertTrue(m.find()); 1280 assertEquals("a", m.group()); 1281 assertFalse(m.find()); 1282 1283// BEGIN android-removed 1284// Makes no sense to duplicate this weird behavior 1285// m = p.matcher(""); 1286// // FIXME: This matches the reference behaviour but is 1287// // inconsistent with matching "a" - ie. the end of the 1288// // target string should match against $ always but this 1289// // appears to work with the null string only when not in 1290// // multiline mode (see below) 1291// assertFalse(m.find()); 1292// END android-removed 1293 1294 p = Pattern.compile("^.*$"); 1295 m = p.matcher(""); 1296 assertTrue(m.find()); 1297 assertTrue(m.group().equals("")); 1298 assertFalse(m.find()); 1299 } 1300 public void testCompile4() throws PatternSyntaxException { 1301 String findString = "\\Qpublic\\E"; 1302 StringBuffer text = new StringBuffer(" public class Class {\n" 1303 + " public class Class {"); 1304 1305 Pattern pattern = Pattern.compile(findString, 0); 1306 Matcher matcher = pattern.matcher(text); 1307 1308 boolean found = matcher.find(); 1309 assertTrue(found); 1310 assertEquals(4, matcher.start()); 1311 if (found) { 1312 // modify text 1313 text.delete(0, text.length()); 1314 text.append("Text have been changed."); 1315 matcher.reset(text); 1316 } 1317 1318 found = matcher.find(); 1319 assertFalse(found); 1320 } 1321 public void testCompile5() throws PatternSyntaxException { 1322 Pattern p = Pattern.compile("^[0-9]"); 1323 String s[] = p.split("12", -1); 1324 assertEquals("", s[0]); 1325 assertEquals("2", s[1]); 1326 assertEquals(2, s.length); 1327 } 1328 1329 // public void testCompile6() { 1330 // String regex = "[\\p{L}[\\p{Mn}[\\p{Pc}[\\p{Nd}[\\p{Nl}[\\p{Sc}]]]]]]+"; 1331 // String regex = "[\\p{L}\\p{Mn}\\p{Pc}\\p{Nd}\\p{Nl}\\p{Sc}]+"; 1332 // try { 1333 // Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE); 1334 // assertTrue(true); 1335 // } catch (PatternSyntaxException e) { 1336 // System.out.println(e.getMessage()); 1337 // assertTrue(false); 1338 // } 1339 // } 1340 1341 private static class UBInfo { 1342 public UBInfo(int low, int high, String name) { 1343 this.name = name; 1344 this.low = low; 1345 this.high = high; 1346 } 1347 1348 public String name; 1349 1350 public int low, high; 1351 } 1352 1353 // A table representing the unicode categories 1354 //private static UBInfo[] UCategories = { 1355 // Lu 1356 // Ll 1357 // Lt 1358 // Lm 1359 // Lo 1360 // Mn 1361 // Mc 1362 // Me 1363 // Nd 1364 // Nl 1365 // No 1366 // Pc 1367 // Pd 1368 // Ps 1369 // Pe 1370 // Pi 1371 // Pf 1372 // Po 1373 // Sm 1374 // Sc 1375 // Sk 1376 // So 1377 // Zs 1378 // Zl 1379 // Zp 1380 // Cc 1381 // Cf 1382 // Cs 1383 // Co 1384 // Cn 1385 //}; 1386 1387 // A table representing the unicode character blocks 1388 private static UBInfo[] UBlocks = { 1389 /* 0000; 007F; Basic Latin */ 1390 new UBInfo(0x0000, 0x007F, "BasicLatin"), // Character.UnicodeBlock.BASIC_LATIN 1391 /* 0080; 00FF; Latin-1 Supplement */ 1392 new UBInfo(0x0080, 0x00FF, "Latin-1Supplement"), // Character.UnicodeBlock.LATIN_1_SUPPLEMENT 1393 /* 0100; 017F; Latin Extended-A */ 1394 new UBInfo(0x0100, 0x017F, "LatinExtended-A"), // Character.UnicodeBlock.LATIN_EXTENDED_A 1395 /* 0180; 024F; Latin Extended-B */ 1396 // new UBInfo (0x0180,0x024F,"InLatinExtended-B"), // 1397 // Character.UnicodeBlock.LATIN_EXTENDED_B 1398 /* 0250; 02AF; IPA Extensions */ 1399 new UBInfo(0x0250, 0x02AF, "IPAExtensions"), // Character.UnicodeBlock.IPA_EXTENSIONS 1400 /* 02B0; 02FF; Spacing Modifier Letters */ 1401 new UBInfo(0x02B0, 0x02FF, "SpacingModifierLetters"), // Character.UnicodeBlock.SPACING_MODIFIER_LETTERS 1402 /* 0300; 036F; Combining Diacritical Marks */ 1403 new UBInfo(0x0300, 0x036F, "CombiningDiacriticalMarks"), // Character.UnicodeBlock.COMBINING_DIACRITICAL_MARKS 1404 /* 0370; 03FF; Greek */ 1405 new UBInfo(0x0370, 0x03FF, "Greek"), // Character.UnicodeBlock.GREEK 1406 /* 0400; 04FF; Cyrillic */ 1407 new UBInfo(0x0400, 0x04FF, "Cyrillic"), // Character.UnicodeBlock.CYRILLIC 1408 /* 0530; 058F; Armenian */ 1409 new UBInfo(0x0530, 0x058F, "Armenian"), // Character.UnicodeBlock.ARMENIAN 1410 /* 0590; 05FF; Hebrew */ 1411 new UBInfo(0x0590, 0x05FF, "Hebrew"), // Character.UnicodeBlock.HEBREW 1412 /* 0600; 06FF; Arabic */ 1413 new UBInfo(0x0600, 0x06FF, "Arabic"), // Character.UnicodeBlock.ARABIC 1414 /* 0700; 074F; Syriac */ 1415 new UBInfo(0x0700, 0x074F, "Syriac"), // Character.UnicodeBlock.SYRIAC 1416 /* 0780; 07BF; Thaana */ 1417 new UBInfo(0x0780, 0x07BF, "Thaana"), // Character.UnicodeBlock.THAANA 1418 /* 0900; 097F; Devanagari */ 1419 new UBInfo(0x0900, 0x097F, "Devanagari"), // Character.UnicodeBlock.DEVANAGARI 1420 /* 0980; 09FF; Bengali */ 1421 new UBInfo(0x0980, 0x09FF, "Bengali"), // Character.UnicodeBlock.BENGALI 1422 /* 0A00; 0A7F; Gurmukhi */ 1423 new UBInfo(0x0A00, 0x0A7F, "Gurmukhi"), // Character.UnicodeBlock.GURMUKHI 1424 /* 0A80; 0AFF; Gujarati */ 1425 new UBInfo(0x0A80, 0x0AFF, "Gujarati"), // Character.UnicodeBlock.GUJARATI 1426 /* 0B00; 0B7F; Oriya */ 1427 new UBInfo(0x0B00, 0x0B7F, "Oriya"), // Character.UnicodeBlock.ORIYA 1428 /* 0B80; 0BFF; Tamil */ 1429 new UBInfo(0x0B80, 0x0BFF, "Tamil"), // Character.UnicodeBlock.TAMIL 1430 /* 0C00; 0C7F; Telugu */ 1431 new UBInfo(0x0C00, 0x0C7F, "Telugu"), // Character.UnicodeBlock.TELUGU 1432 /* 0C80; 0CFF; Kannada */ 1433 new UBInfo(0x0C80, 0x0CFF, "Kannada"), // Character.UnicodeBlock.KANNADA 1434 /* 0D00; 0D7F; Malayalam */ 1435 new UBInfo(0x0D00, 0x0D7F, "Malayalam"), // Character.UnicodeBlock.MALAYALAM 1436 /* 0D80; 0DFF; Sinhala */ 1437 new UBInfo(0x0D80, 0x0DFF, "Sinhala"), // Character.UnicodeBlock.SINHALA 1438 /* 0E00; 0E7F; Thai */ 1439 new UBInfo(0x0E00, 0x0E7F, "Thai"), // Character.UnicodeBlock.THAI 1440 /* 0E80; 0EFF; Lao */ 1441 new UBInfo(0x0E80, 0x0EFF, "Lao"), // Character.UnicodeBlock.LAO 1442 /* 0F00; 0FFF; Tibetan */ 1443 new UBInfo(0x0F00, 0x0FFF, "Tibetan"), // Character.UnicodeBlock.TIBETAN 1444 /* 1000; 109F; Myanmar */ 1445 new UBInfo(0x1000, 0x109F, "Myanmar"), // Character.UnicodeBlock.MYANMAR 1446 /* 10A0; 10FF; Georgian */ 1447 new UBInfo(0x10A0, 0x10FF, "Georgian"), // Character.UnicodeBlock.GEORGIAN 1448 /* 1100; 11FF; Hangul Jamo */ 1449 new UBInfo(0x1100, 0x11FF, "HangulJamo"), // Character.UnicodeBlock.HANGUL_JAMO 1450 /* 1200; 137F; Ethiopic */ 1451 new UBInfo(0x1200, 0x137F, "Ethiopic"), // Character.UnicodeBlock.ETHIOPIC 1452 /* 13A0; 13FF; Cherokee */ 1453 new UBInfo(0x13A0, 0x13FF, "Cherokee"), // Character.UnicodeBlock.CHEROKEE 1454 /* 1400; 167F; Unified Canadian Aboriginal Syllabics */ 1455 new UBInfo(0x1400, 0x167F, "UnifiedCanadianAboriginalSyllabics"), // Character.UnicodeBlock.UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 1456 /* 1680; 169F; Ogham */ 1457 new UBInfo(0x1680, 0x169F, "Ogham"), // Character.UnicodeBlock.OGHAM 1458 /* 16A0; 16FF; Runic */ 1459 new UBInfo(0x16A0, 0x16FF, "Runic"), // Character.UnicodeBlock.RUNIC 1460 /* 1780; 17FF; Khmer */ 1461 new UBInfo(0x1780, 0x17FF, "Khmer"), // Character.UnicodeBlock.KHMER 1462 /* 1800; 18AF; Mongolian */ 1463 new UBInfo(0x1800, 0x18AF, "Mongolian"), // Character.UnicodeBlock.MONGOLIAN 1464 /* 1E00; 1EFF; Latin Extended Additional */ 1465 new UBInfo(0x1E00, 0x1EFF, "LatinExtendedAdditional"), // Character.UnicodeBlock.LATIN_EXTENDED_ADDITIONAL 1466 /* 1F00; 1FFF; Greek Extended */ 1467 new UBInfo(0x1F00, 0x1FFF, "GreekExtended"), // Character.UnicodeBlock.GREEK_EXTENDED 1468 /* 2000; 206F; General Punctuation */ 1469 new UBInfo(0x2000, 0x206F, "GeneralPunctuation"), // Character.UnicodeBlock.GENERAL_PUNCTUATION 1470 /* 2070; 209F; Superscripts and Subscripts */ 1471 new UBInfo(0x2070, 0x209F, "SuperscriptsandSubscripts"), // Character.UnicodeBlock.SUPERSCRIPTS_AND_SUBSCRIPTS 1472 /* 20A0; 20CF; Currency Symbols */ 1473 new UBInfo(0x20A0, 0x20CF, "CurrencySymbols"), // Character.UnicodeBlock.CURRENCY_SYMBOLS 1474 /* 20D0; 20FF; Combining Marks for Symbols */ 1475 new UBInfo(0x20D0, 0x20FF, "CombiningMarksforSymbols"), // Character.UnicodeBlock.COMBINING_MARKS_FOR_SYMBOLS 1476 /* 2100; 214F; Letterlike Symbols */ 1477 new UBInfo(0x2100, 0x214F, "LetterlikeSymbols"), // Character.UnicodeBlock.LETTERLIKE_SYMBOLS 1478 /* 2150; 218F; Number Forms */ 1479 new UBInfo(0x2150, 0x218F, "NumberForms"), // Character.UnicodeBlock.NUMBER_FORMS 1480 /* 2190; 21FF; Arrows */ 1481 new UBInfo(0x2190, 0x21FF, "Arrows"), // Character.UnicodeBlock.ARROWS 1482 /* 2200; 22FF; Mathematical Operators */ 1483 new UBInfo(0x2200, 0x22FF, "MathematicalOperators"), // Character.UnicodeBlock.MATHEMATICAL_OPERATORS 1484 /* 2300; 23FF; Miscellaneous Technical */ 1485 new UBInfo(0x2300, 0x23FF, "MiscellaneousTechnical"), // Character.UnicodeBlock.MISCELLANEOUS_TECHNICAL 1486 /* 2400; 243F; Control Pictures */ 1487 new UBInfo(0x2400, 0x243F, "ControlPictures"), // Character.UnicodeBlock.CONTROL_PICTURES 1488 /* 2440; 245F; Optical Character Recognition */ 1489 new UBInfo(0x2440, 0x245F, "OpticalCharacterRecognition"), // Character.UnicodeBlock.OPTICAL_CHARACTER_RECOGNITION 1490 /* 2460; 24FF; Enclosed Alphanumerics */ 1491 new UBInfo(0x2460, 0x24FF, "EnclosedAlphanumerics"), // Character.UnicodeBlock.ENCLOSED_ALPHANUMERICS 1492 /* 2500; 257F; Box Drawing */ 1493 new UBInfo(0x2500, 0x257F, "BoxDrawing"), // Character.UnicodeBlock.BOX_DRAWING 1494 /* 2580; 259F; Block Elements */ 1495 new UBInfo(0x2580, 0x259F, "BlockElements"), // Character.UnicodeBlock.BLOCK_ELEMENTS 1496 /* 25A0; 25FF; Geometric Shapes */ 1497 new UBInfo(0x25A0, 0x25FF, "GeometricShapes"), // Character.UnicodeBlock.GEOMETRIC_SHAPES 1498 /* 2600; 26FF; Miscellaneous Symbols */ 1499 new UBInfo(0x2600, 0x26FF, "MiscellaneousSymbols"), // Character.UnicodeBlock.MISCELLANEOUS_SYMBOLS 1500 /* 2700; 27BF; Dingbats */ 1501 new UBInfo(0x2700, 0x27BF, "Dingbats"), // Character.UnicodeBlock.DINGBATS 1502 /* 2800; 28FF; Braille Patterns */ 1503 new UBInfo(0x2800, 0x28FF, "BraillePatterns"), // Character.UnicodeBlock.BRAILLE_PATTERNS 1504 /* 2E80; 2EFF; CJK Radicals Supplement */ 1505 new UBInfo(0x2E80, 0x2EFF, "CJKRadicalsSupplement"), // Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT 1506 /* 2F00; 2FDF; Kangxi Radicals */ 1507 new UBInfo(0x2F00, 0x2FDF, "KangxiRadicals"), // Character.UnicodeBlock.KANGXI_RADICALS 1508 /* 2FF0; 2FFF; Ideographic Description Characters */ 1509 new UBInfo(0x2FF0, 0x2FFF, "IdeographicDescriptionCharacters"), // Character.UnicodeBlock.IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1510 /* 3000; 303F; CJK Symbols and Punctuation */ 1511 new UBInfo(0x3000, 0x303F, "CJKSymbolsandPunctuation"), // Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION 1512 /* 3040; 309F; Hiragana */ 1513 new UBInfo(0x3040, 0x309F, "Hiragana"), // Character.UnicodeBlock.HIRAGANA 1514 /* 30A0; 30FF; Katakana */ 1515 new UBInfo(0x30A0, 0x30FF, "Katakana"), // Character.UnicodeBlock.KATAKANA 1516 /* 3100; 312F; Bopomofo */ 1517 new UBInfo(0x3100, 0x312F, "Bopomofo"), // Character.UnicodeBlock.BOPOMOFO 1518 /* 3130; 318F; Hangul Compatibility Jamo */ 1519 new UBInfo(0x3130, 0x318F, "HangulCompatibilityJamo"), // Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 1520 /* 3190; 319F; Kanbun */ 1521 new UBInfo(0x3190, 0x319F, "Kanbun"), // Character.UnicodeBlock.KANBUN 1522 /* 31A0; 31BF; Bopomofo Extended */ 1523 new UBInfo(0x31A0, 0x31BF, "BopomofoExtended"), // Character.UnicodeBlock.BOPOMOFO_EXTENDED 1524 /* 3200; 32FF; Enclosed CJK Letters and Months */ 1525 new UBInfo(0x3200, 0x32FF, "EnclosedCJKLettersandMonths"), // Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS 1526 /* 3300; 33FF; CJK Compatibility */ 1527 new UBInfo(0x3300, 0x33FF, "CJKCompatibility"), // Character.UnicodeBlock.CJK_COMPATIBILITY 1528 /* 3400; 4DB5; CJK Unified Ideographs Extension A */ 1529// BEGIN android-changed 1530// Modified this to reflect current Unicode tables (or maybe it was a typo) 1531 new UBInfo(0x3400, 0x4DBF, "CJKUnifiedIdeographsExtensionA"), // Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1532// END android-changed 1533 /* 4E00; 9FFF; CJK Unified Ideographs */ 1534 new UBInfo(0x4E00, 0x9FFF, "CJKUnifiedIdeographs"), // Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 1535 /* A000; A48F; Yi Syllables */ 1536 new UBInfo(0xA000, 0xA48F, "YiSyllables"), // Character.UnicodeBlock.YI_SYLLABLES 1537 /* A490; A4CF; Yi Radicals */ 1538 new UBInfo(0xA490, 0xA4CF, "YiRadicals"), // Character.UnicodeBlock.YI_RADICALS 1539 /* AC00; D7A3; Hangul Syllables */ 1540// BEGIN android-changed 1541// Modified this to reflect current Unicode tables (or maybe it was a typo) 1542 new UBInfo(0xAC00, 0xD7AF, "HangulSyllables"), // Character.UnicodeBlock.HANGUL_SYLLABLES 1543// END android-changed 1544 /* D800; DB7F; High Surrogates */ 1545 /* DB80; DBFF; High Private Use Surrogates */ 1546 /* DC00; DFFF; Low Surrogates */ 1547 /* E000; F8FF; Private Use */ 1548 /* F900; FAFF; CJK Compatibility Ideographs */ 1549 new UBInfo(0xF900, 0xFAFF, "CJKCompatibilityIdeographs"), // Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 1550 /* FB00; FB4F; Alphabetic Presentation Forms */ 1551 new UBInfo(0xFB00, 0xFB4F, "AlphabeticPresentationForms"), // Character.UnicodeBlock.ALPHABETIC_PRESENTATION_FORMS 1552 /* FB50; FDFF; Arabic Presentation Forms-A */ 1553 new UBInfo(0xFB50, 0xFDFF, "ArabicPresentationForms-A"), // Character.UnicodeBlock.ARABIC_PRESENTATION_FORMS_A 1554 /* FE20; FE2F; Combining Half Marks */ 1555 new UBInfo(0xFE20, 0xFE2F, "CombiningHalfMarks"), // Character.UnicodeBlock.COMBINING_HALF_MARKS 1556 /* FE30; FE4F; CJK Compatibility Forms */ 1557 new UBInfo(0xFE30, 0xFE4F, "CJKCompatibilityForms"), // Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS 1558 /* FE50; FE6F; Small Form Variants */ 1559 new UBInfo(0xFE50, 0xFE6F, "SmallFormVariants"), // Character.UnicodeBlock.SMALL_FORM_VARIANTS 1560 /* FE70; FEFE; Arabic Presentation Forms-B */ 1561 // new UBInfo (0xFE70,0xFEFE,"InArabicPresentationForms-B"), // 1562 // Character.UnicodeBlock.ARABIC_PRESENTATION_FORMS_B 1563 /* FEFF; FEFF; Specials */ 1564// BEGIN android-changed 1565// Modified this to reflect current Unicode tables (or maybe it was a typo) 1566// FEFF is actually still Arabic Presentation Forms B 1567// new UBInfo(0xFEFF, 0xFEFF, "Specials"), // Character.UnicodeBlock.SPECIALS 1568// END android-changed 1569 /* FF00; FFEF; Halfwidth and Fullwidth Forms */ 1570 new UBInfo(0xFF00, 0xFFEF, "HalfwidthandFullwidthForms"), // Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS 1571 /* FFF0; FFFD; Specials */ 1572 // BEGIN android-changed 1573// Modified this to reflect current Unicode tables (or maybe it was a typo) 1574 new UBInfo(0xFFF0, 0xFFFF, "Specials") // Character.UnicodeBlock.SPECIALS 1575// END android-changed 1576 }; 1577} 1578