/*
 * Created on May 5, 2004
 *
 * Copyright (C) 2004-2015 International Business Machines Corporation and others.
 * All Rights Reserved.
 *
 */
package com.ibm.icu.dev.test.rbbi;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ULocale;


/**
 * Rule based break iterator data driven test.
 *      Perform the tests from the file rbbitst.txt.
 *      The test data file is common to both ICU4C and ICU4J.
 *      See the data file for a description of the tests.
 *
 */
public class RBBITestExtended extends TestFmwk {

    public static void main(String[] args) throws Exception {
        new RBBITestExtended().run(args);
    }


    public RBBITestExtended() {
    }


    /**
     * State accumulated while parsing one {@code <data>} section of the test file:
     * the break iterator to test, the text to run it over, and, indexed by offset
     * into that text, the expected break status and the source file position.
     */
    static class TestParams {
        BreakIterator   bi;
        StringBuffer    dataToBreak    = new StringBuffer();
        // 0 = no break expected, -1 = break expected with rule status 0,
        // any other value = break expected with that rule status.
        int[]           expectedBreaks = new int[1000];
        int[]           srcLine        = new int[1000];
        int[]           srcCol         = new int[1000];
        ULocale         currentLocale  = new ULocale("en_US");

        /**
         * Records an expected break at the current end of {@link #dataToBreak},
         * together with the source file position it came from.
         *
         * @param status value to store in {@link #expectedBreaks} (must be non-zero
         *               for the break to be seen as expected)
         * @param line   source file line of the break marker
         * @param col    source file column of the break marker
         */
        void expectBreak(int status, int line, int col) {
            int breakIdx = dataToBreak.length();
            expectedBreaks[breakIdx] = status;
            srcLine[breakIdx] = line;
            srcCol[breakIdx] = col;
        }

        /**
         * Fills in the source position for the most recently appended data offsets.
         * Walks backwards from the end of the data and stops at the first offset
         * whose line is already set, so a position recorded earlier (for example by
         * an expected-break marker) is not overwritten.  Used for error messages only.
         *
         * @param line source file line of the character(s) just appended
         * @param col  source file column of the character(s) just appended
         */
        void mapNewDataToSource(int line, int col) {
            for (int i = dataToBreak.length() - 1; i >= 0 && srcLine[i] == 0; i--) {
                srcLine[i] = line;
                srcCol[i] = col;
            }
        }
    }


    /**
     * Reads rbbitst.txt, parses it with a small state machine (tag, data, numeric
     * tag, comment) and calls {@link #executeTest(TestParams)} each time a
     * {@code </data>} close tag is reached.  Parsing stops at the first syntax
     * error in the test file, after reporting it with {@code errln}.
     */
    public void TestExtended() {
        TestParams tp = new TestParams();


        //
        //  Open and read the test data file.
        //
        StringBuffer testFileBuf = new StringBuffer();
        InputStream is = null;
        try {
            is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
            if (is == null) {
                errln("Could not open test data file rbbitst.txt");
                return;
            }
            InputStreamReader isr = new InputStreamReader(is, "UTF-8");
            try {
                int c;
                int count = 0;
                for (;;) {
                    c = isr.read();
                    if (c < 0) {
                        break;
                    }
                    count++;
                    if (c == 0xFEFF && count == 1) {
                        // BOM in the test data file. Discard it.
                        continue;
                    }

                    UTF16.append(testFileBuf, c);
                }
            } finally {
                isr.close();
            }
        } catch (IOException e) {
            errln(e.toString());
            try {
                is.close();
            } catch (IOException ignored) {
                // Already reporting a failure; nothing more useful to do here.
            }
            return;
        }

        String testString = testFileBuf.toString();


        final int PARSE_COMMENT = 1;
        final int PARSE_TAG     = 2;
        final int PARSE_DATA    = 3;
        final int PARSE_NUM     = 4;

        int parseState = PARSE_TAG;

        // State to return to when the end of a comment line is reached.
        int savedState = PARSE_TAG;

        final char CH_LF        = 0x0a;
        final char CH_CR        = 0x0d;
        final char CH_HASH      = 0x23;
        /*static const UChar CH_PERIOD    = 0x2e;*/
        final char CH_LT        = 0x3c;
        final char CH_GT        = 0x3e;
        final char CH_BACKSLASH = 0x5c;
        final char CH_BULLET    = 0x2022;

        int lineNum  = 1;
        int colStart = 0;
        int column   = 0;
        int charIdx  = 0;

        int tagValue = 0;       // The numeric value of a <nnn> tag.
        int len = testString.length();

        for (charIdx = 0; charIdx < len; ) {
            int c = UTF16.charAt(testString, charIdx);
            // Advance over the whole code point.  Advancing by a single code unit
            // would revisit the trail surrogate of a supplementary character, and
            // UTF16.charAt() would then return the same code point a second time.
            charIdx += UTF16.getCharCount(c);
            if (c == CH_CR && charIdx < len && testString.charAt(charIdx) == CH_LF) {
                // treat CRLF as a unit
                c = CH_LF;
                charIdx++;
            }
            if (c == CH_LF || c == CH_CR) {
                lineNum++;
                colStart = charIdx;
            }
            column = charIdx - colStart + 1;

            switch (parseState) {
            case PARSE_COMMENT:
                if (c == CH_LF || c == CH_CR) {
                    parseState = savedState;
                }
                break;

            case PARSE_TAG:
                {
                    if (c == CH_HASH) {
                        parseState = PARSE_COMMENT;
                        savedState = PARSE_TAG;
                        break;
                    }
                    if (UCharacter.isWhitespace(c)) {
                        break;
                    }
                    // For each recognized tag, charIdx already points just past the
                    // '<'; skip the remaining characters of the tag.
                    if (testString.startsWith("<word>", charIdx-1)) {
                        tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<char>", charIdx-1)) {
                        tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<line>", charIdx-1)) {
                        tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<sent>", charIdx-1)) {
                        tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.startsWith("<title>", charIdx-1)) {
                        tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
                        charIdx += 6;
                        break;
                    }
                    if (testString.startsWith("<locale ", charIdx-1)) {
                        int closeIndex = testString.indexOf(">", charIdx);
                        if (closeIndex < 0) {
                            errln("line" + lineNum + ": missing close on <locale tag.");
                            break;
                        }
                        // The substring starts on the blank after "locale"; trim() removes it.
                        String localeName = testString.substring(charIdx+6, closeIndex);
                        localeName = localeName.trim();
                        tp.currentLocale = new ULocale(localeName);
                        charIdx = closeIndex+1;
                        break;
                    }
                    if (testString.startsWith("<data>", charIdx-1)) {
                        parseState = PARSE_DATA;
                        charIdx += 5;
                        // Reset all per-test state before collecting the new data.
                        tp.dataToBreak.setLength(0);
                        Arrays.fill(tp.expectedBreaks, 0);
                        Arrays.fill(tp.srcCol, 0);
                        Arrays.fill(tp.srcLine, 0);
                        break;
                    }

                    errln("line" + lineNum + ": Tag expected in test file.");
                    return;
                }

            case PARSE_DATA:
                if (c == CH_BULLET) {
                    tp.expectBreak(-1, lineNum, column);
                    break;
                }

                if (testString.startsWith("</data>", charIdx-1)) {
                    // Add final entry to mappings from break location to source file position.
                    //  Need one extra because last break position returned is after the
                    //    last char in the data, not at the last char.
                    int idx = tp.dataToBreak.length();
                    tp.srcLine[idx] = lineNum;
                    tp.srcCol[idx]  = column;

                    parseState = PARSE_TAG;
                    charIdx += 6;

                    // RUN THE TEST!
                    executeTest(tp);
                    break;
                }

                if (testString.startsWith("\\N{", charIdx-1)) {
                    int nameEndIdx = testString.indexOf('}', charIdx);
                    if (nameEndIdx == -1) {
                        errln("Error in named character in test file at line " + lineNum +
                                ", col " + column);
                        // Without a closing brace there is no name to extract and no
                        // sensible place to resume; stop as for other syntax errors.
                        return;
                    }
                    // Named character, e.g. \N{COMBINING GRAVE ACCENT}
                    // Get the code point from the name and insert it into the test data.
                    String charName = testString.substring(charIdx+2, nameEndIdx);
                    c = UCharacter.getCharFromName(charName);
                    if (c == -1) {
                        errln("Error in named character in test file at line " + lineNum +
                                ", col " + column);
                    } else {
                        // Named code point was recognized.  Insert it
                        //   into the test data.
                        UTF16.append(tp.dataToBreak, c);
                        tp.mapNewDataToSource(lineNum, column);
                    }
                    charIdx = nameEndIdx+1;
                    break;
                }

                if (testString.startsWith("<>", charIdx-1)) {
                    charIdx++;
                    tp.expectBreak(-1, lineNum, column);
                    break;
                }

                if (c == CH_LT) {
                    tagValue   = 0;
                    parseState = PARSE_NUM;
                    break;
                }

                if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
                    parseState = PARSE_COMMENT;
                    savedState = PARSE_DATA;
                    break;
                }

                if (c == CH_BACKSLASH) {
                    if (charIdx >= len) {
                        // A backslash must be followed by something to escape or continue.
                        errln("Syntax Error in test file at line " + lineNum + ", col " + column);
                        return;
                    }
                    // Check for \ at end of line, a line continuation.
                    //     Advance over (discard) the newline
                    int cp = UTF16.charAt(testString, charIdx);
                    if (cp == CH_CR && charIdx+1 < len && testString.charAt(charIdx+1) == CH_LF) {
                        // We have a CR LF
                        //  Need an extra increment of the input ptr to move over both of them
                        charIdx++;
                    }
                    if (cp == CH_LF || cp == CH_CR) {
                        lineNum++;
                        column   = 0;
                        charIdx++;
                        colStart = charIdx;
                        break;
                    }

                    // Let unescape handle the back slash.
                    int[] charIdxAr = new int[1];
                    charIdxAr[0] = charIdx;
                    cp = Utility.unescapeAt(testString, charIdxAr);
                    if (cp != -1) {
                        // Escape sequence was recognized.  Insert the char
                        //   into the test data.
                        charIdx = charIdxAr[0];
                        UTF16.append(tp.dataToBreak, cp);
                        tp.mapNewDataToSource(lineNum, column);
                        break;
                    }


                    // Not a recognized backslash escape sequence.
                    // Take the next char as a literal.
                    //  TODO:  Should this be an error?
                    c = UTF16.charAt(testString, charIdx);
                    charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
                }

                // Normal, non-escaped data char.
                UTF16.append(tp.dataToBreak, c);

                // Save the mapping from offset in the data to line/column numbers in
                //   the original input file.  Will be used for better error messages only.
                //   If there's an expected break before this char, the slot in the mapping
                //     vector will already be set for this char; don't overwrite it.
                tp.mapNewDataToSource(lineNum, column);
                break;


            case PARSE_NUM:
                // We are parsing an expected numeric tag value, like <1234>,
                //   within a chunk of data.
                if (UCharacter.isWhitespace(c)) {
                    break;
                }

                if (c == CH_GT) {
                    // Finished the number.  Add the info to the expected break data,
                    //   and switch parse state back to doing plain data.
                    parseState = PARSE_DATA;
                    if (tagValue == 0) {
                        // Zero means "no break" in expectedBreaks, so a break with
                        // status 0 is stored as -1.
                        tagValue = -1;
                    }
                    tp.expectBreak(tagValue, lineNum, column);
                    break;
                }

                if (UCharacter.isDigit(c)) {
                    tagValue = tagValue*10 + UCharacter.digit(c);
                    break;
                }

                errln("Syntax Error in test file at line " + lineNum + ", col " + column);
                return;

                // TODO: Don't stop on errors.
            }
        }
    }

    /**
     * Runs the break iterator in {@code t.bi} over {@code t.dataToBreak} and reports,
     * via {@code errln}, every disagreement with {@code t.expectedBreaks}.  Checks
     * forward iteration, reverse iteration, {@code isBoundary()}, {@code following()}
     * and {@code preceding()}.  Does nothing if no break iterator has been selected.
     *
     * @param t the parsed test case
     */
    void executeTest(TestParams t) {
        int bp;
        int prevBP;
        int i;

        if (t.bi == null) {
            return;
        }

        t.bi.setText(t.dataToBreak.toString());
        //
        //  Run the iterator forward
        //
        prevBP = -1;
        for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
            if (prevBP == bp) {
                // Fail for lack of forward progress.
                errln("Forward Iteration, no forward progress. Break Pos=" + bp +
                        " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
                break;
            }

            // Check that we didn't miss an expected break between the last one
            //  and this one.
            for (i=prevBP+1; i<bp; i++) {
                if (t.expectedBreaks[i] != 0) {
                    errln("Forward Iteration, break expected, but not found. Pos=" + i +
                            " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
                }
            }

            // Check that the break we did find was expected
            if (t.expectedBreaks[bp] == 0) {
                errln("Forward Iteration, break found, but not expected. Pos=" + bp +
                        " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
            } else {
                // The break was expected.
                //   Check that the <nnn> tag value is correct.
                int expectedTagVal = t.expectedBreaks[bp];
                if (expectedTagVal == -1) {
                    // -1 is the stored form of "break expected, status 0".
                    expectedTagVal = 0;
                }
                int line = t.srcLine[bp];
                int rs = t.bi.getRuleStatus();
                if (rs != expectedTagVal) {
                    errln("Incorrect status for forward break. Pos = " + bp +
                            ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
                            " Actual, Expected status = " + rs + ", " + expectedTagVal);
                }
                int[] fillInArray = new int[4];
                int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
                assertTrue("", numStatusVals >= 1);
                assertEquals("", expectedTagVal, fillInArray[0]);
            }


            prevBP = bp;
        }

        // Verify that there were no missed expected breaks after the last one found
        for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
            if (t.expectedBreaks[i] != 0) {
                errln("Forward Iteration, break expected, but not found. Pos=" + i +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
            }
        }


        //
        //  Run the iterator backwards, verify that the same breaks are found.
        //
        prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
        for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
            if (prevBP == bp) {
                // Fail for lack of progress.
                errln("Reverse Iteration, no progress. Break Pos=" + bp +
                        "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
                break;
            }

            // Check that we didn't miss an expected break between the last one
            //  and this one.  On the first pass this starts past the end of the data,
            //  at slots that no expected-break marker has written to.
            for (i=prevBP-1; i>bp; i--) {
                if (t.expectedBreaks[i] != 0) {
                    errln("Reverse Itertion, break expected, but not found. Pos=" + i +
                            " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
                }
            }

            // Check that the break we did find was expected
            if (t.expectedBreaks[bp] == 0) {
                errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
                        " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
            } else {
                // The break was expected.
                //   Check that the <nnn> tag value is correct.
                int expectedTagVal = t.expectedBreaks[bp];
                if (expectedTagVal == -1) {
                    expectedTagVal = 0;
                }
                int line = t.srcLine[bp];
                int rs = t.bi.getRuleStatus();
                if (rs != expectedTagVal) {
                    errln("Incorrect status for reverse break. Pos= " + bp +
                            "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
                            " Actual, Expected status = " + rs + ", " + expectedTagVal);
                }
            }

            prevBP = bp;
        }

        // Verify that there were no missed breaks prior to the last one found
        for (i=prevBP-1; i>=0; i--) {
            if (t.expectedBreaks[i] != 0) {
                errln("Reverse Itertion, break expected, but not found. Pos=" + i +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
            }
        }

        // Check isBoundary()
        for (i=0; i<=t.dataToBreak.length(); i++) {
            boolean boundaryExpected = (t.expectedBreaks[i] != 0);
            boolean boundaryFound = t.bi.isBoundary(i);
            if (boundaryExpected != boundaryFound) {
                errln("isBoundary(" + i + ") incorrect.\n" +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
                        " Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
            }
        }

        // Check following()
        for (i=0; i<=t.dataToBreak.length(); i++) {
            int actualBreak = t.bi.following(i);
            // The expected result is the first expected break strictly after i,
            // or DONE if there is none.
            int expectedBreak = BreakIterator.DONE;
            for (int j=i+1; j < t.expectedBreaks.length; j++) {
                if (t.expectedBreaks[j] != 0) {
                    expectedBreak = j;
                    break;
                }
            }
            if (expectedBreak != actualBreak) {
                errln("following(" + i + ") incorrect.\n" +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
                        " Expected, Actual= " + expectedBreak + ", " + actualBreak);
            }
        }

        // Check preceding()
        for (i=t.dataToBreak.length(); i>=0; i--) {
            int actualBreak = t.bi.preceding(i);
            // The expected result is the last expected break strictly before i,
            // or DONE if there is none.
            int expectedBreak = BreakIterator.DONE;

            for (int j=i-1; j >= 0; j--) {
                if (t.expectedBreaks[j] != 0) {
                    expectedBreak = j;
                    break;
                }
            }
            if (expectedBreak != actualBreak) {
                errln("preceding(" + i + ") incorrect.\n" +
                        " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
                        " Expected, Actual= " + expectedBreak + ", " + actualBreak);
            }
        }

    }

}