1/*
2 * Created on May 5, 2004
3 *
4 * Copyright (C) 2004-2015 International Business Machines Corporation and others.
5 * All Rights Reserved.
6 *
7 */
8package com.ibm.icu.dev.test.rbbi;
9
10import java.io.IOException;
11import java.io.InputStream;
12import java.io.InputStreamReader;
13import java.util.Arrays;
14
15import com.ibm.icu.dev.test.TestFmwk;
16import com.ibm.icu.impl.Utility;
17import com.ibm.icu.lang.UCharacter;
18import com.ibm.icu.text.BreakIterator;
19import com.ibm.icu.text.UTF16;
20import com.ibm.icu.util.ULocale;
21
22
23/**
24 * Rule based break iterator data driven test.
25 *      Perform the tests from the file rbbitst.txt.
26 *      The test data file is common to both ICU4C and ICU4J.
27 *      See the data file for a description of the tests.
28 *
29 */
30public class RBBITestExtended extends TestFmwk {
31
32    public static void main(String[] args)throws Exception {
33        new RBBITestExtended().run(args);
34    }
35
36
/**
 * Creates the test object.  The class has no instance state of its own to
 * initialize, so this only invokes the superclass constructor.
 */
public RBBITestExtended() {
    super();
}
39
40
41
42static class TestParams {
43    BreakIterator   bi;
44    StringBuffer    dataToBreak    = new StringBuffer();
45    int[]           expectedBreaks = new int[1000];
46    int[]           srcLine        = new int[1000];
47    int[]           srcCol         = new int[1000];
48    ULocale         currentLocale  = new ULocale("en_US");
49}
50
51
52public void TestExtended() {
53    TestParams     tp = new TestParams();
54
55
56    //
57    //  Open and read the test data file.
58    //
59    StringBuffer testFileBuf = new StringBuffer();
60    InputStream is = null;
61    try {
62        is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
63        if (is == null) {
64            errln("Could not open test data file rbbitst.txt");
65            return;
66        }
67        InputStreamReader isr = new InputStreamReader(is, "UTF-8");
68        try {
69            int c;
70            int count = 0;
71            for (;;) {
72                c = isr.read();
73                if (c < 0) {
74                    break;
75                }
76                count++;
77                if (c == 0xFEFF && count == 1) {
78                    // BOM in the test data file. Discard it.
79                    continue;
80                }
81
82                UTF16.append(testFileBuf, c);
83            }
84        } finally {
85            isr.close();
86        }
87    } catch (IOException e) {
88        errln(e.toString());
89        try {
90            is.close();
91        } catch (IOException ignored) {
92        }
93        return;
94    }
95
96    String testString = testFileBuf.toString();
97
98
99    final int  PARSE_COMMENT = 1;
100    final int  PARSE_TAG     = 2;
101    final int  PARSE_DATA    = 3;
102    final int  PARSE_NUM     = 4;
103
104    int parseState = PARSE_TAG;
105
106    int savedState = PARSE_TAG;
107
108    final char CH_LF        = 0x0a;
109    final char CH_CR        = 0x0d;
110    final char CH_HASH      = 0x23;
111    /*static const UChar CH_PERIOD    = 0x2e;*/
112    final char CH_LT        = 0x3c;
113    final char CH_GT        = 0x3e;
114    final char CH_BACKSLASH = 0x5c;
115    final char CH_BULLET    = 0x2022;
116
117    int    lineNum  = 1;
118    int    colStart = 0;
119    int    column   = 0;
120    int    charIdx  = 0;
121    int    i;
122
123    int    tagValue = 0;       // The numeric value of a <nnn> tag.
124    int    len = testString.length();
125
126    for (charIdx = 0; charIdx < len; ) {
127        int  c = UTF16.charAt(testString, charIdx);
128        charIdx++;
129        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
130            // treat CRLF as a unit
131            c = CH_LF;
132            charIdx++;
133        }
134        if (c == CH_LF || c == CH_CR) {
135            lineNum++;
136            colStart = charIdx;
137        }
138        column = charIdx - colStart + 1;
139
140        switch (parseState) {
141        case PARSE_COMMENT:
142            if (c == 0x0a || c == 0x0d) {
143                parseState = savedState;
144            }
145            break;
146
147        case PARSE_TAG:
148            {
149            if (c == CH_HASH) {
150                parseState = PARSE_COMMENT;
151                savedState = PARSE_TAG;
152                break;
153            }
154            if (UCharacter.isWhitespace(c)) {
155                break;
156            }
157           if (testString.startsWith("<word>", charIdx-1)) {
158                tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
159                charIdx += 5;
160                break;
161            }
162            if (testString.startsWith("<char>", charIdx-1)) {
163                tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
164                charIdx += 5;
165                break;
166            }
167            if (testString.startsWith("<line>", charIdx-1)) {
168                tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
169                charIdx += 5;
170                break;
171            }
172            if (testString.startsWith("<sent>", charIdx-1)) {
173                tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
174                charIdx += 5;
175                break;
176            }
177            if (testString.startsWith("<title>", charIdx-1)) {
178                tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
179                charIdx += 6;
180                break;
181            }
182            if (testString.startsWith("<locale ", charIdx-1)) {
183                int closeIndex = testString.indexOf(">", charIdx);
184                if (closeIndex < 0) {
185                    errln("line" + lineNum + ": missing close on <locale  tag.");
186                    break;
187                }
188                String localeName = testString.substring(charIdx+6, closeIndex);
189                localeName = localeName.trim();
190                tp.currentLocale = new ULocale(localeName);
191                charIdx = closeIndex+1;
192                break;
193            }
194            if (testString.startsWith("<data>", charIdx-1)) {
195                parseState = PARSE_DATA;
196                charIdx += 5;
197                tp.dataToBreak.setLength(0);
198                Arrays.fill(tp.expectedBreaks, 0);
199                Arrays.fill(tp.srcCol, 0);
200                Arrays.fill(tp.srcLine, 0);
201                break;
202            }
203
204            errln("line" + lineNum + ": Tag expected in test file.");
205            return;
206            //parseState = PARSE_COMMENT;
207            //savedState = PARSE_DATA;
208            }
209
210        case PARSE_DATA:
211            if (c == CH_BULLET) {
212                int  breakIdx = tp.dataToBreak.length();
213                tp.expectedBreaks[breakIdx] = -1;
214                tp.srcLine[breakIdx]        = lineNum;
215                tp.srcCol[breakIdx]         = column;
216                break;
217            }
218
219            if (testString.startsWith("</data>", charIdx-1))  {
220                // Add final entry to mappings from break location to source file position.
221                //  Need one extra because last break position returned is after the
222                //    last char in the data, not at the last char.
223                int idx = tp.dataToBreak.length();
224                tp.srcLine[idx] = lineNum;
225                tp.srcCol[idx]  = column;
226
227                parseState = PARSE_TAG;
228                charIdx += 6;
229
230                // RUN THE TEST!
231                executeTest(tp);
232                break;
233            }
234
235           if (testString.startsWith("\\N{", charIdx-1)) {
236               int nameEndIdx = testString.indexOf('}', charIdx);
237               if (nameEndIdx == -1) {
238                   errln("Error in named character in test file at line " + lineNum +
239                           ", col " + column);
240               }
241                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
242                // Get the code point from the name and insert it into the test data.
243                String charName = testString.substring(charIdx+2, nameEndIdx);
244                c = UCharacter.getCharFromName(charName);
245                if (c == -1) {
246                    errln("Error in named character in test file at line " + lineNum +
247                            ", col " + column);
248                } else {
249                    // Named code point was recognized.  Insert it
250                    //   into the test data.
251                    UTF16.append(tp.dataToBreak, c);
252                    for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
253                        tp.srcLine[i] = lineNum;
254                        tp.srcCol[i]  = column;
255                    }
256
257                 }
258                if (nameEndIdx > charIdx) {
259                    charIdx = nameEndIdx+1;
260                }
261                break;
262            }
263
264            if (testString.startsWith("<>", charIdx-1)) {
265                charIdx++;
266                int  breakIdx = tp.dataToBreak.length();
267                tp.expectedBreaks[breakIdx] = -1;
268                tp.srcLine[breakIdx]        = lineNum;
269                tp.srcCol[breakIdx]         = column;
270                break;
271            }
272
273            if (c == CH_LT) {
274                tagValue   = 0;
275                parseState = PARSE_NUM;
276                break;
277            }
278
279            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
280                parseState = PARSE_COMMENT;
281                savedState = PARSE_DATA;
282                break;
283            }
284
285            if (c == CH_BACKSLASH) {
286                // Check for \ at end of line, a line continuation.
287                //     Advance over (discard) the newline
288                int cp = UTF16.charAt(testString, charIdx);
289                if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
290                    // We have a CR LF
291                    //  Need an extra increment of the input ptr to move over both of them
292                    charIdx++;
293                }
294                if (cp == CH_LF || cp == CH_CR) {
295                    lineNum++;
296                    column   = 0;
297                    charIdx++;
298                    colStart = charIdx;
299                    break;
300                }
301
302                // Let unescape handle the back slash.
303                int  charIdxAr[] = new int[1];
304                charIdxAr[0] = charIdx;
305                cp = Utility.unescapeAt(testString, charIdxAr);
306                if (cp != -1) {
307                    // Escape sequence was recognized.  Insert the char
308                    //   into the test data.
309                    charIdx = charIdxAr[0];
310                    UTF16.append(tp.dataToBreak, cp);
311                    for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
312                        tp.srcLine[i] = lineNum;
313                        tp.srcCol[i]  = column;
314                    }
315
316                    break;
317                }
318
319
320                // Not a recognized backslash escape sequence.
321                // Take the next char as a literal.
322                //  TODO:  Should this be an error?
323                c = UTF16.charAt(testString,charIdx);
324                charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
325             }
326
327            // Normal, non-escaped data char.
328            UTF16.append(tp.dataToBreak, c);
329
330            // Save the mapping from offset in the data to line/column numbers in
331            //   the original input file.  Will be used for better error messages only.
332            //   If there's an expected break before this char, the slot in the mapping
333            //     vector will already be set for this char; don't overwrite it.
334            for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
335                tp.srcLine[i] = lineNum;
336                tp.srcCol[i]  = column;
337            }
338            break;
339
340
341        case PARSE_NUM:
342            // We are parsing an expected numeric tag value, like <1234>,
343            //   within a chunk of data.
344            if (UCharacter.isWhitespace(c)) {
345                break;
346            }
347
348            if (c == CH_GT) {
349                // Finished the number.  Add the info to the expected break data,
350                //   and switch parse state back to doing plain data.
351                parseState = PARSE_DATA;
352                if (tagValue == 0) {
353                    tagValue = -1;
354                }
355                int  breakIdx = tp.dataToBreak.length();
356                tp.expectedBreaks[breakIdx] = tagValue;
357                tp.srcLine[breakIdx]        = lineNum;
358                tp.srcCol[breakIdx]         = column;
359                break;
360            }
361
362            if (UCharacter.isDigit(c)) {
363                tagValue = tagValue*10 + UCharacter.digit(c);
364                break;
365            }
366
367            errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
368            return;
369
370            // parseState = PARSE_COMMENT;   // TODO: unreachable.  Don't stop on errors.
371            // break;
372        }
373
374
375
376    }
377}
378
379void executeTest(TestParams t) {
380    int    bp;
381    int    prevBP;
382    int    i;
383
384    if (t.bi == null) {
385        return;
386    }
387
388    t.bi.setText(t.dataToBreak.toString());
389    //
390    //  Run the iterator forward
391    //
392    prevBP = -1;
393    for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
394        if (prevBP ==  bp) {
395            // Fail for lack of forward progress.
396            errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
397                    "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
398            break;
399        }
400
401        // Check that there were we didn't miss an expected break between the last one
402        //  and this one.
403        for (i=prevBP+1; i<bp; i++) {
404            if (t.expectedBreaks[i] != 0) {
405                errln("Forward Iteration, break expected, but not found.  Pos=" + i +
406                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
407            }
408        }
409
410        // Check that the break we did find was expected
411        if (t.expectedBreaks[bp] == 0) {
412            errln("Forward Iteration, break found, but not expected.  Pos=" + bp +
413                    "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
414        } else {
415            // The break was expected.
416            //   Check that the {nnn} tag value is correct.
417            int expectedTagVal = t.expectedBreaks[bp];
418            if (expectedTagVal == -1) {
419                expectedTagVal = 0;
420            }
421            int line = t.srcLine[bp];
422            int rs = t.bi.getRuleStatus();
423            if (rs != expectedTagVal) {
424                errln("Incorrect status for forward break.  Pos = " + bp +
425                        ".  File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
426                      "          Actual, Expected status = " + rs + ", " + expectedTagVal);
427            }
428            int[] fillInArray = new int[4];
429            int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
430            assertTrue("", numStatusVals >= 1);
431            assertEquals("", expectedTagVal, fillInArray[0]);
432        }
433
434
435        prevBP = bp;
436    }
437
438    // Verify that there were no missed expected breaks after the last one found
439    for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
440        if (t.expectedBreaks[i] != 0) {
441            errln("Forward Iteration, break expected, but not found.  Pos=" + i +
442                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
443       }
444    }
445
446
447    //
448    //  Run the iterator backwards, verify that the same breaks are found.
449    //
450    prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
451    for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
452        if (prevBP ==  bp) {
453            // Fail for lack of progress.
454            errln("Reverse Iteration, no progress.  Break Pos=" + bp +
455                    "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
456            break;
457        }
458
459        // Check that we didn't miss an expected break between the last one
460        //  and this one.  (UVector returns zeros for index out of bounds.)
461        for (i=prevBP-1; i>bp; i--) {
462            if (t.expectedBreaks[i] != 0) {
463                errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
464                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
465            }
466        }
467
468        // Check that the break we did find was expected
469        if (t.expectedBreaks[bp] == 0) {
470            errln("Reverse Itertion, break found, but not expected.  Pos=" + bp +
471                    "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
472        } else {
473            // The break was expected.
474            //   Check that the {nnn} tag value is correct.
475            int expectedTagVal = t.expectedBreaks[bp];
476            if (expectedTagVal == -1) {
477                expectedTagVal = 0;
478            }
479            int line = t.srcLine[bp];
480            int rs = t.bi.getRuleStatus();
481            if (rs != expectedTagVal) {
482                errln("Incorrect status for reverse break.  Pos=  " + bp +
483                        "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
484                      "          Actual, Expected status = " + rs + ", " + expectedTagVal);
485            }
486        }
487
488        prevBP = bp;
489    }
490
491    // Verify that there were no missed breaks prior to the last one found
492    for (i=prevBP-1; i>=0; i--) {
493        if (t.expectedBreaks[i] != 0) {
494            errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
495                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
496         }
497    }
498    // Check isBoundary()
499    for (i=0; i<=t.dataToBreak.length(); i++) {
500        boolean boundaryExpected = (t.expectedBreaks[i] != 0);
501        boolean boundaryFound    = t.bi.isBoundary(i);
502        if (boundaryExpected != boundaryFound) {
503            errln("isBoundary(" + i + ") incorrect.\n" +
504                  "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
505                  "    Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
506        }
507    }
508
509    // Check following()
510    for (i=0; i<=t.dataToBreak.length(); i++) {
511        int actualBreak = t.bi.following(i);
512        int expectedBreak = BreakIterator.DONE;
513        for (int j=i+1; j < t.expectedBreaks.length; j++) {
514            if (t.expectedBreaks[j] != 0) {
515                expectedBreak = j;
516                break;
517            }
518        }
519        if (expectedBreak != actualBreak) {
520            errln("following(" + i + ") incorrect.\n" +
521                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
522                    "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
523        }
524    }
525
526    // Check preceding()
527    for (i=t.dataToBreak.length(); i>=0; i--) {
528        int actualBreak = t.bi.preceding(i);
529        int expectedBreak = BreakIterator.DONE;
530
531        for (int j=i-1; j >= 0; j--) {
532            if (t.expectedBreaks[j] != 0) {
533                expectedBreak = j;
534                break;
535            }
536        }
537        if (expectedBreak != actualBreak) {
538            errln("preceding(" + i + ") incorrect.\n" +
539                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
540                    "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
541        }
542    }
543
544}
545
546
547
548
549}
550