1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  bidiconf.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009oct16
14*   created by: Markus W. Scherer
15*
16*   BiDi conformance test, using the Unicode BidiTest.txt file.
17*/
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include "unicode/utypes.h"
23#include "unicode/ubidi.h"
24#include "unicode/errorcode.h"
25#include "unicode/localpointer.h"
26#include "unicode/putil.h"
27#include "unicode/unistr.h"
28#include "intltest.h"
29#include "uparse.h"
30
31class BiDiConformanceTest : public IntlTest {
32public:
33    BiDiConformanceTest() :
34        directionBits(0), lineNumber(0), levelsCount(0), orderingCount(0),
35        errorCount(0) {}
36
37    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
38
39    void TestBidiTest();
40private:
41    char *getUnidataPath(char path[]);
42
43    UBool parseLevels(const char *start);
44    UBool parseOrdering(const char *start);
45    UBool parseInputStringFromBiDiClasses(const char *&start);
46
47    UBool checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount,
48                      const char *paraLevelName);
49    UBool checkOrdering(UBiDi *ubidi, const char *paraLevelName);
50
51    void printErrorLine(const char *paraLevelName);
52
53    char line[10000];
54    UBiDiLevel levels[1000];
55    uint32_t directionBits;
56    int32_t ordering[1000];
57    int32_t lineNumber;
58    int32_t levelsCount;
59    int32_t orderingCount;
60    int32_t errorCount;
61    UnicodeString inputString;
62};
63
64extern IntlTest *createBiDiConformanceTest() {
65    return new BiDiConformanceTest();
66}
67
68void BiDiConformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char *par) {
69    if(exec) {
70        logln("TestSuite BiDiConformanceTest: ");
71    }
72    switch (index) {
73        TESTCASE(0, TestBidiTest);
74        default:
75            name="";
76            break; // needed to end the loop
77    }
78}
79
80// TODO: Move to a common place (IntlTest?) to avoid duplication with UnicodeTest (ucdtest.cpp).
81char *BiDiConformanceTest::getUnidataPath(char path[]) {
82    IcuTestErrorCode errorCode(*this, "getUnidataPath");
83    const int kUnicodeDataTxtLength=15;  // strlen("UnicodeData.txt")
84
85    // Look inside ICU_DATA first.
86    strcpy(path, pathToDataDirectory());
87    strcat(path, "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
88    FILE *f=fopen(path, "r");
89    if(f!=NULL) {
90        fclose(f);
91        *(strchr(path, 0)-kUnicodeDataTxtLength)=0;  // Remove the basename.
92        return path;
93    }
94
95    // As a fallback, try to guess where the source data was located
96    // at the time ICU was built, and look there.
97#   ifdef U_TOPSRCDIR
98        strcpy(path, U_TOPSRCDIR  U_FILE_SEP_STRING "data");
99#   else
100        strcpy(path, loadTestData(errorCode));
101        strcat(path, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
102                     U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
103                     U_FILE_SEP_STRING "data");
104#   endif
105    strcat(path, U_FILE_SEP_STRING);
106    strcat(path, "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
107    f=fopen(path, "r");
108    if(f!=NULL) {
109        fclose(f);
110        *(strchr(path, 0)-kUnicodeDataTxtLength)=0;  // Remove the basename.
111        return path;
112    }
113    return NULL;
114}
115
116U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
117
118// TODO: Make "public" in uparse.h.
119#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
120
121UBool BiDiConformanceTest::parseLevels(const char *start) {
122    directionBits=0;
123    levelsCount=0;
124    while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
125        if(*start=='x') {
126            levels[levelsCount++]=UBIDI_DEFAULT_LTR;
127            ++start;
128        } else {
129            char *end;
130            uint32_t value=(uint32_t)strtoul(start, &end, 10);
131            if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>(UBIDI_MAX_EXPLICIT_LEVEL+1)) {
132                errln("@Levels: parse error at %s", start);
133                return FALSE;
134            }
135            levels[levelsCount++]=(UBiDiLevel)value;
136            directionBits|=(1<<(value&1));
137            start=end;
138        }
139    }
140    return TRUE;
141}
142
143UBool BiDiConformanceTest::parseOrdering(const char *start) {
144    orderingCount=0;
145    while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
146        char *end;
147        uint32_t value=(uint32_t)strtoul(start, &end, 10);
148        if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>=1000) {
149            errln("@Reorder: parse error at %s", start);
150            return FALSE;
151        }
152        ordering[orderingCount++]=(int32_t)value;
153        start=end;
154    }
155    return TRUE;
156}
157
158static const UChar charFromBiDiClass[U_CHAR_DIRECTION_COUNT]={
159    0x6c,   // 'l' for L
160    0x52,   // 'R' for R
161    0x33,   // '3' for EN
162    0x2d,   // '-' for ES
163    0x25,   // '%' for ET
164    0x39,   // '9' for AN
165    0x2c,   // ',' for CS
166    0x2f,   // '/' for B
167    0x5f,   // '_' for S
168    0x20,   // ' ' for WS
169    0x3d,   // '=' for ON
170    0x65,   // 'e' for LRE
171    0x6f,   // 'o' for LRO
172    0x41,   // 'A' for AL
173    0x45,   // 'E' for RLE
174    0x4f,   // 'O' for RLO
175    0x2a,   // '*' for PDF
176    0x60,   // '`' for NSM
177    0x7c    // '|' for BN
178};
179
180U_CDECL_BEGIN
181
182static UCharDirection U_CALLCONV
183biDiConfUBiDiClassCallback(const void *context, UChar32 c) {
184    for(int i=0; i<U_CHAR_DIRECTION_COUNT; ++i) {
185        if(c==charFromBiDiClass[i]) {
186            return (UCharDirection)i;
187        }
188    }
189    // Character not in our hardcoded table.
190    // Should not occur during testing.
191    return U_BIDI_CLASS_DEFAULT;
192}
193
194U_CDECL_END
195
196static const int8_t biDiClassNameLengths[U_CHAR_DIRECTION_COUNT+1]={
197    1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 0
198};
199
200UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) {
201    inputString.remove();
202    /*
203     * Lengthy but fast BiDi class parser.
204     * A simple parser could terminate or extract the name string and use
205     *   int32_t biDiClassInt=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, bidiClassString);
206     * but that makes this test take significantly more time.
207     */
208    while(*start!=0 && *(start=u_skipWhitespace(start))!=0 && *start!=';') {
209        UCharDirection biDiClass=U_CHAR_DIRECTION_COUNT;
210        // Compare each character once until we have a match on
211        // a complete, short BiDi class name.
212        if(start[0]=='L') {
213            if(start[1]=='R') {
214                if(start[2]=='E') {
215                    biDiClass=U_LEFT_TO_RIGHT_EMBEDDING;
216                } else if(start[2]=='O') {
217                    biDiClass=U_LEFT_TO_RIGHT_OVERRIDE;
218                }
219            } else {
220                biDiClass=U_LEFT_TO_RIGHT;
221            }
222        } else if(start[0]=='R') {
223            if(start[1]=='L') {
224                if(start[2]=='E') {
225                    biDiClass=U_RIGHT_TO_LEFT_EMBEDDING;
226                } else if(start[2]=='O') {
227                    biDiClass=U_RIGHT_TO_LEFT_OVERRIDE;
228                }
229            } else {
230                biDiClass=U_RIGHT_TO_LEFT;
231            }
232        } else if(start[0]=='E') {
233            if(start[1]=='N') {
234                biDiClass=U_EUROPEAN_NUMBER;
235            } else if(start[1]=='S') {
236                biDiClass=U_EUROPEAN_NUMBER_SEPARATOR;
237            } else if(start[1]=='T') {
238                biDiClass=U_EUROPEAN_NUMBER_TERMINATOR;
239            }
240        } else if(start[0]=='A') {
241            if(start[1]=='L') {
242                biDiClass=U_RIGHT_TO_LEFT_ARABIC;
243            } else if(start[1]=='N') {
244                biDiClass=U_ARABIC_NUMBER;
245            }
246        } else if(start[0]=='C' && start[1]=='S') {
247            biDiClass=U_COMMON_NUMBER_SEPARATOR;
248        } else if(start[0]=='B') {
249            if(start[1]=='N') {
250                biDiClass=U_BOUNDARY_NEUTRAL;
251            } else {
252                biDiClass=U_BLOCK_SEPARATOR;
253            }
254        } else if(start[0]=='S') {
255            biDiClass=U_SEGMENT_SEPARATOR;
256        } else if(start[0]=='W' && start[1]=='S') {
257            biDiClass=U_WHITE_SPACE_NEUTRAL;
258        } else if(start[0]=='O' && start[1]=='N') {
259            biDiClass=U_OTHER_NEUTRAL;
260        } else if(start[0]=='P' && start[1]=='D' && start[2]=='F') {
261            biDiClass=U_POP_DIRECTIONAL_FORMAT;
262        } else if(start[0]=='N' && start[1]=='S' && start[2]=='M') {
263            biDiClass=U_DIR_NON_SPACING_MARK;
264        }
265        // Now we verify that the class name is terminated properly,
266        // and not just the start of a longer word.
267        int8_t biDiClassNameLength=biDiClassNameLengths[biDiClass];
268        char c=start[biDiClassNameLength];
269        if(biDiClass==U_CHAR_DIRECTION_COUNT || (!U_IS_INV_WHITESPACE(c) && c!=';' && c!=0)) {
270            errln("BiDi class string not recognized at %s", start);
271            return FALSE;
272        }
273        inputString.append(charFromBiDiClass[biDiClass]);
274        start+=biDiClassNameLength;
275    }
276    return TRUE;
277}
278
279void BiDiConformanceTest::TestBidiTest() {
280    IcuTestErrorCode errorCode(*this, "TestBidiTest");
281    const char *sourceTestDataPath=getSourceTestData(errorCode);
282    if(errorCode.logIfFailureAndReset("unable to find the source/test/testdata "
283                                      "folder (getSourceTestData())")) {
284        return;
285    }
286    char bidiTestPath[400];
287    strcpy(bidiTestPath, sourceTestDataPath);
288    strcat(bidiTestPath, "BidiTest.txt");
289    LocalStdioFilePointer bidiTestFile(fopen(bidiTestPath, "r"));
290    if(bidiTestFile.isNull()) {
291        errln("unable to open %s", bidiTestPath);
292        return;
293    }
294    LocalUBiDiPointer ubidi(ubidi_open());
295    ubidi_setClassCallback(ubidi.getAlias(), biDiConfUBiDiClassCallback, NULL,
296                           NULL, NULL, errorCode);
297    if(errorCode.logIfFailureAndReset("ubidi_setClassCallback()")) {
298        return;
299    }
300    lineNumber=0;
301    levelsCount=0;
302    orderingCount=0;
303    errorCount=0;
304    while(errorCount<10 && fgets(line, (int)sizeof(line), bidiTestFile.getAlias())!=NULL) {
305        ++lineNumber;
306        // Remove trailing comments and whitespace.
307        char *commentStart=strchr(line, '#');
308        if(commentStart!=NULL) {
309            *commentStart=0;
310        }
311        u_rtrim(line);
312        const char *start=u_skipWhitespace(line);
313        if(*start==0) {
314            continue;  // Skip empty and comment-only lines.
315        }
316        if(*start=='@') {
317            ++start;
318            if(0==strncmp(start, "Levels:", 7)) {
319                if(!parseLevels(start+7)) {
320                    return;
321                }
322            } else if(0==strncmp(start, "Reorder:", 8)) {
323                if(!parseOrdering(start+8)) {
324                    return;
325                }
326            }
327            // Skip unknown @Xyz: ...
328        } else {
329            if(!parseInputStringFromBiDiClasses(start)) {
330                return;
331            }
332            start=u_skipWhitespace(start);
333            if(*start!=';') {
334                errln("missing ; separator on input line %s", line);
335                return;
336            }
337            start=u_skipWhitespace(start+1);
338            char *end;
339            uint32_t bitset=(uint32_t)strtoul(start, &end, 10);
340            if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0)) {
341                errln("input bitset parse error at %s", start);
342                return;
343            }
344            // Loop over the bitset.
345            static const UBiDiLevel paraLevels[]={ UBIDI_DEFAULT_LTR, 0, 1 };
346            static const char *const paraLevelNames[]={ "auto/LTR", "LTR", "RTL" };
347            for(int i=0; i<=2; ++i) {
348                if(bitset&(1<<i)) {
349                    ubidi_setPara(ubidi.getAlias(), inputString.getBuffer(), inputString.length(),
350                                  paraLevels[i], NULL, errorCode);
351                    const UBiDiLevel *actualLevels=ubidi_getLevels(ubidi.getAlias(), errorCode);
352                    if(errorCode.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) {
353                        errln("Input line %d: %s", (int)lineNumber, line);
354                        return;
355                    }
356                    if(!checkLevels(actualLevels, ubidi_getProcessedLength(ubidi.getAlias()),
357                                    paraLevelNames[i])) {
358                        // continue outerLoop;  does not exist in C++
359                        // so just break out of the inner loop.
360                        break;
361                    }
362                    if(!checkOrdering(ubidi.getAlias(), paraLevelNames[i])) {
363                        // continue outerLoop;  does not exist in C++
364                        // so just break out of the inner loop.
365                        break;
366                    }
367                }
368            }
369        }
370    }
371}
372
373static UChar printLevel(UBiDiLevel level) {
374    if(level<UBIDI_DEFAULT_LTR) {
375        return 0x30+level;
376    } else {
377        return 0x78;  // 'x'
378    }
379}
380
381static uint32_t getDirectionBits(const UBiDiLevel actualLevels[], int32_t actualCount) {
382    uint32_t actualDirectionBits=0;
383    for(int32_t i=0; i<actualCount; ++i) {
384        actualDirectionBits|=(1<<(actualLevels[i]&1));
385    }
386    return actualDirectionBits;
387}
388
389UBool BiDiConformanceTest::checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount,
390                                       const char *paraLevelName) {
391    UBool isOk=TRUE;
392    if(levelsCount!=actualCount) {
393        errln("Wrong number of level values; expected %d actual %d",
394              (int)levelsCount, (int)actualCount);
395        isOk=FALSE;
396    } else {
397        for(int32_t i=0; i<actualCount; ++i) {
398            if(levels[i]!=actualLevels[i] && levels[i]<UBIDI_DEFAULT_LTR) {
399                if(directionBits!=3 && directionBits==getDirectionBits(actualLevels, actualCount)) {
400                    // ICU used a shortcut:
401                    // Since the text is unidirectional, it did not store the resolved
402                    // levels but just returns all levels as the paragraph level 0 or 1.
403                    // The reordering result is the same, so this is fine.
404                    break;
405                } else {
406                    errln("Wrong level value at index %d; expected %d actual %d",
407                          (int)i, levels[i], actualLevels[i]);
408                    isOk=FALSE;
409                    break;
410                }
411            }
412        }
413    }
414    if(!isOk) {
415        printErrorLine(paraLevelName);
416        UnicodeString els("Expected levels:   ");
417        int32_t i;
418        for(i=0; i<levelsCount; ++i) {
419            els.append((UChar)0x20).append(printLevel(levels[i]));
420        }
421        UnicodeString als("Actual   levels:   ");
422        for(i=0; i<actualCount; ++i) {
423            als.append((UChar)0x20).append(printLevel(actualLevels[i]));
424        }
425        errln(els);
426        errln(als);
427    }
428    return isOk;
429}
430
431// Note: ubidi_setReorderingOptions(ubidi, UBIDI_OPTION_REMOVE_CONTROLS);
432// does not work for custom BiDi class assignments
433// and anyway also removes LRM/RLM/ZWJ/ZWNJ which is not desirable here.
434// Therefore we just skip the indexes for BiDi controls while comparing
435// with the expected ordering that has them omitted.
436UBool BiDiConformanceTest::checkOrdering(UBiDi *ubidi, const char *paraLevelName) {
437    UBool isOk=TRUE;
438    IcuTestErrorCode errorCode(*this, "TestBidiTest/checkOrdering()");
439    int32_t resultLength=ubidi_getResultLength(ubidi);  // visual length including BiDi controls
440    int32_t i, visualIndex;
441    // Note: It should be faster to call ubidi_countRuns()/ubidi_getVisualRun()
442    // and loop over each run's indexes, but that seems unnecessary for this test code.
443    for(i=visualIndex=0; i<resultLength; ++i) {
444        int32_t logicalIndex=ubidi_getLogicalIndex(ubidi, i, errorCode);
445        if(errorCode.logIfFailureAndReset("ubidi_getLogicalIndex()")) {
446            errln("Input line %d: %s", (int)lineNumber, line);
447            return FALSE;
448        }
449        if(levels[logicalIndex]>=UBIDI_DEFAULT_LTR) {
450            continue;  // BiDi control, omitted from expected ordering.
451        }
452        if(visualIndex<orderingCount && logicalIndex!=ordering[visualIndex]) {
453            errln("Wrong ordering value at visual index %d; expected %d actual %d",
454                  (int)visualIndex, ordering[visualIndex], logicalIndex);
455            isOk=FALSE;
456            break;
457        }
458        ++visualIndex;
459    }
460    // visualIndex is now the visual length minus the BiDi controls,
461    // which should match the length of the BidiTest.txt ordering.
462    if(isOk && orderingCount!=visualIndex) {
463        errln("Wrong number of ordering values; expected %d actual %d",
464              (int)orderingCount, (int)visualIndex);
465        isOk=FALSE;
466    }
467    if(!isOk) {
468        printErrorLine(paraLevelName);
469        UnicodeString eord("Expected ordering: ");
470        for(i=0; i<orderingCount; ++i) {
471            eord.append((UChar)0x20).append((UChar)(0x30+ordering[i]));
472        }
473        UnicodeString aord("Actual   ordering: ");
474        for(i=0; i<resultLength; ++i) {
475            int32_t logicalIndex=ubidi_getLogicalIndex(ubidi, i, errorCode);
476            if(levels[logicalIndex]<UBIDI_DEFAULT_LTR) {
477                aord.append((UChar)0x20).append((UChar)(0x30+logicalIndex));
478            }
479        }
480        errln(eord);
481        errln(aord);
482    }
483    return isOk;
484}
485
486void BiDiConformanceTest::printErrorLine(const char *paraLevelName) {
487    ++errorCount;
488    errln("Input line %5d:   %s", (int)lineNumber, line);
489    errln(UnicodeString("Input string:       ")+inputString);
490    errln("Para level:         %s", paraLevelName);
491}
492