1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4************************************************************************
5* Copyright (c) 1997-2016, International Business Machines
6* Corporation and others.  All Rights Reserved.
7************************************************************************
8*/
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_NORMALIZATION
13
14#include "unicode/uchar.h"
15#include "unicode/normlzr.h"
16#include "unicode/uniset.h"
17#include "unicode/putil.h"
18#include "cmemory.h"
19#include "cstring.h"
20#include "filestrm.h"
21#include "normconf.h"
22#include <stdio.h>
23
// Expands to one `case` of the switch in runIndexedTest(): stores the
// stringized test name in `name` and, only when `exec` is true, logs a
// "<test>---" header followed by an empty line and then calls the member
// function test() with no arguments.
// The expansion ends in `break` without a semicolon; the use site supplies it.
#define CASE(id,test,exec) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln((UnicodeString)""); \
                              test();                   \
                          }                             \
                          break
32
/**
 * Maps a test index to a test name and, when exec is true, runs that test
 * (each case is produced by the CASE macro).
 *
 * @param index which test to select
 * @param exec  if false, only the name is reported and nothing is run
 * @param name  output: the selected test's name, or "" for an unknown index
 *              (presumably how the framework detects the end of the test
 *              list -- confirm against IntlTest)
 */
void NormalizerConformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /*par*/) {
    switch (index) {
        CASE(0, TestConformance, exec);
        // TestConformance32 is only compiled in when both file I/O and
        // legacy conversion are enabled in the build configuration.
#if !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
        CASE(1, TestConformance32, exec);
#endif
        // CASE(2, TestCase6);
        default: name = ""; break;
    }
}
43
// Number of semicolon-separated columns (c1..c5) parsed from each test line.
#define FIELD_COUNT 5
45
46NormalizerConformanceTest::NormalizerConformanceTest() :
47    normalizer(UnicodeString(), UNORM_NFC) {}
48
49NormalizerConformanceTest::~NormalizerConformanceTest() {}
50
// More interesting conformance test cases that are not in the unicode.org
// NormalizationTest.txt.
// Each entry is parsed with hexsplit() exactly like a line of that file:
// five ';'-separated columns of space-separated hex code points, followed by
// a comment. TestConformance() runs these entries once the file is exhausted.
static const char *moreCases[]={
    // Markus 2001aug30
    "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",

    // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
    "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
};
59
60void NormalizerConformanceTest::compare(const UnicodeString& s1, const UnicodeString& s2){
61    UErrorCode status=U_ZERO_ERROR;
62     // TODO: Re-enable this tests after UTC fixes UAX 21
63    if(s1.indexOf((UChar32)0x0345)>=0)return;
64    if(Normalizer::compare(s1,s2,U_FOLD_CASE_DEFAULT,status)!=0){
65        errln("Normalizer::compare() failed for s1: " + prettify(s1) + " s2: " +prettify(s2));
66    }
67}
68
69FileStream *
70NormalizerConformanceTest::openNormalizationTestFile(const char *filename) {
71    char unidataPath[2000];
72    const char *folder;
73    FileStream *input;
74    UErrorCode errorCode;
75
76    // look inside ICU_DATA first
77    folder=pathToDataDirectory();
78    if(folder!=NULL) {
79        strcpy(unidataPath, folder);
80        strcat(unidataPath, "unidata" U_FILE_SEP_STRING);
81        strcat(unidataPath, filename);
82        input=T_FileStream_open(unidataPath, "rb");
83        if(input!=NULL) {
84            return input;
85        }
86    }
87
88    // find icu/source/data/unidata relative to the test data
89    errorCode=U_ZERO_ERROR;
90    folder=loadTestData(errorCode);
91    if(U_SUCCESS(errorCode)) {
92        strcpy(unidataPath, folder);
93        strcat(unidataPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
94                     U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
95                     U_FILE_SEP_STRING "data" U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
96        strcat(unidataPath, filename);
97        input=T_FileStream_open(unidataPath, "rb");
98        if(input!=NULL) {
99            return input;
100        }
101    }
102
103    // look in icu/source/test/testdata/out/build
104    errorCode=U_ZERO_ERROR;
105    folder=loadTestData(errorCode);
106    if(U_SUCCESS(errorCode)) {
107        strcpy(unidataPath, folder);
108        strcat(unidataPath, U_FILE_SEP_STRING);
109        strcat(unidataPath, filename);
110        input=T_FileStream_open(unidataPath, "rb");
111        if(input!=NULL) {
112            return input;
113        }
114    }
115
116    // look in icu/source/test/testdata
117    errorCode=U_ZERO_ERROR;
118    folder=loadTestData(errorCode);
119    if(U_SUCCESS(errorCode)) {
120        strcpy(unidataPath, folder);
121        strcat(unidataPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING);
122        strcat(unidataPath, filename);
123        input=T_FileStream_open(unidataPath, "rb");
124        if(input!=NULL) {
125            return input;
126        }
127    }
128
129    // find icu/source/data/unidata relative to U_TOPSRCDIR
130#if defined(U_TOPSRCDIR)
131    strcpy(unidataPath, U_TOPSRCDIR U_FILE_SEP_STRING "data" U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
132    strcat(unidataPath, filename);
133    input=T_FileStream_open(unidataPath, "rb");
134    if(input!=NULL) {
135        return input;
136    }
137
138    strcpy(unidataPath, U_TOPSRCDIR U_FILE_SEP_STRING "test" U_FILE_SEP_STRING "testdata" U_FILE_SEP_STRING);
139    strcat(unidataPath, filename);
140    input=T_FileStream_open(unidataPath, "rb");
141    if(input!=NULL) {
142        return input;
143    }
144#endif
145
146    dataerrln("Failed to open %s", filename);
147    return NULL;
148}
149
150/**
151 * Test the conformance of Normalizer to
152 * http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
153 */
154void NormalizerConformanceTest::TestConformance() {
155    TestConformance(openNormalizationTestFile("NormalizationTest.txt"), 0);
156}
157
158void NormalizerConformanceTest::TestConformance32() {
159    TestConformance(openNormalizationTestFile("NormalizationTest-3.2.0.txt"), UNORM_UNICODE_3_2);
160}
161
162void NormalizerConformanceTest::TestConformance(FileStream *input, int32_t options) {
163    enum { BUF_SIZE = 1024 };
164    char lineBuf[BUF_SIZE];
165    UnicodeString fields[FIELD_COUNT];
166    UErrorCode status = U_ZERO_ERROR;
167    int32_t passCount = 0;
168    int32_t failCount = 0;
169    UChar32 c;
170
171    if(input==NULL) {
172        return;
173    }
174
175    // UnicodeSet for all code points that are not mentioned in NormalizationTest.txt
176    UnicodeSet other(0, 0x10ffff);
177
178    int32_t count, countMoreCases = UPRV_LENGTHOF(moreCases);
179    for (count = 1;;++count) {
180        if (!T_FileStream_eof(input)) {
181            T_FileStream_readLine(input, lineBuf, (int32_t)sizeof(lineBuf));
182        } else {
183            // once NormalizationTest.txt is finished, use moreCases[]
184            if(count > countMoreCases) {
185                count = 0;
186            } else if(count == countMoreCases) {
187                // all done
188                break;
189            }
190            uprv_strcpy(lineBuf, moreCases[count]);
191        }
192        if (lineBuf[0] == 0 || lineBuf[0] == '\n' || lineBuf[0] == '\r') continue;
193
194        // Expect 5 columns of this format:
195        // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
196
197        // Parse out the comment.
198        if (lineBuf[0] == '#') continue;
199
200        // Read separator lines starting with '@'
201        if (lineBuf[0] == '@') {
202            logln(lineBuf);
203            continue;
204        }
205
206        // Parse out the fields
207        if (!hexsplit(lineBuf, ';', fields, FIELD_COUNT)) {
208            errln((UnicodeString)"Unable to parse line " + count);
209            break; // Syntax error
210        }
211
212        // Remove a single code point from the "other" UnicodeSet
213        if(fields[0].length()==fields[0].moveIndex32(0, 1)) {
214            c=fields[0].char32At(0);
215            if(0xac20<=c && c<=0xd73f && quick) {
216                // not an exhaustive test run: skip most Hangul syllables
217                if(c==0xac20) {
218                    other.remove(0xac20, 0xd73f);
219                }
220                continue;
221            }
222            other.remove(c);
223        }
224
225        if (checkConformance(fields, lineBuf, options, status)) {
226            ++passCount;
227        } else {
228            ++failCount;
229            if(status == U_FILE_ACCESS_ERROR) {
230              dataerrln("Something is wrong with the normalizer, skipping the rest of the test.");
231              break;
232            }
233        }
234        if ((count % 1000) == 0) {
235            logln("Line %d", count);
236        }
237    }
238
239    T_FileStream_close(input);
240
241    /*
242     * Test that all characters that are not mentioned
243     * as single code points in column 1
244     * do not change under any normalization.
245     */
246
247    // remove U+ffff because that is the end-of-iteration sentinel value
248    other.remove(0xffff);
249
250    for(c=0; c<=0x10ffff; quick ? c+=113 : ++c) {
251        if(0x30000<=c && c<0xe0000) {
252            c=0xe0000;
253        }
254        if(!other.contains(c)) {
255            continue;
256        }
257
258        fields[0]=fields[1]=fields[2]=fields[3]=fields[4].setTo(c);
259        sprintf(lineBuf, "not mentioned code point U+%04lx", (long)c);
260
261        if (checkConformance(fields, lineBuf, options, status)) {
262            ++passCount;
263        } else {
264            ++failCount;
265            if(status == U_FILE_ACCESS_ERROR) {
266              dataerrln("Something is wrong with the normalizer, skipping the rest of the test.: %s", u_errorName(status));
267              break;
268            }
269        }
270        if ((c % 0x1000) == 0) {
271            logln("Code point U+%04lx", c);
272        }
273    }
274
275    if (failCount != 0) {
276        dataerrln((UnicodeString)"Total: " + failCount + " lines/code points failed, " +
277              passCount + " lines/code points passed");
278    } else {
279        logln((UnicodeString)"Total: " + passCount + " lines/code points passed");
280    }
281}
282
283/**
284 * Verify the conformance of the given line of the Unicode
285 * normalization (UTR 15) test suite file.  For each line,
286 * there are five columns, corresponding to field[0]..field[4].
287 *
288 * The following invariants must be true for all conformant implementations
289 *  c2 == NFC(c1) == NFC(c2) == NFC(c3)
290 *  c3 == NFD(c1) == NFD(c2) == NFD(c3)
291 *  c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
292 *  c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
293 *
294 * @param field the 5 columns
295 * @param line the source line from the test suite file
296 * @return true if the test passes
297 */
298UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
299                                                  const char *line,
300                                                  int32_t options,
301                                                  UErrorCode &status) {
302    UBool pass = TRUE, result;
303    //UErrorCode status = U_ZERO_ERROR;
304    UnicodeString out, fcd;
305    int32_t fieldNum;
306
307    for (int32_t i=0; i<FIELD_COUNT; ++i) {
308        fieldNum = i+1;
309        if (i<3) {
310            Normalizer::normalize(field[i], UNORM_NFC, options, out, status);
311            if (U_FAILURE(status)) {
312                dataerrln("Error running normalize UNORM_NFC: %s", u_errorName(status));
313            } else {
314                pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c", fieldNum);
315                iterativeNorm(field[i], UNORM_NFC, options, out, +1);
316                pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c", fieldNum);
317                iterativeNorm(field[i], UNORM_NFC, options, out, -1);
318                pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c", fieldNum);
319            }
320
321            Normalizer::normalize(field[i], UNORM_NFD, options, out, status);
322            if (U_FAILURE(status)) {
323                dataerrln("Error running normalize UNORM_NFD: %s", u_errorName(status));
324            } else {
325                pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c", fieldNum);
326                iterativeNorm(field[i], UNORM_NFD, options, out, +1);
327                pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c", fieldNum);
328                iterativeNorm(field[i], UNORM_NFD, options, out, -1);
329                pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c", fieldNum);
330            }
331        }
332        Normalizer::normalize(field[i], UNORM_NFKC, options, out, status);
333        if (U_FAILURE(status)) {
334            dataerrln("Error running normalize UNORM_NFKC: %s", u_errorName(status));
335        } else {
336            pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c", fieldNum);
337            iterativeNorm(field[i], UNORM_NFKC, options, out, +1);
338            pass &= assertEqual("KC(+1)", field[i], out, field[3], "c4!=KC(c", fieldNum);
339            iterativeNorm(field[i], UNORM_NFKC, options, out, -1);
340            pass &= assertEqual("KC(-1)", field[i], out, field[3], "c4!=KC(c", fieldNum);
341        }
342
343        Normalizer::normalize(field[i], UNORM_NFKD, options, out, status);
344        if (U_FAILURE(status)) {
345            dataerrln("Error running normalize UNORM_NFKD: %s", u_errorName(status));
346        } else {
347            pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c", fieldNum);
348            iterativeNorm(field[i], UNORM_NFKD, options, out, +1);
349            pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c", fieldNum);
350            iterativeNorm(field[i], UNORM_NFKD, options, out, -1);
351            pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c", fieldNum);
352        }
353    }
354    compare(field[1],field[2]);
355    compare(field[0],field[1]);
356    // test quick checks
357    if(UNORM_NO == Normalizer::quickCheck(field[1], UNORM_NFC, options, status)) {
358        errln("Normalizer error: quickCheck(NFC(s), UNORM_NFC) is UNORM_NO");
359        pass = FALSE;
360    }
361    if(UNORM_NO == Normalizer::quickCheck(field[2], UNORM_NFD, options, status)) {
362        errln("Normalizer error: quickCheck(NFD(s), UNORM_NFD) is UNORM_NO");
363        pass = FALSE;
364    }
365    if(UNORM_NO == Normalizer::quickCheck(field[3], UNORM_NFKC, options, status)) {
366        errln("Normalizer error: quickCheck(NFKC(s), UNORM_NFKC) is UNORM_NO");
367        pass = FALSE;
368    }
369    if(UNORM_NO == Normalizer::quickCheck(field[4], UNORM_NFKD, options, status)) {
370        errln("Normalizer error: quickCheck(NFKD(s), UNORM_NFKD) is UNORM_NO");
371        pass = FALSE;
372    }
373
374    // branch on options==0 for better code coverage
375    if(options==0) {
376        result = Normalizer::isNormalized(field[1], UNORM_NFC, status);
377    } else {
378        result = Normalizer::isNormalized(field[1], UNORM_NFC, options, status);
379    }
380    if(!result) {
381        dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is FALSE");
382        pass = FALSE;
383    }
384    if(field[0]!=field[1] && Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
385        errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
386        pass = FALSE;
387    }
388    if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
389        dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
390        pass = FALSE;
391    }
392    if(field[0]!=field[3] && Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
393        errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
394        pass = FALSE;
395    }
396
397    // test FCD quick check and "makeFCD"
398    Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
399    if(UNORM_NO == Normalizer::quickCheck(fcd, UNORM_FCD, options, status)) {
400        errln("Normalizer error: quickCheck(FCD(s), UNORM_FCD) is UNORM_NO");
401        pass = FALSE;
402    }
403    if(UNORM_NO == Normalizer::quickCheck(field[2], UNORM_FCD, options, status)) {
404        errln("Normalizer error: quickCheck(NFD(s), UNORM_FCD) is UNORM_NO");
405        pass = FALSE;
406    }
407    if(UNORM_NO == Normalizer::quickCheck(field[4], UNORM_FCD, options, status)) {
408        errln("Normalizer error: quickCheck(NFKD(s), UNORM_FCD) is UNORM_NO");
409        pass = FALSE;
410    }
411
412    Normalizer::normalize(fcd, UNORM_NFD, options, out, status);
413    if(out != field[2]) {
414        dataerrln("Normalizer error: NFD(FCD(s))!=NFD(s)");
415        pass = FALSE;
416    }
417
418    if (U_FAILURE(status)) {
419        dataerrln("Normalizer::normalize returned error status: %s", u_errorName(status));
420        pass = FALSE;
421    }
422
423    if(field[0]!=field[2]) {
424        // two strings that are canonically equivalent must test
425        // equal under a canonical caseless match
426        // see UAX #21 Case Mappings and Jitterbug 2021 and
427        // Unicode Technical Committee meeting consensus 92-C31
428        int32_t rc;
429
430        status=U_ZERO_ERROR;
431        rc=Normalizer::compare(field[0], field[2], (options<<UNORM_COMPARE_NORM_OPTIONS_SHIFT)|U_COMPARE_IGNORE_CASE, status);
432        if(U_FAILURE(status)) {
433            dataerrln("Normalizer::compare(case-insensitive) sets %s", u_errorName(status));
434            pass=FALSE;
435        } else if(rc!=0) {
436            errln("Normalizer::compare(original, NFD, case-insensitive) returned %d instead of 0 for equal", rc);
437            pass=FALSE;
438        }
439    }
440
441    if (!pass) {
442        dataerrln("FAIL: %s", line);
443    }
444    return pass;
445}
446
447/**
448 * Do a normalization using the iterative API in the given direction.
449 * @param dir either +1 or -1
450 */
451void NormalizerConformanceTest::iterativeNorm(const UnicodeString& str,
452                                              UNormalizationMode mode, int32_t options,
453                                              UnicodeString& result,
454                                              int8_t dir) {
455    UErrorCode status = U_ZERO_ERROR;
456    normalizer.setText(str, status);
457    normalizer.setMode(mode);
458    normalizer.setOption(-1, 0);        // reset all options
459    normalizer.setOption(options, 1);   // set desired options
460    result.truncate(0);
461    if (U_FAILURE(status)) {
462        return;
463    }
464    UChar32 ch;
465    if (dir > 0) {
466        for (ch = normalizer.first(); ch != Normalizer::DONE;
467             ch = normalizer.next()) {
468            result.append(ch);
469        }
470    } else {
471        for (ch = normalizer.last(); ch != Normalizer::DONE;
472             ch = normalizer.previous()) {
473            result.insert(0, ch);
474        }
475    }
476}
477
478/**
479 * @param op name of normalization form, e.g., "KC"
480 * @param s string being normalized
481 * @param got value received
482 * @param exp expected value
483 * @param msg description of this test
484 * @param return true if got == exp
485 */
486UBool NormalizerConformanceTest::assertEqual(const char *op,
487                                             const UnicodeString& s,
488                                             const UnicodeString& got,
489                                             const UnicodeString& exp,
490                                             const char *msg,
491                                             int32_t field)
492{
493    if (exp == got)
494        return TRUE;
495
496    char *sChars, *gotChars, *expChars;
497    UnicodeString sPretty(prettify(s));
498    UnicodeString gotPretty(prettify(got));
499    UnicodeString expPretty(prettify(exp));
500
501    sChars = new char[sPretty.length() + 1];
502    gotChars = new char[gotPretty.length() + 1];
503    expChars = new char[expPretty.length() + 1];
504
505    sPretty.extract(0, sPretty.length(), sChars, sPretty.length() + 1);
506    sChars[sPretty.length()] = 0;
507    gotPretty.extract(0, gotPretty.length(), gotChars, gotPretty.length() + 1);
508    gotChars[gotPretty.length()] = 0;
509    expPretty.extract(0, expPretty.length(), expChars, expPretty.length() + 1);
510    expChars[expPretty.length()] = 0;
511
512    errln("    %s%d)%s(%s)=%s, exp. %s", msg, field, op, sChars, gotChars, expChars);
513
514    delete []sChars;
515    delete []gotChars;
516    delete []expChars;
517    return FALSE;
518}
519
520/**
521 * Split a string into pieces based on the given delimiter
522 * character.  Then, parse the resultant fields from hex into
523 * characters.  That is, "0040 0400;0C00;0899" -> new String[] {
524 * "\u0040\u0400", "\u0C00", "\u0899" }.  The output is assumed to
525 * be of the proper length already, and exactly output.length
526 * fields are parsed.  If there are too few an exception is
527 * thrown.  If there are too many the extras are ignored.
528 *
529 * @return FALSE upon failure
530 */
531UBool NormalizerConformanceTest::hexsplit(const char *s, char delimiter,
532                                          UnicodeString output[], int32_t outputLength) {
533    const char *t = s;
534    char *end = NULL;
535    UChar32 c;
536    int32_t i;
537    for (i=0; i<outputLength; ++i) {
538        // skip whitespace
539        while(*t == ' ' || *t == '\t') {
540            ++t;
541        }
542
543        // read a sequence of code points
544        output[i].remove();
545        for(;;) {
546            c = (UChar32)uprv_strtoul(t, &end, 16);
547
548            if( (char *)t == end ||
549                (uint32_t)c > 0x10ffff ||
550                (*end != ' ' && *end != '\t' && *end != delimiter)
551            ) {
552                errln(UnicodeString("Bad field ", "") + (i + 1) + " in " + UnicodeString(s, ""));
553                return FALSE;
554            }
555
556            output[i].append(c);
557
558            t = (const char *)end;
559
560            // skip whitespace
561            while(*t == ' ' || *t == '\t') {
562                ++t;
563            }
564
565            if(*t == delimiter) {
566                ++t;
567                break;
568            }
569            if(*t == 0) {
570                if((i + 1) == outputLength) {
571                    return TRUE;
572                } else {
573                    errln(UnicodeString("Missing field(s) in ", "") + s + " only " + (i + 1) + " out of " + outputLength);
574                    return FALSE;
575                }
576            }
577        }
578    }
579    return TRUE;
580}
581
582// Specific tests for debugging.  These are generally failures taken from
583// the conformance file, but culled out to make debugging easier.
584
585void NormalizerConformanceTest::TestCase6(void) {
586    _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
587}
588
589void NormalizerConformanceTest::_testOneLine(const char *line) {
590  UErrorCode status = U_ZERO_ERROR;
591    UnicodeString fields[FIELD_COUNT];
592    if (!hexsplit(line, ';', fields, FIELD_COUNT)) {
593        errln((UnicodeString)"Unable to parse line " + line);
594    } else {
595        checkConformance(fields, line, 0, status);
596    }
597}
598
599#endif /* #if !UCONFIG_NO_NORMALIZATION */
600