1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1997-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/*******************************************************************************
7*
8* File CUCDTST.C
9*
10* Modification History:
11*        Name                     Description
12*     Madhu Katragadda            Ported for C API, added tests for string functions
13********************************************************************************
14*/
15
16#include <string.h>
17#include <math.h>
18#include <stdlib.h>
19
20#include "unicode/utypes.h"
21#include "unicode/uchar.h"
22#include "unicode/putil.h"
23#include "unicode/ustring.h"
24#include "unicode/uloc.h"
25#include "unicode/unorm2.h"
26
27#include "cintltst.h"
28#include "putilimp.h"
29#include "uparse.h"
30#include "ucase.h"
31#include "ubidi_props.h"
32#include "uprops.h"
33#include "uset_imp.h"
34#include "usc_impl.h"
35#include "udatamem.h" /* for testing ucase_openBinary() */
36#include "cucdapi.h"
37
38#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
39
40/* prototypes --------------------------------------------------------------- */
41
42static void TestUpperLower(void);
43static void TestLetterNumber(void);
44static void TestMisc(void);
45static void TestPOSIX(void);
46static void TestControlPrint(void);
47static void TestIdentifier(void);
48static void TestUnicodeData(void);
49static void TestCodeUnit(void);
50static void TestCodePoint(void);
51static void TestCharLength(void);
52static void TestCharNames(void);
53static void TestMirroring(void);
54static void TestUScriptRunAPI(void);
55static void TestAdditionalProperties(void);
56static void TestNumericProperties(void);
57static void TestPropertyNames(void);
58static void TestPropertyValues(void);
59static void TestConsistency(void);
60static void TestUCase(void);
61static void TestUBiDiProps(void);
62static void TestCaseFolding(void);
63
64/* internal methods used */
65static int32_t MakeProp(char* str);
66static int32_t MakeDir(char* str);
67
68/* helpers ------------------------------------------------------------------ */
69
70static void
71parseUCDFile(const char *filename,
72             char *fields[][2], int32_t fieldCount,
73             UParseLineFn *lineFn, void *context,
74             UErrorCode *pErrorCode) {
75    char path[256];
76    char backupPath[256];
77
78    if(U_FAILURE(*pErrorCode)) {
79        return;
80    }
81
82    /* Look inside ICU_DATA first */
83    strcpy(path, u_getDataDirectory());
84    strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
85    strcat(path, filename);
86
87    /* As a fallback, try to guess where the source data was located
88     *    at the time ICU was built, and look there.
89     */
90    strcpy(backupPath, ctest_dataSrcDir());
91    strcat(backupPath, U_FILE_SEP_STRING);
92    strcat(backupPath, "unidata" U_FILE_SEP_STRING);
93    strcat(backupPath, filename);
94
95    u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
96    if(*pErrorCode==U_FILE_ACCESS_ERROR) {
97        *pErrorCode=U_ZERO_ERROR;
98        u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
99    }
100    if(U_FAILURE(*pErrorCode)) {
101        log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
102    }
103}
104
105/* test data ---------------------------------------------------------------- */
106
107static const UChar  LAST_CHAR_CODE_IN_FILE = 0xFFFD;
108static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
109static const int32_t tagValues[] =
110    {
111    /* Mn */ U_NON_SPACING_MARK,
112    /* Mc */ U_COMBINING_SPACING_MARK,
113    /* Me */ U_ENCLOSING_MARK,
114    /* Nd */ U_DECIMAL_DIGIT_NUMBER,
115    /* Nl */ U_LETTER_NUMBER,
116    /* No */ U_OTHER_NUMBER,
117    /* Zs */ U_SPACE_SEPARATOR,
118    /* Zl */ U_LINE_SEPARATOR,
119    /* Zp */ U_PARAGRAPH_SEPARATOR,
120    /* Cc */ U_CONTROL_CHAR,
121    /* Cf */ U_FORMAT_CHAR,
122    /* Cs */ U_SURROGATE,
123    /* Co */ U_PRIVATE_USE_CHAR,
124    /* Cn */ U_UNASSIGNED,
125    /* Lu */ U_UPPERCASE_LETTER,
126    /* Ll */ U_LOWERCASE_LETTER,
127    /* Lt */ U_TITLECASE_LETTER,
128    /* Lm */ U_MODIFIER_LETTER,
129    /* Lo */ U_OTHER_LETTER,
130    /* Pc */ U_CONNECTOR_PUNCTUATION,
131    /* Pd */ U_DASH_PUNCTUATION,
132    /* Ps */ U_START_PUNCTUATION,
133    /* Pe */ U_END_PUNCTUATION,
134    /* Po */ U_OTHER_PUNCTUATION,
135    /* Sm */ U_MATH_SYMBOL,
136    /* Sc */ U_CURRENCY_SYMBOL,
137    /* Sk */ U_MODIFIER_SYMBOL,
138    /* So */ U_OTHER_SYMBOL,
139    /* Pi */ U_INITIAL_PUNCTUATION,
140    /* Pf */ U_FINAL_PUNCTUATION
141    };
142
143static const char dirStrings[][5] = {
144    "L",
145    "R",
146    "EN",
147    "ES",
148    "ET",
149    "AN",
150    "CS",
151    "B",
152    "S",
153    "WS",
154    "ON",
155    "LRE",
156    "LRO",
157    "AL",
158    "RLE",
159    "RLO",
160    "PDF",
161    "NSM",
162    "BN"
163};
164
165void addUnicodeTest(TestNode** root);
166
167void addUnicodeTest(TestNode** root)
168{
169    addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
170    addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
171    addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
172    addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
173    addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
174    addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
175    addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
176    addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
177    addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
178    addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
179    addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
180    addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
181    addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
182    addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
183    addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
184    addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
185    addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
186    addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
187    addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
188    addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
189    addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
190    addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
191    addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase");
192    addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
193    addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
194}
195
196/*==================================================== */
197/* test u_toupper() and u_tolower()                    */
198/*==================================================== */
199static void TestUpperLower()
200{
201    const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
202    const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
203    U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
204    U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
205    int32_t i;
206
207    U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
208    U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
209
210/*
211Checks LetterLike Symbols which were previously a source of confusion
212[Bertrand A. D. 02/04/98]
213*/
214    for (i=0x2100;i<0x2138;i++)
215    {
216        /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
217        if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
218        {
219            if (i != (int)u_tolower(i)) /* itself */
220                log_err("Failed case conversion with itself: U+%04x\n", i);
221            if (i != (int)u_toupper(i))
222                log_err("Failed case conversion with itself: U+%04x\n", i);
223        }
224    }
225
226    for(i=0; i < u_strlen(upper); i++){
227        if(u_tolower(upper[i]) != lower[i]){
228            log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
229        }
230    }
231
232    log_verbose("testing upper lower\n");
233    for (i = 0; i < 21; i++) {
234
235        if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
236        {
237            log_err("Failed isLowerCase test at  %c\n", upperTest[i]);
238        }
239        else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
240         {
241            log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
242        }
243        else if (upperTest[i] != u_tolower(lowerTest[i]))
244        {
245            log_err("Failed case conversion from %c  To %c :\n", lowerTest[i], upperTest[i]);
246        }
247        else if (lowerTest[i] != u_toupper(upperTest[i]))
248         {
249            log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
250        }
251        else if (upperTest[i] != u_tolower(upperTest[i]))
252        {
253            log_err("Failed case conversion with itself: %c\n", upperTest[i]);
254        }
255        else if (lowerTest[i] != u_toupper(lowerTest[i]))
256        {
257            log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
258        }
259    }
260    log_verbose("done testing upper lower\n");
261
262    log_verbose("testing u_istitle\n");
263    {
264        static const UChar expected[] = {
265            0x1F88,
266            0x1F89,
267            0x1F8A,
268            0x1F8B,
269            0x1F8C,
270            0x1F8D,
271            0x1F8E,
272            0x1F8F,
273            0x1F88,
274            0x1F89,
275            0x1F8A,
276            0x1F8B,
277            0x1F8C,
278            0x1F8D,
279            0x1F8E,
280            0x1F8F,
281            0x1F98,
282            0x1F99,
283            0x1F9A,
284            0x1F9B,
285            0x1F9C,
286            0x1F9D,
287            0x1F9E,
288            0x1F9F,
289            0x1F98,
290            0x1F99,
291            0x1F9A,
292            0x1F9B,
293            0x1F9C,
294            0x1F9D,
295            0x1F9E,
296            0x1F9F,
297            0x1FA8,
298            0x1FA9,
299            0x1FAA,
300            0x1FAB,
301            0x1FAC,
302            0x1FAD,
303            0x1FAE,
304            0x1FAF,
305            0x1FA8,
306            0x1FA9,
307            0x1FAA,
308            0x1FAB,
309            0x1FAC,
310            0x1FAD,
311            0x1FAE,
312            0x1FAF,
313            0x1FBC,
314            0x1FBC,
315            0x1FCC,
316            0x1FCC,
317            0x1FFC,
318            0x1FFC,
319        };
320        int32_t num = sizeof(expected)/sizeof(expected[0]);
321        for(i=0; i<num; i++){
322            if(!u_istitle(expected[i])){
323                log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
324            }
325        }
326
327    }
328}
329
330/* compare two sets and verify that their difference or intersection is empty */
331static UBool
332showADiffB(const USet *a, const USet *b,
333           const char *a_name, const char *b_name,
334           UBool expect, UBool diffIsError) {
335    USet *aa;
336    int32_t i, start, end, length;
337    UErrorCode errorCode;
338
339    /*
340     * expect:
341     * TRUE  -> a-b should be empty, that is, b should contain all of a
342     * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
343     */
344    if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
345        return TRUE;
346    }
347
348    /* clone a to aa because a is const */
349    aa=uset_open(1, 0);
350    if(aa==NULL) {
351        /* unusual problem - out of memory? */
352        return FALSE;
353    }
354    uset_addAll(aa, a);
355
356    /* compute the set in question */
357    if(expect) {
358        /* a-b */
359        uset_removeAll(aa, b);
360    } else {
361        /* a&b */
362        uset_retainAll(aa, b);
363    }
364
365    /* aa is not empty because of the initial tests above; show its contents */
366    errorCode=U_ZERO_ERROR;
367    i=0;
368    for(;;) {
369        length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
370        if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
371            break; /* done */
372        }
373        if(U_FAILURE(errorCode)) {
374            log_err("error comparing %s with %s at difference item %d: %s\n",
375                a_name, b_name, i, u_errorName(errorCode));
376            break;
377        }
378        if(length!=0) {
379            break; /* done with code points, got a string or -1 */
380        }
381
382        if(diffIsError) {
383            if(expect) {
384                log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
385            } else {
386                log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
387            }
388        } else {
389            if(expect) {
390                log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
391            } else {
392                log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
393            }
394        }
395
396        ++i;
397    }
398
399    uset_close(aa);
400    return FALSE;
401}
402
403static UBool
404showAMinusB(const USet *a, const USet *b,
405            const char *a_name, const char *b_name,
406            UBool diffIsError) {
407    return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
408}
409
410static UBool
411showAIntersectB(const USet *a, const USet *b,
412                const char *a_name, const char *b_name,
413                UBool diffIsError) {
414    return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
415}
416
417static UBool
418compareUSets(const USet *a, const USet *b,
419             const char *a_name, const char *b_name,
420             UBool diffIsError) {
421    /*
422     * Use an arithmetic & not a logical && so that both branches
423     * are always taken and all differences are shown.
424     */
425    return
426        showAMinusB(a, b, a_name, b_name, diffIsError) &
427        showAMinusB(b, a, b_name, a_name, diffIsError);
428}
429
/* test isLetter (u_isalpha()), isDigit (u_isdigit()) and u_isalnum() */
431static void TestLetterNumber()
432{
433    UChar i = 0x0000;
434
435    log_verbose("Testing for isalpha\n");
436    for (i = 0x0041; i < 0x005B; i++) {
437        if (!u_isalpha(i))
438        {
439            log_err("Failed isLetter test at  %.4X\n", i);
440        }
441    }
442    for (i = 0x0660; i < 0x066A; i++) {
443        if (u_isalpha(i))
444        {
445            log_err("Failed isLetter test with numbers at %.4X\n", i);
446        }
447    }
448
449    log_verbose("Testing for isdigit\n");
450    for (i = 0x0660; i < 0x066A; i++) {
451        if (!u_isdigit(i))
452        {
453            log_verbose("Failed isNumber test at %.4X\n", i);
454        }
455    }
456
457    log_verbose("Testing for isalnum\n");
458    for (i = 0x0041; i < 0x005B; i++) {
459        if (!u_isalnum(i))
460        {
461            log_err("Failed isAlNum test at  %.4X\n", i);
462        }
463    }
464    for (i = 0x0660; i < 0x066A; i++) {
465        if (!u_isalnum(i))
466        {
467            log_err("Failed isAlNum test at  %.4X\n", i);
468        }
469    }
470
471    {
472        /*
473         * The following checks work only starting from Unicode 4.0.
474         * Check the version number here.
475         */
476        static UVersionInfo u401={ 4, 0, 1, 0 };
477        UVersionInfo version;
478        u_getUnicodeVersion(version);
479        if(version[0]<4 || 0==memcmp(version, u401, 4)) {
480            return;
481        }
482    }
483
484    {
485        /*
486         * Sanity check:
487         * Verify that exactly the digit characters have decimal digit values.
488         * This assumption is used in the implementation of u_digit()
489         * (which checks nt=de)
490         * compared with the parallel java.lang.Character.digit()
491         * (which checks Nd).
492         *
493         * This was not true in Unicode 3.2 and earlier.
494         * Unicode 4.0 fixed discrepancies.
495         * Unicode 4.0.1 re-introduced problems in this area due to an
496         * unintentionally incomplete last-minute change.
497         */
498        U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
499        U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
500
501        USet *digits, *decimalValues;
502        UErrorCode errorCode;
503
504        U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
505        U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
506        errorCode=U_ZERO_ERROR;
507        digits=uset_openPattern(digitsPattern, 6, &errorCode);
508        decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
509
510        if(U_SUCCESS(errorCode)) {
511            compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
512        }
513
514        uset_close(digits);
515        uset_close(decimalValues);
516    }
517}
518
519static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
520                                const UChar32 *sampleChars, int32_t sampleCharsLength,
521                                UBool expected) {
522    int32_t i;
523    for (i = 0; i < sampleCharsLength; ++i) {
524        UBool result = propFn(sampleChars[i]);
525        if (result != expected) {
526            log_err("error: character property function %s(U+%04x)=%d is wrong\n",
527                    propName, sampleChars[i], result);
528        }
529    }
530}
531
/* Tests for isDefined (u_isdefined()), isBaseForm (u_isbase()), isSpaceChar (u_isspace()), isWhiteSpace (u_isWhitespace()), u_charDigitValue() */
533static void TestMisc()
534{
535    static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
536    static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
537    static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
538    static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
539    static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
540    static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
541/*    static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
542    static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
543    static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
544    static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
545    static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
546
547    static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
548
549    uint32_t mask;
550
551    int32_t i;
552    char icuVersion[U_MAX_VERSION_STRING_LENGTH];
553    UVersionInfo realVersion;
554
555    memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
556
557    testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, LENGTHOF(sampleSpaces), TRUE);
558    testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE);
559
560    testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
561                        sampleSpaces, LENGTHOF(sampleSpaces), TRUE);
562    testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
563                        sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE);
564
565    testSampleCharProps(u_isWhitespace, "u_isWhitespace",
566                        sampleWhiteSpaces, LENGTHOF(sampleWhiteSpaces), TRUE);
567    testSampleCharProps(u_isWhitespace, "u_isWhitespace",
568                        sampleNonWhiteSpaces, LENGTHOF(sampleNonWhiteSpaces), FALSE);
569
570    testSampleCharProps(u_isdefined, "u_isdefined",
571                        sampleDefined, LENGTHOF(sampleDefined), TRUE);
572    testSampleCharProps(u_isdefined, "u_isdefined",
573                        sampleUndefined, LENGTHOF(sampleUndefined), FALSE);
574
575    testSampleCharProps(u_isbase, "u_isbase", sampleBase, LENGTHOF(sampleBase), TRUE);
576    testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, LENGTHOF(sampleNonBase), FALSE);
577
578    testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, LENGTHOF(sampleDigits), TRUE);
579    testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, LENGTHOF(sampleNonDigits), FALSE);
580
581    for (i = 0; i < LENGTHOF(sampleDigits); i++) {
582        if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
583            log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
584                    sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
585        }
586    }
587
588    /* Tests the ICU version #*/
589    u_getVersion(realVersion);
590    u_versionToString(realVersion, icuVersion);
591    if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
592    {
593        log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
594    }
595#if defined(ICU_VERSION)
596    /* test only happens where we have configure.in with VERSION - sanity check. */
597    if(strcmp(U_ICU_VERSION, ICU_VERSION))
598    {
599        log_err("ICU version mismatch: Header says %s, build environment says %s.\n",  U_ICU_VERSION, ICU_VERSION);
600    }
601#endif
602
603    /* test U_GC_... */
604    if(
605        U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
606        U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
607        U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
608        U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
609        U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
610        U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
611    ) {
612        log_err("error: U_GET_GC_MASK does not work properly\n");
613    }
614
615    mask=0;
616    mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
617
618    mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
619    mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
620    mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
621    mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
622    mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
623
624    mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
625    mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
626    mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
627
628    mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
629    mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
630    mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
631
632    mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
633    mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
634    mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
635
636    mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
637    mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
638    mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
639    mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
640
641    mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
642    mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
643    mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
644    mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
645    mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
646
647    mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
648    mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
649    mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
650    mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
651
652    mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
653    mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
654
655    if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
656        log_err("error: problems with U_GC_XX_MASK constants\n");
657    }
658
659    mask=0;
660    mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
661    mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
662    mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
663    mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
664    mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
665    mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
666    mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
667
668    if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
669        log_err("error: problems with U_GC_Y_MASK constants\n");
670    }
671    {
672        static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
673        for(i=0; i<10; i++){
674            if(digit[i]!=u_forDigit(i,10)){
675                log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
676            }
677        }
678    }
679
680    /* test u_digit() */
681    {
682        static const struct {
683            UChar32 c;
684            int8_t radix, value;
685        } data[]={
686            /* base 16 */
687            { 0x0031, 16, 1 },
688            { 0x0038, 16, 8 },
689            { 0x0043, 16, 12 },
690            { 0x0066, 16, 15 },
691            { 0x00e4, 16, -1 },
692            { 0x0662, 16, 2 },
693            { 0x06f5, 16, 5 },
694            { 0xff13, 16, 3 },
695            { 0xff41, 16, 10 },
696
697            /* base 8 */
698            { 0x0031, 8, 1 },
699            { 0x0038, 8, -1 },
700            { 0x0043, 8, -1 },
701            { 0x0066, 8, -1 },
702            { 0x00e4, 8, -1 },
703            { 0x0662, 8, 2 },
704            { 0x06f5, 8, 5 },
705            { 0xff13, 8, 3 },
706            { 0xff41, 8, -1 },
707
708            /* base 36 */
709            { 0x5a, 36, 35 },
710            { 0x7a, 36, 35 },
711            { 0xff3a, 36, 35 },
712            { 0xff5a, 36, 35 },
713
714            /* wrong radix values */
715            { 0x0031, 1, -1 },
716            { 0xff3a, 37, -1 }
717        };
718
719        for(i=0; i<LENGTHOF(data); ++i) {
720            if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
721                log_err("u_digit(U+%04x, %d)=%d expected %d\n",
722                        data[i].c,
723                        data[i].radix,
724                        u_digit(data[i].c, data[i].radix),
725                        data[i].value);
726            }
727        }
728    }
729}
730
731/* test C/POSIX-style functions --------------------------------------------- */
732
733/* bit flags */
734#define ISAL     1
735#define ISLO     2
736#define ISUP     4
737
738#define ISDI     8
739#define ISXD  0x10
740
741#define ISAN  0x20
742
743#define ISPU  0x40
744#define ISGR  0x80
745#define ISPR 0x100
746
747#define ISSP 0x200
748#define ISBL 0x400
749#define ISCN 0x800
750
751/* C/POSIX-style functions, in the same order as the bit flags */
752typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
753
754static const struct {
755    IsPOSIXClass *fn;
756    const char *name;
757} posixClasses[]={
758    { u_isalpha, "isalpha" },
759    { u_islower, "islower" },
760    { u_isupper, "isupper" },
761    { u_isdigit, "isdigit" },
762    { u_isxdigit, "isxdigit" },
763    { u_isalnum, "isalnum" },
764    { u_ispunct, "ispunct" },
765    { u_isgraph, "isgraph" },
766    { u_isprint, "isprint" },
767    { u_isspace, "isspace" },
768    { u_isblank, "isblank" },
769    { u_iscntrl, "iscntrl" }
770};
771
772static const struct {
773    UChar32 c;
774    uint32_t posixResults;
775} posixData[]={
776    { 0x0008,                                                        ISCN },    /* backspace */
777    { 0x0009,                                              ISSP|ISBL|ISCN },    /* TAB */
778    { 0x000a,                                              ISSP|     ISCN },    /* LF */
779    { 0x000c,                                              ISSP|     ISCN },    /* FF */
780    { 0x000d,                                              ISSP|     ISCN },    /* CR */
781    { 0x0020,                                         ISPR|ISSP|ISBL      },    /* space */
782    { 0x0021,                               ISPU|ISGR|ISPR                },    /* ! */
783    { 0x0033,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* 3 */
784    { 0x0040,                               ISPU|ISGR|ISPR                },    /* @ */
785    { 0x0041, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* A */
786    { 0x007a, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* z */
787    { 0x007b,                               ISPU|ISGR|ISPR                },    /* { */
788    { 0x0085,                                              ISSP|     ISCN },    /* NEL */
789    { 0x00a0,                                         ISPR|ISSP|ISBL      },    /* NBSP */
790    { 0x00a4,                                    ISGR|ISPR                },    /* currency sign */
791    { 0x00e4, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* a-umlaut */
792    { 0x0300,                                    ISGR|ISPR                },    /* combining grave */
793    { 0x0600,                                                        ISCN },    /* arabic number sign */
794    { 0x0627, ISAL|                    ISAN|     ISGR|ISPR                },    /* alef */
795    { 0x0663,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* arabic 3 */
796    { 0x2002,                                         ISPR|ISSP|ISBL      },    /* en space */
797    { 0x2007,                                         ISPR|ISSP|ISBL      },    /* figure space */
798    { 0x2009,                                         ISPR|ISSP|ISBL      },    /* thin space */
799    { 0x200b,                                                        ISCN },    /* ZWSP */
800  /*{ 0x200b,                                         ISPR|ISSP           },*/    /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
801    { 0x200e,                                                        ISCN },    /* LRM */
802    { 0x2028,                                         ISPR|ISSP|     ISCN },    /* LS */
803    { 0x2029,                                         ISPR|ISSP|     ISCN },    /* PS */
804    { 0x20ac,                                    ISGR|ISPR                },    /* Euro */
805    { 0xff15,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth 5 */
806    { 0xff25, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth E */
807    { 0xff35, ISAL|     ISUP|          ISAN|     ISGR|ISPR                },    /* fullwidth U */
808    { 0xff45, ISAL|ISLO|          ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth e */
809    { 0xff55, ISAL|ISLO|               ISAN|     ISGR|ISPR                }     /* fullwidth u */
810};
811
812static void
813TestPOSIX() {
814    uint32_t mask;
815    int32_t cl, i;
816    UBool expect;
817
818    mask=1;
819    for(cl=0; cl<12; ++cl) {
820        for(i=0; i<LENGTHOF(posixData); ++i) {
821            expect=(UBool)((posixData[i].posixResults&mask)!=0);
822            if(posixClasses[cl].fn(posixData[i].c)!=expect) {
823                log_err("u_%s(U+%04x)=%s is wrong\n",
824                    posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
825            }
826        }
827        mask<<=1;
828    }
829}
830
831/* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
832static void TestControlPrint()
833{
834    const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
835    const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
836    const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
837    const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
838    UChar32 c;
839
840    testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, LENGTHOF(sampleControl), TRUE);
841    testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, LENGTHOF(sampleNonControl), FALSE);
842
843    testSampleCharProps(u_isprint, "u_isprint",
844                        samplePrintable, LENGTHOF(samplePrintable), TRUE);
845    testSampleCharProps(u_isprint, "u_isprint",
846                        sampleNonPrintable, LENGTHOF(sampleNonPrintable), FALSE);
847
848    /* test all ISO 8 controls */
849    for(c=0; c<=0x9f; ++c) {
850        if(c==0x20) {
851            /* skip ASCII graphic characters and continue with DEL */
852            c=0x7f;
853        }
854        if(!u_iscntrl(c)) {
855            log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
856        }
857        if(!u_isISOControl(c)) {
858            log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
859        }
860        if(u_isprint(c)) {
861            log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
862        }
863    }
864
865    /* test all Latin-1 graphic characters */
866    for(c=0x20; c<=0xff; ++c) {
867        if(c==0x7f) {
868            c=0xa0;
869        } else if(c==0xad) {
870            /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
871            ++c;
872        }
873        if(!u_isprint(c)) {
874            log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
875        }
876    }
877}
878
879/* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
880static void TestIdentifier()
881{
882    const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
883    const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
884    const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
885    const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
886    const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
887    const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
888    const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
889    const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
890    const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
891    const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
892
893    testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
894                        sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE);
895    testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
896                        sampleNonJavaIDStart, LENGTHOF(sampleNonJavaIDStart), FALSE);
897
898    testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
899                        sampleJavaIDPart, LENGTHOF(sampleJavaIDPart), TRUE);
900    testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
901                        sampleNonJavaIDPart, LENGTHOF(sampleNonJavaIDPart), FALSE);
902
903    /* IDPart should imply IDStart */
904    testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
905                        sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE);
906
907    testSampleCharProps(u_isIDStart, "u_isIDStart",
908                        sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE);
909    testSampleCharProps(u_isIDStart, "u_isIDStart",
910                        sampleNonUnicodeIDStart, LENGTHOF(sampleNonUnicodeIDStart), FALSE);
911
912    testSampleCharProps(u_isIDPart, "u_isIDPart",
913                        sampleUnicodeIDPart, LENGTHOF(sampleUnicodeIDPart), TRUE);
914    testSampleCharProps(u_isIDPart, "u_isIDPart",
915                        sampleNonUnicodeIDPart, LENGTHOF(sampleNonUnicodeIDPart), FALSE);
916
917    /* IDPart should imply IDStart */
918    testSampleCharProps(u_isIDPart, "u_isIDPart",
919                        sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE);
920
921    testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
922                        sampleIDIgnore, LENGTHOF(sampleIDIgnore), TRUE);
923    testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
924                        sampleNonIDIgnore, LENGTHOF(sampleNonIDIgnore), FALSE);
925}
926
927/* for each line of UnicodeData.txt, check some of the properties */
928/*
929 * ### TODO
930 * This test fails incorrectly if the First or Last code point of a repetitive area
931 * is overridden, which is allowed and is encouraged for the PUAs.
932 * Currently, this means that both area First/Last and override lines are
933 * tested against the properties from the API,
934 * and the area boundary will not match and cause an error.
935 *
936 * This function should detect area boundaries and skip them for the test of individual
937 * code points' properties.
938 * Then it should check that the areas contain all the same properties except where overridden.
939 * For this, it would have had to set a flag for which code points were listed explicitly.
940 */
941static void U_CALLCONV
942unicodeDataLineFn(void *context,
943                  char *fields[][2], int32_t fieldCount,
944                  UErrorCode *pErrorCode)
945{
946    char buffer[100];
947    char *end;
948    uint32_t value;
949    UChar32 c;
950    int32_t i;
951    int8_t type;
952
953    /* get the character code, field 0 */
954    c=strtoul(fields[0][0], &end, 16);
955    if(end<=fields[0][0] || end!=fields[0][1]) {
956        log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
957        return;
958    }
959    if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
960        log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
961        return;
962    }
963
964    /* get general category, field 2 */
965    *fields[2][1]=0;
966    type = (int8_t)tagValues[MakeProp(fields[2][0])];
967    if(u_charType(c)!=type) {
968        log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
969    }
970    if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
971        log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
972    }
973
974    /* get canonical combining class, field 3 */
975    value=strtoul(fields[3][0], &end, 10);
976    if(end<=fields[3][0] || end!=fields[3][1]) {
977        log_err("error: syntax error in field 3 at code 0x%lx\n", c);
978        return;
979    }
980    if(value>255) {
981        log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
982        return;
983    }
984#if !UCONFIG_NO_NORMALIZATION
985    if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
986        log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
987    }
988#endif
989
990    /* get BiDi category, field 4 */
991    *fields[4][1]=0;
992    i=MakeDir(fields[4][0]);
993    if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
994        log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
995    }
996
997    /* get ISO Comment, field 11 */
998    *fields[11][1]=0;
999    i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1000    if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1001        log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1002            c, u_errorName(*pErrorCode),
1003            U_FAILURE(*pErrorCode) ? buffer : "[error]",
1004            fields[11][0]);
1005    }
1006
1007    /* get uppercase mapping, field 12 */
1008    if(fields[12][0]!=fields[12][1]) {
1009        value=strtoul(fields[12][0], &end, 16);
1010        if(end!=fields[12][1]) {
1011            log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1012            return;
1013        }
1014        if((UChar32)value!=u_toupper(c)) {
1015            log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1016        }
1017    } else {
1018        /* no case mapping: the API must map the code point to itself */
1019        if(c!=u_toupper(c)) {
1020            log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1021        }
1022    }
1023
1024    /* get lowercase mapping, field 13 */
1025    if(fields[13][0]!=fields[13][1]) {
1026        value=strtoul(fields[13][0], &end, 16);
1027        if(end!=fields[13][1]) {
1028            log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1029            return;
1030        }
1031        if((UChar32)value!=u_tolower(c)) {
1032            log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1033        }
1034    } else {
1035        /* no case mapping: the API must map the code point to itself */
1036        if(c!=u_tolower(c)) {
1037            log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1038        }
1039    }
1040
1041    /* get titlecase mapping, field 14 */
1042    if(fields[14][0]!=fields[14][1]) {
1043        value=strtoul(fields[14][0], &end, 16);
1044        if(end!=fields[14][1]) {
1045            log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1046            return;
1047        }
1048        if((UChar32)value!=u_totitle(c)) {
1049            log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1050        }
1051    } else {
1052        /* no case mapping: the API must map the code point to itself */
1053        if(c!=u_totitle(c)) {
1054            log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1055        }
1056    }
1057}
1058
1059static UBool U_CALLCONV
1060enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1061    static const UChar32 test[][2]={
1062        {0x41, U_UPPERCASE_LETTER},
1063        {0x308, U_NON_SPACING_MARK},
1064        {0xfffe, U_GENERAL_OTHER_TYPES},
1065        {0xe0041, U_FORMAT_CHAR},
1066        {0xeffff, U_UNASSIGNED}
1067    };
1068
1069    int32_t i, count;
1070
1071    if(0!=strcmp((const char *)context, "a1")) {
1072        log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1073        return FALSE;
1074    }
1075
1076    count=LENGTHOF(test);
1077    for(i=0; i<count; ++i) {
1078        if(start<=test[i][0] && test[i][0]<limit) {
1079            if(type!=(UCharCategory)test[i][1]) {
1080                log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1081                        start, limit, (long)type, test[i][0], test[i][1]);
1082            }
1083            /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1084            return i==(count-1) ? FALSE : TRUE;
1085        }
1086    }
1087
1088    if(start>test[count-1][0]) {
1089        log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1090                start, limit, (long)type);
1091        return FALSE;
1092    }
1093
1094    return TRUE;
1095}
1096
1097static UBool U_CALLCONV
1098enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1099    /* default Bidi classes for unassigned code points */
1100    static const int32_t defaultBidi[][2]={ /* { limit, class } */
1101        { 0x0590, U_LEFT_TO_RIGHT },
1102        { 0x0600, U_RIGHT_TO_LEFT },
1103        { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1104        { 0x0900, U_RIGHT_TO_LEFT },
1105        { 0xFB1D, U_LEFT_TO_RIGHT },
1106        { 0xFB50, U_RIGHT_TO_LEFT },
1107        { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1108        { 0xFE70, U_LEFT_TO_RIGHT },
1109        { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1110        { 0x10800, U_LEFT_TO_RIGHT },
1111        { 0x11000, U_RIGHT_TO_LEFT },
1112        { 0x1E800, U_LEFT_TO_RIGHT },  /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1113        { 0x1F000, U_RIGHT_TO_LEFT },
1114        { 0x110000, U_LEFT_TO_RIGHT }
1115    };
1116
1117    UChar32 c;
1118    int32_t i;
1119    UCharDirection shouldBeDir;
1120
1121    /*
1122     * LineBreak.txt specifies:
1123     *   #  - Assigned characters that are not listed explicitly are given the value
1124     *   #    "AL".
1125     *   #  - Unassigned characters are given the value "XX".
1126     *
1127     * PUA characters are listed explicitly with "XX".
1128     * Verify that no assigned character has "XX".
1129     */
1130    if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1131        c=start;
1132        while(c<limit) {
1133            if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1134                log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1135            }
1136            ++c;
1137        }
1138    }
1139
1140    /*
1141     * Verify default Bidi classes.
1142     * For recent Unicode versions, see UCD.html.
1143     *
1144     * For older Unicode versions:
1145     * See table 3-7 "Bidirectional Character Types" in UAX #9.
1146     * http://www.unicode.org/reports/tr9/
1147     *
1148     * See also DerivedBidiClass.txt for Cn code points!
1149     *
1150     * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html)
1151     * changed some default values.
1152     * In particular, non-characters and unassigned Default Ignorable Code Points
1153     * change from L to BN.
1154     *
1155     * UCD.html version 4.0.1 does not yet reflect these changes.
1156     */
1157    if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1158        /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1159        c=start;
1160        for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) {
1161            if((int32_t)c<defaultBidi[i][0]) {
1162                while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1163                    if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1164                        shouldBeDir=U_BOUNDARY_NEUTRAL;
1165                    } else {
1166                        shouldBeDir=(UCharDirection)defaultBidi[i][1];
1167                    }
1168
1169                    if( u_charDirection(c)!=shouldBeDir ||
1170                        u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1171                    ) {
1172                        log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1173                            c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1174                    }
1175                    ++c;
1176                }
1177            }
1178        }
1179    }
1180
1181    return TRUE;
1182}
1183
1184/* tests for several properties */
1185static void TestUnicodeData()
1186{
1187    UVersionInfo expectVersionArray;
1188    UVersionInfo versionArray;
1189    char *fields[15][2];
1190    UErrorCode errorCode;
1191    UChar32 c;
1192    int8_t type;
1193
1194    u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1195    u_getUnicodeVersion(versionArray);
1196    if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1197    {
1198        log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1199        versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1200    }
1201
1202#if defined(ICU_UNICODE_VERSION)
1203    /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1204    if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1205    {
1206         log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1207    }
1208#endif
1209
1210    if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1211        log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1212    }
1213
1214    errorCode=U_ZERO_ERROR;
1215    parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, NULL, &errorCode);
1216    if(U_FAILURE(errorCode)) {
1217        return; /* if we couldn't parse UnicodeData.txt, we should return */
1218    }
1219
1220    /* sanity check on repeated properties */
1221    for(c=0xfffe; c<=0x10ffff;) {
1222        type=u_charType(c);
1223        if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1224            log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1225        }
1226        if(type!=U_UNASSIGNED) {
1227            log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1228        }
1229        if((c&0xffff)==0xfffe) {
1230            ++c;
1231        } else {
1232            c+=0xffff;
1233        }
1234    }
1235
1236    /* test that PUA is not "unassigned" */
1237    for(c=0xe000; c<=0x10fffd;) {
1238        type=u_charType(c);
1239        if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1240            log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1241        }
1242        if(type==U_UNASSIGNED) {
1243            log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1244        } else if(type!=U_PRIVATE_USE_CHAR) {
1245            log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1246        }
1247        if(c==0xf8ff) {
1248            c=0xf0000;
1249        } else if(c==0xffffd) {
1250            c=0x100000;
1251        } else {
1252            ++c;
1253        }
1254    }
1255
1256    /* test u_enumCharTypes() */
1257    u_enumCharTypes(enumTypeRange, "a1");
1258
1259    /* check default properties */
1260    u_enumCharTypes(enumDefaultsRange, NULL);
1261}
1262
1263static void TestCodeUnit(){
1264    const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1265
1266    int32_t i;
1267
1268    for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){
1269        UChar c=codeunit[i];
1270        if(i<4){
1271            if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1272                log_err("ERROR: U+%04x is a single", c);
1273            }
1274
1275        }
1276        if(i >= 4 && i< 8){
1277            if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1278                log_err("ERROR: U+%04x is a first surrogate", c);
1279            }
1280        }
1281        if(i >= 8 && i< 12){
1282            if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1283                log_err("ERROR: U+%04x is a second surrogate", c);
1284            }
1285        }
1286    }
1287
1288}
1289
1290static void TestCodePoint(){
1291    const UChar32 codePoint[]={
1292        /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1293        0xd800,
1294        0xdbff,
1295        0xdc00,
1296        0xdfff,
1297        0xdc04,
1298        0xd821,
1299        /*not a surrogate, valid, isUnicodeChar , not Error*/
1300        0x20ac,
1301        0xd7ff,
1302        0xe000,
1303        0xe123,
1304        0x0061,
1305        0xe065,
1306        0x20402,
1307        0x24506,
1308        0x23456,
1309        0x20402,
1310        0x10402,
1311        0x23456,
1312        /*not a surrogate, not valid, isUnicodeChar, isError */
1313        0x0015,
1314        0x009f,
1315        /*not a surrogate, not valid, not isUnicodeChar, isError */
1316        0xffff,
1317        0xfffe,
1318    };
1319    int32_t i;
1320    for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){
1321        UChar32 c=codePoint[i];
1322        if(i<6){
1323            if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1324                log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1325            }
1326            if(UTF_IS_VALID(c)){
1327                log_err("ERROR: isValid() failed for U+%04x\n", c);
1328            }
1329            if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1330                log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1331            }
1332            if(UTF_IS_ERROR(c)){
1333                log_err("ERROR: isError() failed for U+%04x\n", c);
1334            }
1335        }else if(i >=6 && i<18){
1336            if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1337                log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1338            }
1339            if(!UTF_IS_VALID(c)){
1340                log_err("ERROR: isValid() failed for U+%04x\n", c);
1341            }
1342            if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1343                log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1344            }
1345            if(UTF_IS_ERROR(c)){
1346                log_err("ERROR: isError() failed for U+%04x\n", c);
1347            }
1348        }else if(i >=18 && i<20){
1349            if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1350                log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1351            }
1352            if(UTF_IS_VALID(c)){
1353                log_err("ERROR: isValid() failed for U+%04x\n", c);
1354            }
1355            if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1356                log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1357            }
1358            if(!UTF_IS_ERROR(c)){
1359                log_err("ERROR: isError() failed for U+%04x\n", c);
1360            }
1361        }
1362        else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
1363            if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1364                log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1365            }
1366            if(UTF_IS_VALID(c)){
1367                log_err("ERROR: isValid() failed for U+%04x\n", c);
1368            }
1369            if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1370                log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1371            }
1372            if(!UTF_IS_ERROR(c)){
1373                log_err("ERROR: isError() failed for U+%04x\n", c);
1374            }
1375        }
1376    }
1377
1378    if(
1379        !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1380        !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1381        U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1382        U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1383    ) {
1384        log_err("error with U_IS_BMP()\n");
1385    }
1386
1387    if(
1388        U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1389        U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1390        U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1391        !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1392    ) {
1393        log_err("error with U_IS_SUPPLEMENTARY()\n");
1394    }
1395}
1396
1397static void TestCharLength()
1398{
1399    const int32_t codepoint[]={
1400        1, 0x0061,
1401        1, 0xe065,
1402        1, 0x20ac,
1403        2, 0x20402,
1404        2, 0x23456,
1405        2, 0x24506,
1406        2, 0x20402,
1407        2, 0x10402,
1408        1, 0xd7ff,
1409        1, 0xe000
1410    };
1411
1412    int32_t i;
1413    UBool multiple;
1414    for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
1415        UChar32 c=codepoint[i+1];
1416        if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1417            log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], UTF_CHAR_LENGTH(c));
1418        }
1419        multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1420        if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1421            log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1422        }
1423    }
1424}
1425
1426/*internal functions ----*/
1427static int32_t MakeProp(char* str)
1428{
1429    int32_t result = 0;
1430    char* matchPosition =0;
1431
1432    matchPosition = strstr(tagStrings, str);
1433    if (matchPosition == 0)
1434    {
1435        log_err("unrecognized type letter ");
1436        log_err(str);
1437    }
1438    else
1439        result = (int32_t)((matchPosition - tagStrings) / 2);
1440    return result;
1441}
1442
1443static int32_t MakeDir(char* str)
1444{
1445    int32_t pos = 0;
1446    for (pos = 0; pos < 19; pos++) {
1447        if (strcmp(str, dirStrings[pos]) == 0) {
1448            return pos;
1449        }
1450    }
1451    return -1;
1452}
1453
1454/* test u_charName() -------------------------------------------------------- */
1455
/*
 * Expected character names for a few sample code points.
 * Rows that list only three strings leave alias unset, which makes it NULL.
 */
static const struct {
    /* the code point that the names belong to */
    uint32_t code;
    /*
     * name:    expected result for U_UNICODE_CHAR_NAME
     * oldName: expected result for U_UNICODE_10_CHAR_NAME ("" if none is expected)
     * extName: expected result for U_EXTENDED_CHAR_NAME
     * alias:   expected result for U_CHAR_NAME_ALIAS (may be NULL)
     */
    const char *name, *oldName, *extName, *alias;
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"},
    {0x01a2, "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER O I",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
             "LATIN SMALL LETTER DOTLESS J BAR HOOK",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" },
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" },
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" },
    {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" },
    {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" },
    /* surrogates and noncharacters have only an extended name */
    {0xd800, "", "", "<lead surrogate-D800>" },
    {0xdc00, "", "", "<trail surrogate-DC00>" },
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "FULLWIDTH OPENING PARENTHESIS", "FULLWIDTH LEFT PARENTHESIS" },
    {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" },
    {0xffff, "", "", "<noncharacter-FFFF>" },
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" }
};
1485
1486static UBool
1487enumCharNamesFn(void *context,
1488                UChar32 code, UCharNameChoice nameChoice,
1489                const char *name, int32_t length) {
1490    int32_t *pCount=(int32_t *)context;
1491    const char *expected;
1492    int i;
1493
1494    if(length<=0 || length!=(int32_t)strlen(name)) {
1495        /* should not be called with an empty string or invalid length */
1496        log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1497        return TRUE;
1498    }
1499
1500    ++*pCount;
1501    for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) {
1502        if(code==(UChar32)names[i].code) {
1503            switch (nameChoice) {
1504                case U_EXTENDED_CHAR_NAME:
1505                    if(0!=strcmp(name, names[i].extName)) {
1506                        log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1507                    }
1508                    break;
1509                case U_UNICODE_CHAR_NAME:
1510                    if(0!=strcmp(name, names[i].name)) {
1511                        log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1512                    }
1513                    break;
1514                case U_UNICODE_10_CHAR_NAME:
1515                    expected=names[i].oldName;
1516                    if(expected[0]==0 || 0!=strcmp(name, expected)) {
1517                        log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1518                    }
1519                    break;
1520                case U_CHAR_NAME_ALIAS:
1521                    expected=names[i].alias;
1522                    if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1523                        log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1524                    }
1525                    break;
1526                case U_CHAR_NAME_CHOICE_COUNT:
1527                    break;
1528            }
1529            break;
1530        }
1531    }
1532    return TRUE;
1533}
1534
/* State carried through an enumeration with enumExtCharNamesFn(). */
struct enumExtCharNamesContext {
    /* counter whose address is passed on to enumCharNamesFn() as its context */
    uint32_t length;
    /* the code point of the most recent callback */
    int32_t last;
};
1539
1540static UBool
1541enumExtCharNamesFn(void *context,
1542                UChar32 code, UCharNameChoice nameChoice,
1543                const char *name, int32_t length) {
1544    struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1545
1546    if (ecncp->last != (int32_t) code - 1) {
1547        if (ecncp->last < 0) {
1548            log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1549        } else {
1550            log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1551        }
1552    }
1553    ecncp->last = (int32_t) code;
1554
1555    if (!*name) {
1556        log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1557    }
1558
1559    return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1560}
1561
1562/**
1563 * This can be made more efficient by moving it into putil.c and having
1564 * it directly access the ebcdic translation tables.
1565 * TODO: If we get this method in putil.c, then delete it from here.
1566 */
1567static UChar
1568u_charToUChar(char c) {
1569    UChar uc;
1570    u_charsToUChars(&c, &uc, 1);
1571    return uc;
1572}
1573
1574static void
1575TestCharNames() {
1576    static char name[80];
1577    UErrorCode errorCode=U_ZERO_ERROR;
1578    struct enumExtCharNamesContext extContext;
1579    const char *expected;
1580    int32_t length;
1581    UChar32 c;
1582    int32_t i;
1583
1584    log_verbose("Testing uprv_getMaxCharNameLength()\n");
1585    length=uprv_getMaxCharNameLength();
1586    if(length==0) {
1587        /* no names data available */
1588        return;
1589    }
1590    if(length<83) { /* Unicode 3.2 max char name length */
1591        log_err("uprv_getMaxCharNameLength()=%d is too short");
1592    }
1593    /* ### TODO same tests for max ISO comment length as for max name length */
1594
1595    log_verbose("Testing u_charName()\n");
1596    for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) {
1597        /* modern Unicode character name */
1598        length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1599        if(U_FAILURE(errorCode)) {
1600            log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1601            return;
1602        }
1603        if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1604            log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1605        }
1606
1607        /* find the modern name */
1608        if (*names[i].name) {
1609            c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1610            if(U_FAILURE(errorCode)) {
1611                log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1612                return;
1613            }
1614            if(c!=(UChar32)names[i].code) {
1615                log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1616            }
1617        }
1618
1619        /* Unicode 1.0 character name */
1620        length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1621        if(U_FAILURE(errorCode)) {
1622            log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1623            return;
1624        }
1625        if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1626            log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1627        }
1628
1629        /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1630        if(names[i].oldName[0]!=0 /* && length>0 */) {
1631            c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1632            if(U_FAILURE(errorCode)) {
1633                log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1634                return;
1635            }
1636            if(c!=(UChar32)names[i].code) {
1637                log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1638            }
1639        }
1640
1641        /* Unicode character name alias */
1642        length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1643        if(U_FAILURE(errorCode)) {
1644            log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1645            return;
1646        }
1647        expected=names[i].alias;
1648        if(expected==NULL) {
1649            expected="";
1650        }
1651        if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1652            log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1653                    names[i].code, name, length, expected);
1654        }
1655
1656        /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1657        if(expected[0]!=0 /* && length>0 */) {
1658            c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1659            if(U_FAILURE(errorCode)) {
1660                log_err("u_charFromName(%s - alias) error %s\n",
1661                        expected, u_errorName(errorCode));
1662                return;
1663            }
1664            if(c!=(UChar32)names[i].code) {
1665                log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1666                        expected, c, names[i].code);
1667            }
1668        }
1669    }
1670
1671    /* test u_enumCharNames() */
1672    length=0;
1673    errorCode=U_ZERO_ERROR;
1674    u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1675    if(U_FAILURE(errorCode) || length<94140) {
1676        log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1677    }
1678
1679    extContext.length = 0;
1680    extContext.last = -1;
1681    errorCode=U_ZERO_ERROR;
1682    u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1683    if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1684        log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1685    }
1686
1687    /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1688    if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1689        log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1690    }
1691
1692    /* Test getCharNameCharacters */
1693    if(!getTestOption(QUICK_OPTION)) {
1694        enum { BUFSIZE = 256 };
1695        UErrorCode ec = U_ZERO_ERROR;
1696        char buf[BUFSIZE];
1697        int32_t maxLength;
1698        UChar32 cp;
1699        UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1700        int32_t l1, l2;
1701        UBool map[256];
1702        UBool ok;
1703
1704        USet* set = uset_open(1, 0); /* empty set */
1705        USet* dumb = uset_open(1, 0); /* empty set */
1706
1707        /*
1708         * uprv_getCharNameCharacters() will likely return more lowercase
1709         * letters than actual character names contain because
1710         * it includes all the characters in lowercased names of
1711         * general categories, for the full possible set of extended names.
1712         */
1713        {
1714            USetAdder sa={
1715                NULL,
1716                uset_add,
1717                uset_addRange,
1718                uset_addString,
1719                NULL /* don't need remove() */
1720            };
1721            sa.set=set;
1722            uprv_getCharNameCharacters(&sa);
1723        }
1724
1725        /* build set the dumb (but sure-fire) way */
1726        for (i=0; i<256; ++i) {
1727            map[i] = FALSE;
1728        }
1729
1730        maxLength=0;
1731        for (cp=0; cp<0x110000; ++cp) {
1732            int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1733                                     buf, BUFSIZE, &ec);
1734            if (U_FAILURE(ec)) {
1735                log_err("FAIL: u_charName failed when it shouldn't\n");
1736                uset_close(set);
1737                uset_close(dumb);
1738                return;
1739            }
1740            if(len>maxLength) {
1741                maxLength=len;
1742            }
1743
1744            for (i=0; i<len; ++i) {
1745                if (!map[(uint8_t) buf[i]]) {
1746                    uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1747                    map[(uint8_t) buf[i]] = TRUE;
1748                }
1749            }
1750
1751            /* test for leading/trailing whitespace */
1752            if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1753                log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1754            }
1755        }
1756
1757        if(map[(uint8_t)'\t']) {
1758            log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1759        }
1760
1761        length=uprv_getMaxCharNameLength();
1762        if(length!=maxLength) {
1763            log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1764                    length, maxLength);
1765        }
1766
1767        /* compare the sets.  Where is my uset_equals?!! */
1768        ok=TRUE;
1769        for(i=0; i<256; ++i) {
1770            if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1771                if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1772                    /* ignore lowercase a-z that are in set but not in dumb */
1773                    ok=TRUE;
1774                } else {
1775                    ok=FALSE;
1776                    break;
1777                }
1778            }
1779        }
1780
1781        l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1782        l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1783        if (U_FAILURE(ec)) {
1784            log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1785            uset_close(set);
1786            uset_close(dumb);
1787            return;
1788        }
1789
1790        if (l1 >= BUFSIZE) {
1791            l1 = BUFSIZE-1;
1792            pat[l1] = 0;
1793        }
1794        if (l2 >= BUFSIZE) {
1795            l2 = BUFSIZE-1;
1796            dumbPat[l2] = 0;
1797        }
1798
1799        if (!ok) {
1800            log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1801                    aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1802        } else if(getTestOption(VERBOSITY_OPTION)) {
1803            log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1804        }
1805
1806        uset_close(set);
1807        uset_close(dumb);
1808    }
1809
1810    /* ### TODO: test error cases and other interesting things */
1811}
1812
1813/* test u_isMirrored() and u_charMirror() ----------------------------------- */
1814
1815static void
1816TestMirroring() {
1817    USet *set;
1818    UErrorCode errorCode;
1819
1820    UChar32 start, end, c2, c3;
1821    int32_t i;
1822
1823    U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1824
1825    U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1826
1827    log_verbose("Testing u_isMirrored()\n");
1828    if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1829         !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1830        )
1831    ) {
1832        log_err("u_isMirrored() does not work correctly\n");
1833    }
1834
1835    log_verbose("Testing u_charMirror()\n");
1836    if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
1837         u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
1838         u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
1839         /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
1840         u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
1841         )
1842    ) {
1843        log_err("u_charMirror() does not work correctly\n");
1844    }
1845
1846    /* verify that Bidi_Mirroring_Glyph roundtrips */
1847    errorCode=U_ZERO_ERROR;
1848    set=uset_openPattern(mirroredPattern, 17, &errorCode);
1849
1850    if (U_FAILURE(errorCode)) {
1851        log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
1852    } else {
1853        for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
1854            do {
1855                c2=u_charMirror(start);
1856                c3=u_charMirror(c2);
1857                if(c3!=start) {
1858                    log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
1859                }
1860            } while(++start<=end);
1861        }
1862    }
1863
1864    uset_close(set);
1865}
1866
1867
1868struct RunTestData
1869{
1870    const char *runText;
1871    UScriptCode runCode;
1872};
1873
1874typedef struct RunTestData RunTestData;
1875
1876static void
1877CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
1878                const char *prefix)
1879{
1880    int32_t run, runStart, runLimit;
1881    UScriptCode runCode;
1882
1883    /* iterate over all the runs */
1884    run = 0;
1885    while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
1886        if (runStart != runStarts[run]) {
1887            log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
1888                prefix, run, runStarts[run], runStart);
1889        }
1890
1891        if (runLimit != runStarts[run + 1]) {
1892            log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
1893                prefix, run, runStarts[run + 1], runLimit);
1894        }
1895
1896        if (runCode != testData[run].runCode) {
1897            log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
1898                prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
1899        }
1900
1901        run += 1;
1902
1903        /* stop when we've seen all the runs we expect to see */
1904        if (run >= nRuns) {
1905            break;
1906        }
1907    }
1908
1909    /* Complain if we didn't see then number of runs we expected */
1910    if (run != nRuns) {
1911        log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
1912    }
1913}
1914
1915static void
1916TestUScriptRunAPI()
1917{
1918    static const RunTestData testData1[] = {
1919        {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
1920        {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
1921        {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
1922        {"English (", USCRIPT_LATIN},
1923        {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
1924        {") ", USCRIPT_LATIN},
1925        {"\\u6F22\\u5B75", USCRIPT_HAN},
1926        {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
1927        {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
1928        {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
1929    };
1930
1931    static const RunTestData testData2[] = {
1932       {"((((((((((abc))))))))))", USCRIPT_LATIN}
1933    };
1934
1935    static const struct {
1936      const RunTestData *testData;
1937      int32_t nRuns;
1938    } testDataEntries[] = {
1939        {testData1, LENGTHOF(testData1)},
1940        {testData2, LENGTHOF(testData2)}
1941    };
1942
1943    static const int32_t nTestEntries = LENGTHOF(testDataEntries);
1944    int32_t testEntry;
1945
1946    for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
1947        UChar testString[1024];
1948        int32_t runStarts[256];
1949        int32_t nTestRuns = testDataEntries[testEntry].nRuns;
1950        const RunTestData *testData = testDataEntries[testEntry].testData;
1951
1952        int32_t run, stringLimit;
1953        UScriptRun *scriptRun = NULL;
1954        UErrorCode err;
1955
1956        /*
1957         * Fill in the test string and the runStarts array.
1958         */
1959        stringLimit = 0;
1960        for (run = 0; run < nTestRuns; run += 1) {
1961            runStarts[run] = stringLimit;
1962            stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
1963            /*stringLimit -= 1;*/
1964        }
1965
1966        /* The limit of the last run */
1967        runStarts[nTestRuns] = stringLimit;
1968
1969        /*
1970         * Make sure that calling uscript_OpenRun with a NULL text pointer
1971         * and a non-zero text length returns the correct error.
1972         */
1973        err = U_ZERO_ERROR;
1974        scriptRun = uscript_openRun(NULL, stringLimit, &err);
1975
1976        if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1977            log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1978        }
1979
1980        if (scriptRun != NULL) {
1981            log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
1982            uscript_closeRun(scriptRun);
1983        }
1984
1985        /*
1986         * Make sure that calling uscript_OpenRun with a non-NULL text pointer
1987         * and a zero text length returns the correct error.
1988         */
1989        err = U_ZERO_ERROR;
1990        scriptRun = uscript_openRun(testString, 0, &err);
1991
1992        if (err != U_ILLEGAL_ARGUMENT_ERROR) {
1993            log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
1994        }
1995
1996        if (scriptRun != NULL) {
1997            log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
1998            uscript_closeRun(scriptRun);
1999        }
2000
2001        /*
2002         * Make sure that calling uscript_openRun with a NULL text pointer
2003         * and a zero text length doesn't return an error.
2004         */
2005        err = U_ZERO_ERROR;
2006        scriptRun = uscript_openRun(NULL, 0, &err);
2007
2008        if (U_FAILURE(err)) {
2009            log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2010        }
2011
2012        /* Make sure that the empty iterator doesn't find any runs */
2013        if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2014            log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2015        }
2016
2017        /*
2018         * Make sure that calling uscript_setRunText with a NULL text pointer
2019         * and a non-zero text length returns the correct error.
2020         */
2021        err = U_ZERO_ERROR;
2022        uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2023
2024        if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2025            log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2026        }
2027
2028        /*
2029         * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2030         * and a zero text length returns the correct error.
2031         */
2032        err = U_ZERO_ERROR;
2033        uscript_setRunText(scriptRun, testString, 0, &err);
2034
2035        if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2036            log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2037        }
2038
2039        /*
2040         * Now call uscript_setRunText on the empty iterator
2041         * and make sure that it works.
2042         */
2043        err = U_ZERO_ERROR;
2044        uscript_setRunText(scriptRun, testString, stringLimit, &err);
2045
2046        if (U_FAILURE(err)) {
2047            log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2048        } else {
2049            CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2050        }
2051
2052        uscript_closeRun(scriptRun);
2053
2054        /*
2055         * Now open an interator over the testString
2056         * using uscript_openRun and make sure that it works
2057         */
2058        scriptRun = uscript_openRun(testString, stringLimit, &err);
2059
2060        if (U_FAILURE(err)) {
2061            log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2062        } else {
2063            CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2064        }
2065
2066        /* Now reset the iterator, and make sure
2067         * that it still works.
2068         */
2069        uscript_resetRun(scriptRun);
2070
2071        CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2072
2073        /* Close the iterator */
2074        uscript_closeRun(scriptRun);
2075    }
2076}
2077
2078/* test additional, non-core properties */
2079static void
2080TestAdditionalProperties() {
2081    /* test data for u_charAge() */
2082    static const struct {
2083        UChar32 c;
2084        UVersionInfo version;
2085    } charAges[]={
2086        {0x41,    { 1, 1, 0, 0 }},
2087        {0xffff,  { 1, 1, 0, 0 }},
2088        {0x20ab,  { 2, 0, 0, 0 }},
2089        {0x2fffe, { 2, 0, 0, 0 }},
2090        {0x20ac,  { 2, 1, 0, 0 }},
2091        {0xfb1d,  { 3, 0, 0, 0 }},
2092        {0x3f4,   { 3, 1, 0, 0 }},
2093        {0x10300, { 3, 1, 0, 0 }},
2094        {0x220,   { 3, 2, 0, 0 }},
2095        {0xff60,  { 3, 2, 0, 0 }}
2096    };
2097
2098    /* test data for u_hasBinaryProperty() */
2099    static const int32_t
2100    props[][3]={ /* code point, property, value */
2101        { 0x0627, UCHAR_ALPHABETIC, TRUE },
2102        { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2103        { 0x2028, UCHAR_ALPHABETIC, FALSE },
2104
2105        { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2106        { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2107
2108        { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2109        { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2110
2111        { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2112        { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2113
2114        /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2115        { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2116        { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2117        { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2118        { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2119
2120        { 0x058a, UCHAR_DASH, TRUE },
2121        { 0x007e, UCHAR_DASH, FALSE },
2122
2123        { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2124        { 0x3000, UCHAR_DIACRITIC, FALSE },
2125
2126        { 0x0e46, UCHAR_EXTENDER, TRUE },
2127        { 0x0020, UCHAR_EXTENDER, FALSE },
2128
2129#if !UCONFIG_NO_NORMALIZATION
2130        { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2131        { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2132        { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2133
2134        { 0x110a, UCHAR_NFD_INERT, TRUE },      /* Jamo L */
2135        { 0x0308, UCHAR_NFD_INERT, FALSE },
2136
2137        { 0x1164, UCHAR_NFKD_INERT, TRUE },     /* Jamo V */
2138        { 0x1d79d, UCHAR_NFKD_INERT, FALSE },   /* math compat version of xi */
2139
2140        { 0x0021, UCHAR_NFC_INERT, TRUE },      /* ! */
2141        { 0x0061, UCHAR_NFC_INERT, FALSE },     /* a */
2142        { 0x00e4, UCHAR_NFC_INERT, FALSE },     /* a-umlaut */
2143        { 0x0102, UCHAR_NFC_INERT, FALSE },     /* a-breve */
2144        { 0xac1c, UCHAR_NFC_INERT, FALSE },     /* Hangul LV */
2145        { 0xac1d, UCHAR_NFC_INERT, TRUE },      /* Hangul LVT */
2146
2147        { 0x1d79d, UCHAR_NFKC_INERT, FALSE },   /* math compat version of xi */
2148        { 0x2a6d6, UCHAR_NFKC_INERT, TRUE },    /* Han, last of CJK ext. B */
2149
2150        { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2151        { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2152        { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2153        { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2154        { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2155        { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2156#endif
2157
2158        { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2159        { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2160        { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2161
2162        { 0x30fb, UCHAR_HYPHEN, TRUE },
2163        { 0xfe58, UCHAR_HYPHEN, FALSE },
2164
2165        { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2166        { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2167        { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2168
2169        { 0x2172, UCHAR_ID_START, TRUE },
2170        { 0x007a, UCHAR_ID_START, TRUE },
2171        { 0x0039, UCHAR_ID_START, FALSE },
2172
2173        { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2174        { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2175        { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2176
2177        { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2178        { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2179
2180        { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2181        { 0x0345, UCHAR_LOWERCASE, TRUE },
2182        { 0x0030, UCHAR_LOWERCASE, FALSE },
2183
2184        { 0x1d7a9, UCHAR_MATH, TRUE },
2185        { 0x2135, UCHAR_MATH, TRUE },
2186        { 0x0062, UCHAR_MATH, FALSE },
2187
2188        { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2189        { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2190        { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2191
2192        { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2193        { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2194        { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2195
2196        { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2197        { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2198
2199        { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2200        { 0x2162, UCHAR_UPPERCASE, TRUE },
2201        { 0x0345, UCHAR_UPPERCASE, FALSE },
2202
2203        { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2204        { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2205        { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2206
2207        { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2208        { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2209        { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2210
2211        { 0x16ee, UCHAR_XID_START, TRUE },
2212        { 0x23456, UCHAR_XID_START, TRUE },
2213        { 0x1d1aa, UCHAR_XID_START, FALSE },
2214
2215        /*
2216         * Version break:
2217         * The following properties are only supported starting with the
2218         * Unicode version indicated in the second field.
2219         */
2220        { -1, 0x320, 0 },
2221
2222        { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2223        { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2224        { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2225
2226        { 0x0149, UCHAR_DEPRECATED, TRUE },         /* changed in Unicode 5.2 */
2227        { 0x0341, UCHAR_DEPRECATED, FALSE },        /* changed in Unicode 5.2 */
2228        { 0xe0041, UCHAR_DEPRECATED, TRUE },        /* changed from Unicode 5 to 5.1 */
2229        { 0xe0100, UCHAR_DEPRECATED, FALSE },
2230
2231        { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2232        { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2233        { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2234        { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE },     /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2235
2236        { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2237        { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2238        { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE },    /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2239        { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2240
2241        { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2242        { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2243
2244        { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2245        { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2246
2247        { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2248        { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2249
2250        { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2251        { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2252
2253        { 0x2e9b, UCHAR_RADICAL, TRUE },
2254        { 0x4e00, UCHAR_RADICAL, FALSE },
2255
2256        { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2257        { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2258
2259        { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2260        { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2261
2262        { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2263
2264        { 0x002e, UCHAR_S_TERM, TRUE },
2265        { 0x0061, UCHAR_S_TERM, FALSE },
2266
2267        { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2268        { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2269        { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2270        { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2271
2272        /* enum/integer type properties */
2273
2274        /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2275        /* test default Bidi classes for unassigned code points */
2276        { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2277        { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2278        { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2279        { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2280        { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2281        { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2282        { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2283        { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2284        { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2285        { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2286        { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2287
2288        { 0x0605, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2289        { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2290        { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2291        { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2292        { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2293        { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2294        { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2295        { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2296
2297        { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2298        { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2299        { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2300        { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2301        { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2302        { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2303        { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2304        { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2305        { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2306        { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2307        { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2308
2309        /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2310        { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2311
2312        { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2313        { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2314        { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2315        { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2316        { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2317        { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2318        { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2319        { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2320        { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2321
2322        { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2323        { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2324        { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2325        { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2326        { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2327        { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2328        { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2329        { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2330        { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2331        { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2332        { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2333        { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2334        { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2335        { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2336        { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2337        { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2338        { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2339
2340        /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2341        { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2342        { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER },     /* changed in Unicode 5.2 */
2343
2344        { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2345        { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2346        { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2347        { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2348        { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2349
2350        { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2351        { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2352        { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2353        { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2354        { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2355        { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2356        { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2357        { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2358
2359        /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2360        { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2361        { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2362        { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2363        { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2364        { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2365        { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2366        { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2367        { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2368        { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2369        { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2370        { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2371        { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2372        { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2373        { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2374        { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2375
2376        /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2377
2378        /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2379
2380        { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2381        { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2382        { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2383        { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2384        { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2385        { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2386        { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2387
2388        { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2389        { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2390        { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2391        { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2392
2393        { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2394        { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2395        { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2396        { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2397        { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2398        { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2399
2400        { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2401        { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2402        { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2403        { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2404
2405        { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2406        { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2407        { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2408        { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2409        { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2410        { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2411        { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2412
2413        { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2414        { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2415        { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2416        { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2417
2418        { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2419        { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2420        { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2421        { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2422
2423        { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2424        { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2425        { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2426        { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2427        { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2428
2429        { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2430
2431        { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2432
2433        { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2434        { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2435        { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2436
2437        { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2438        { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2439        { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2440        { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2441        { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2442
2443        { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2444        { 0x2c8e,  UCHAR_BLOCK, UBLOCK_COPTIC },
2445        { 0xfe17,  UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2446
2447        { 0x1a00,  UCHAR_SCRIPT, USCRIPT_BUGINESE },
2448        { 0x2cea,  UCHAR_SCRIPT, USCRIPT_COPTIC },
2449        { 0xa82b,  UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2450        { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2451
2452        { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2453        { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2454        { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2455        { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2456        { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2457        { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2458
2459        { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2460        { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2461        { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2462        { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2463
2464        { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2465        { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2466        { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2467        { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2468
2469        { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2470        { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2471        { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2472        { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2473
2474        { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2475
2476        /* test some script codes >127 */
2477        { 0xa6e6,  UCHAR_SCRIPT, USCRIPT_BAMUM },
2478        { 0xa4d0,  UCHAR_SCRIPT, USCRIPT_LISU },
2479        { 0x10a7f,  UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2480
2481        { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2482
2483        /* value changed in Unicode 6.0 */
2484        { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2485
2486        /* undefined UProperty values */
2487        { 0x61, 0x4a7, 0 },
2488        { 0x234bc, 0x15ed, 0 }
2489    };
2490
2491    UVersionInfo version;
2492    UChar32 c;
2493    int32_t i, result, uVersion;
2494    UProperty which;
2495
2496    /* what is our Unicode version? */
2497    u_getUnicodeVersion(version);
2498    uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2499
2500    u_charAge(0x20, version);
2501    if(version[0]==0) {
2502        /* no additional properties available */
2503        log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2504        return;
2505    }
2506
2507    /* test u_charAge() */
2508    for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) {
2509        u_charAge(charAges[i].c, version);
2510        if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2511            log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2512                charAges[i].c,
2513                version[0], version[1], version[2], version[3],
2514                charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2515        }
2516    }
2517
2518    if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2519        u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2520        u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 ||   /* j2478 */
2521        u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2522        u_getIntPropertyMinValue(0x2345)!=0
2523    ) {
2524        log_err("error: u_getIntPropertyMinValue() wrong\n");
2525    }
2526    if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2527        log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2528    }
2529    if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2530        log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2531    }
2532    if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2533        log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2534    }
2535    if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2536        log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2537    }
2538    if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2539        log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2540    }
2541    if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2542        log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2543    }
2544    if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2545        log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2546    }
2547    if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2548        log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2549    }
2550    if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2551        log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2552    }
2553    if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2554        log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2555    }
2556    if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2557        log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2558    }
2559    if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2560        log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2561    }
2562    if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2563        log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2564    }
2565    /*JB#2410*/
2566    if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2567        log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2568    }
2569    if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2570        log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2571    }
2572    if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) !=  (int32_t) (U_JG_COUNT -1)) {
2573        log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2574    }
2575    if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2576        log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2577    }
2578    if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2579        log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2580    }
2581
2582    /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2583    for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) {
2584        const char *whichName;
2585
2586        if(props[i][0]<0) {
2587            /* Unicode version break */
2588            if(uVersion<props[i][1]) {
2589                break; /* do not test properties that are not yet supported */
2590            } else {
2591                continue; /* skip this row */
2592            }
2593        }
2594
2595        c=(UChar32)props[i][0];
2596        which=(UProperty)props[i][1];
2597        whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2598
2599        if(which<UCHAR_INT_START) {
2600            result=u_hasBinaryProperty(c, which);
2601            if(result!=props[i][2]) {
2602                log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2603                        c, whichName, result, i);
2604            }
2605        }
2606
2607        result=u_getIntPropertyValue(c, which);
2608        if(result!=props[i][2]) {
2609            log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2610                    c, whichName, result, props[i][2], i);
2611        }
2612
2613        /* test separate functions, too */
2614        switch((UProperty)props[i][1]) {
2615        case UCHAR_ALPHABETIC:
2616            if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2617                log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2618                        props[i][0], result, i);
2619            }
2620            break;
2621        case UCHAR_LOWERCASE:
2622            if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2623                log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2624                        props[i][0], result, i);
2625            }
2626            break;
2627        case UCHAR_UPPERCASE:
2628            if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2629                log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2630                        props[i][0], result, i);
2631            }
2632            break;
2633        case UCHAR_WHITE_SPACE:
2634            if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2635                log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2636                        props[i][0], result, i);
2637            }
2638            break;
2639        default:
2640            break;
2641        }
2642    }
2643}
2644
2645static void
2646TestNumericProperties(void) {
2647    /* see UnicodeData.txt, DerivedNumericValues.txt */
2648    static const struct {
2649        UChar32 c;
2650        int32_t type;
2651        double numValue;
2652    } values[]={
2653        { 0x0F33, U_NT_NUMERIC, -1./2. },
2654        { 0x0C66, U_NT_DECIMAL, 0 },
2655        { 0x96f6, U_NT_NUMERIC, 0 },
2656        { 0xa833, U_NT_NUMERIC, 1./16. },
2657        { 0x2152, U_NT_NUMERIC, 1./10. },
2658        { 0x2151, U_NT_NUMERIC, 1./9. },
2659        { 0x1245f, U_NT_NUMERIC, 1./8. },
2660        { 0x2150, U_NT_NUMERIC, 1./7. },
2661        { 0x2159, U_NT_NUMERIC, 1./6. },
2662        { 0x09f6, U_NT_NUMERIC, 3./16. },
2663        { 0x2155, U_NT_NUMERIC, 1./5. },
2664        { 0x00BD, U_NT_NUMERIC, 1./2. },
2665        { 0x0031, U_NT_DECIMAL, 1. },
2666        { 0x4e00, U_NT_NUMERIC, 1. },
2667        { 0x58f1, U_NT_NUMERIC, 1. },
2668        { 0x10320, U_NT_NUMERIC, 1. },
2669        { 0x0F2B, U_NT_NUMERIC, 3./2. },
2670        { 0x00B2, U_NT_DIGIT, 2. },
2671        { 0x5f10, U_NT_NUMERIC, 2. },
2672        { 0x1813, U_NT_DECIMAL, 3. },
2673        { 0x5f0e, U_NT_NUMERIC, 3. },
2674        { 0x2173, U_NT_NUMERIC, 4. },
2675        { 0x8086, U_NT_NUMERIC, 4. },
2676        { 0x278E, U_NT_DIGIT, 5. },
2677        { 0x1D7F2, U_NT_DECIMAL, 6. },
2678        { 0x247A, U_NT_DIGIT, 7. },
2679        { 0x7396, U_NT_NUMERIC, 9. },
2680        { 0x1372, U_NT_NUMERIC, 10. },
2681        { 0x216B, U_NT_NUMERIC, 12. },
2682        { 0x16EE, U_NT_NUMERIC, 17. },
2683        { 0x249A, U_NT_NUMERIC, 19. },
2684        { 0x303A, U_NT_NUMERIC, 30. },
2685        { 0x5345, U_NT_NUMERIC, 30. },
2686        { 0x32B2, U_NT_NUMERIC, 37. },
2687        { 0x1375, U_NT_NUMERIC, 40. },
2688        { 0x10323, U_NT_NUMERIC, 50. },
2689        { 0x0BF1, U_NT_NUMERIC, 100. },
2690        { 0x964c, U_NT_NUMERIC, 100. },
2691        { 0x217E, U_NT_NUMERIC, 500. },
2692        { 0x2180, U_NT_NUMERIC, 1000. },
2693        { 0x4edf, U_NT_NUMERIC, 1000. },
2694        { 0x2181, U_NT_NUMERIC, 5000. },
2695        { 0x137C, U_NT_NUMERIC, 10000. },
2696        { 0x4e07, U_NT_NUMERIC, 10000. },
2697        { 0x4ebf, U_NT_NUMERIC, 100000000. },
2698        { 0x5146, U_NT_NUMERIC, 1000000000000. },
2699        { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2700        { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2701        { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2702        { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2703        { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2704        { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2705        { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2706        { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2707    };
2708
2709    double nv;
2710    UChar32 c;
2711    int32_t i, type;
2712
2713    for(i=0; i<LENGTHOF(values); ++i) {
2714        c=values[i].c;
2715        type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2716        nv=u_getNumericValue(c);
2717
2718        if(type!=values[i].type) {
2719            log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2720        }
2721        if(0.000001 <= fabs(nv - values[i].numValue)) {
2722            log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2723        }
2724    }
2725}
2726
2727/**
2728 * Test the property names and property value names API.
2729 */
2730static void
2731TestPropertyNames(void) {
2732    int32_t p, v, choice=0, rev;
2733    UBool atLeastSomething = FALSE;
2734
2735    for (p=0; ; ++p) {
2736        UProperty propEnum = (UProperty)p;
2737        UBool sawProp = FALSE;
2738        if(p > 10 && !atLeastSomething) {
2739          log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2740          return;
2741        }
2742
2743        for (choice=0; ; ++choice) {
2744            const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
2745            if (name) {
2746                if (!sawProp)
2747                    log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2748                log_verbose("%d=\"%s\"", choice, name);
2749                sawProp = TRUE;
2750                atLeastSomething = TRUE;
2751
2752                /* test reverse mapping */
2753                rev = u_getPropertyEnum(name);
2754                if (rev != p) {
2755                    log_err("Property round-trip failure: %d -> %s -> %d\n",
2756                            p, name, rev);
2757                }
2758            }
2759            if (!name && choice>0) break;
2760        }
2761        if (sawProp) {
2762            /* looks like a valid property; check the values */
2763            const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2764            int32_t max = 0;
2765            if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2766                max = 255;
2767            } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2768                /* it's far too slow to iterate all the way up to
2769                   the real max, U_GC_P_MASK */
2770                max = U_GC_NL_MASK;
2771            } else if (p == UCHAR_BLOCK) {
2772                /* UBlockCodes, unlike other values, start at 1 */
2773                max = 1;
2774            }
2775            log_verbose("\n");
2776            for (v=-1; ; ++v) {
2777                UBool sawValue = FALSE;
2778                for (choice=0; ; ++choice) {
2779                    const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
2780                    if (vname) {
2781                        if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2782                        log_verbose("%d=\"%s\"", choice, vname);
2783                        sawValue = TRUE;
2784
2785                        /* test reverse mapping */
2786                        rev = u_getPropertyValueEnum(propEnum, vname);
2787                        if (rev != v) {
2788                            log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2789                                    pname, v, vname, rev);
2790                        }
2791                    }
2792                    if (!vname && choice>0) break;
2793                }
2794                if (sawValue) {
2795                    log_verbose("\n");
2796                }
2797                if (!sawValue && v>=max) break;
2798            }
2799        }
2800        if (!sawProp) {
2801            if (p>=UCHAR_STRING_LIMIT) {
2802                break;
2803            } else if (p>=UCHAR_DOUBLE_LIMIT) {
2804                p = UCHAR_STRING_START - 1;
2805            } else if (p>=UCHAR_MASK_LIMIT) {
2806                p = UCHAR_DOUBLE_START - 1;
2807            } else if (p>=UCHAR_INT_LIMIT) {
2808                p = UCHAR_MASK_START - 1;
2809            } else if (p>=UCHAR_BINARY_LIMIT) {
2810                p = UCHAR_INT_START - 1;
2811            }
2812        }
2813    }
2814}
2815
2816/**
2817 * Test the property values API.  See JB#2410.
2818 */
2819static void
2820TestPropertyValues(void) {
2821    int32_t i, p, min, max;
2822    UErrorCode ec;
2823
2824    /* Min should be 0 for everything. */
2825    /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
2826    for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
2827        UProperty propEnum = (UProperty)p;
2828        min = u_getIntPropertyMinValue(propEnum);
2829        if (min != 0) {
2830            if (p == UCHAR_BLOCK) {
2831                /* This is okay...for now.  See JB#2487.
2832                   TODO Update this for JB#2487. */
2833            } else {
2834                const char* name;
2835                name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2836                if (name == NULL)
2837                    name = "<ERROR>";
2838                log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
2839                        name, min);
2840            }
2841        }
2842    }
2843
2844    if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
2845        u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
2846        log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
2847    }
2848
2849    /* Max should be -1 for invalid properties. */
2850    max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
2851    if (max != -1) {
2852        log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
2853                max);
2854    }
2855
2856    /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
2857    for (i=0; i<2; ++i) {
2858        int32_t script;
2859        const char* desc;
2860        ec = U_ZERO_ERROR;
2861        switch (i) {
2862        case 0:
2863            script = uscript_getScript(-1, &ec);
2864            desc = "uscript_getScript(-1)";
2865            break;
2866        case 1:
2867            script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
2868            desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
2869            break;
2870        default:
2871            log_err("Internal test error. Too many scripts\n");
2872            return;
2873        }
2874        /* We don't explicitly test ec.  It should be U_FAILURE but it
2875           isn't documented as such. */
2876        if (script != (int32_t)USCRIPT_INVALID_CODE) {
2877            log_err("FAIL: %s = %d, exp. 0\n",
2878                    desc, script);
2879        }
2880    }
2881}
2882
2883/* various tests for consistency of UCD data and API behavior */
2884static void
2885TestConsistency() {
2886    char buffer[300];
2887    USet *set1, *set2, *set3, *set4;
2888    UErrorCode errorCode;
2889
2890    UChar32 start, end;
2891    int32_t i, length;
2892
2893    U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
2894    U_STRING_DECL(dashPattern, "[:Dash:]", 8);
2895    U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
2896    U_STRING_DECL(formatPattern, "[:Cf:]", 6);
2897    U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
2898
2899    U_STRING_DECL(mathBlocksPattern,
2900        "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
2901        1+32+46+46+45+43+1+1); /* +1 for NUL */
2902    U_STRING_DECL(mathPattern, "[:Math:]", 8);
2903    U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
2904    U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
2905    U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
2906
2907    U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
2908    U_STRING_INIT(dashPattern, "[:Dash:]", 8);
2909    U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
2910    U_STRING_INIT(formatPattern, "[:Cf:]", 6);
2911    U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
2912
2913    U_STRING_INIT(mathBlocksPattern,
2914        "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
2915        1+32+46+46+45+43+1+1); /* +1 for NUL */
2916    U_STRING_INIT(mathPattern, "[:Math:]", 8);
2917    U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
2918    U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
2919    U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
2920
2921    /*
2922     * It used to be that UCD.html and its precursors said
2923     * "Those dashes used to mark connections between pieces of words,
2924     *  plus the Katakana middle dot."
2925     *
2926     * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
2927     * but not from Hyphen.
2928     * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
2929     * Therefore, do not show errors when testing the Hyphen property.
2930     */
2931    log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
2932                "known to the UTC and not considered errors.\n");
2933
2934    errorCode=U_ZERO_ERROR;
2935    set1=uset_openPattern(hyphenPattern, 10, &errorCode);
2936    set2=uset_openPattern(dashPattern, 8, &errorCode);
2937    if(U_SUCCESS(errorCode)) {
2938        /* remove the Katakana middle dot(s) from set1 */
2939        uset_remove(set1, 0x30fb);
2940        uset_remove(set1, 0xff65); /* halfwidth variant */
2941        showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
2942    } else {
2943        log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
2944    }
2945
2946    /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
2947    set3=uset_openPattern(formatPattern, 6, &errorCode);
2948    set4=uset_openPattern(alphaPattern, 14, &errorCode);
2949    if(U_SUCCESS(errorCode)) {
2950        showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
2951        showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
2952        showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
2953    } else {
2954        log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
2955    }
2956
2957    uset_close(set1);
2958    uset_close(set2);
2959    uset_close(set3);
2960    uset_close(set4);
2961
2962    /*
2963     * Check that each lowercase character has "small" in its name
2964     * and not "capital".
2965     * There are some such characters, some of which seem odd.
2966     * Use the verbose flag to see these notices.
2967     */
2968    errorCode=U_ZERO_ERROR;
2969    set1=uset_openPattern(lowerPattern, 13, &errorCode);
2970    if(U_SUCCESS(errorCode)) {
2971        for(i=0;; ++i) {
2972            length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
2973            if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
2974                break; /* done */
2975            }
2976            if(U_FAILURE(errorCode)) {
2977                log_err("error iterating over [:Lowercase:] at item %d: %s\n",
2978                        i, u_errorName(errorCode));
2979                break;
2980            }
2981            if(length!=0) {
2982                break; /* done with code points, got a string or -1 */
2983            }
2984
2985            while(start<=end) {
2986                length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
2987                if(U_FAILURE(errorCode)) {
2988                    log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
2989                    errorCode=U_ZERO_ERROR;
2990                }
2991                if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
2992                    strstr(buffer, "SMALL CAPITAL")==NULL
2993                ) {
2994                    log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
2995                }
2996                ++start;
2997            }
2998        }
2999    } else {
3000        log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3001    }
3002    uset_close(set1);
3003
3004    /* verify that all assigned characters in Math blocks are exactly Math characters */
3005    errorCode=U_ZERO_ERROR;
3006    set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3007    set2=uset_openPattern(mathPattern, 8, &errorCode);
3008    set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3009    if(U_SUCCESS(errorCode)) {
3010        uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3011        uset_complement(set3);      /* assigned characters */
3012        uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3013        compareUSets(set1, set2,
3014                     "[assigned Math block chars]", "[math blocks]&[:Math:]",
3015                     TRUE);
3016    } else {
3017        log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3018    }
3019    uset_close(set1);
3020    uset_close(set2);
3021    uset_close(set3);
3022
3023    /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3024    errorCode=U_ZERO_ERROR;
3025    set1=uset_openPattern(unknownPattern, 14, &errorCode);
3026    set2=uset_openPattern(reservedPattern, 20, &errorCode);
3027    if(U_SUCCESS(errorCode)) {
3028        compareUSets(set1, set2,
3029                     "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3030                     TRUE);
3031    } else {
3032        log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3033    }
3034    uset_close(set1);
3035    uset_close(set2);
3036}
3037
3038/*
3039 * Starting with ICU4C 3.4, the core Unicode properties files
3040 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
3041 * are hardcoded in the common DLL and therefore not included
3042 * in the data package any more.
 * Tests requiring these files are disabled so that
3044 * we need not jump through hoops (like adding snapshots of these files
3045 * to testdata).
3046 * See Jitterbug 4497.
3047 */
3048#define HARDCODED_DATA_4497 1
3049
3050/* API coverage for ucase.c */
3051static void TestUCase() {
3052#if !HARDCODED_DATA_4497
3053    UDataMemory *pData;
3054    UCaseProps *csp;
3055    const UCaseProps *ccsp;
3056    UErrorCode errorCode;
3057
3058    /* coverage for ucase_openBinary() */
3059    errorCode=U_ZERO_ERROR;
3060    pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode);
3061    if(U_FAILURE(errorCode)) {
3062        log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3063                    u_errorName(errorCode));
3064        return;
3065    }
3066
3067    csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3068    if(U_FAILURE(errorCode)) {
3069        log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3070                u_errorName(errorCode));
3071        udata_close(pData);
3072        return;
3073    }
3074
3075    if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */
3076        log_err("ucase_openBinary() does not seem to return working UCaseProps\n");
3077    }
3078
3079    ucase_close(csp);
3080    udata_close(pData);
3081
3082    /* coverage for ucase_getDummy() */
3083    errorCode=U_ZERO_ERROR;
3084    ccsp=ucase_getDummy(&errorCode);
3085    if(ucase_tolower(ccsp, 0x41)!=0x41) {
3086        log_err("ucase_tolower(dummy, A)!=A\n");
3087    }
3088#endif
3089}
3090
3091/* API coverage for ubidi_props.c */
3092static void TestUBiDiProps() {
3093#if !HARDCODED_DATA_4497
3094    UDataMemory *pData;
3095    UBiDiProps *bdp;
3096    const UBiDiProps *cbdp;
3097    UErrorCode errorCode;
3098
3099    /* coverage for ubidi_openBinary() */
3100    errorCode=U_ZERO_ERROR;
3101    pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
3102    if(U_FAILURE(errorCode)) {
3103        log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3104                    u_errorName(errorCode));
3105        return;
3106    }
3107
3108    bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3109    if(U_FAILURE(errorCode)) {
3110        log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3111                u_errorName(errorCode));
3112        udata_close(pData);
3113        return;
3114    }
3115
3116    if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
3117        log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
3118    }
3119
3120    ubidi_closeProps(bdp);
3121    udata_close(pData);
3122
3123    /* coverage for ubidi_getDummy() */
3124    errorCode=U_ZERO_ERROR;
3125    cbdp=ubidi_getDummy(&errorCode);
3126    if(ubidi_getClass(cbdp, 0x20)!=0) {
3127        log_err("ubidi_getClass(dummy, space)!=0\n");
3128    }
3129#endif
3130}
3131
3132/* test case folding, compare return values with CaseFolding.txt ------------ */
3133
/*
 * Bit flags, one per kind of case folding, used to record which foldings
 * of a character have been tested already. CF_ALL combines all of them.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3141
3142static void
3143testFold(UChar32 c, int which,
3144         UChar32 simple, UChar32 turkic,
3145         const UChar *full, int32_t fullLength,
3146         const UChar *turkicFull, int32_t turkicFullLength) {
3147    UChar s[2], t[32];
3148    UChar32 c2;
3149    int32_t length, length2;
3150
3151    UErrorCode errorCode=U_ZERO_ERROR;
3152
3153    length=0;
3154    U16_APPEND_UNSAFE(s, length, c);
3155
3156    if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3157        log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3158    }
3159    if((which&CF_FULL)!=0) {
3160        length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode);
3161        if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3162            log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3163        }
3164    }
3165    if((which&CF_TURKIC)!=0) {
3166        if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3167            log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3168        }
3169
3170        length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3171        if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3172            log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3173        }
3174    }
3175}
3176
3177/* test that c case-folds to itself */
3178static void
3179testFoldToSelf(UChar32 c, int which) {
3180    UChar s[2];
3181    int32_t length;
3182
3183    length=0;
3184    U16_APPEND_UNSAFE(s, length, c);
3185    testFold(c, which, c, c, s, length, s, length);
3186}
3187
3188struct CaseFoldingData {
3189    USet *notSeen;
3190    UChar32 prev, prevSimple;
3191    UChar prevFull[32];
3192    int32_t prevFullLength;
3193    int which;
3194};
3195typedef struct CaseFoldingData CaseFoldingData;
3196
3197static void U_CALLCONV
3198caseFoldingLineFn(void *context,
3199                  char *fields[][2], int32_t fieldCount,
3200                  UErrorCode *pErrorCode) {
3201    CaseFoldingData *pData=(CaseFoldingData *)context;
3202    char *end;
3203    UChar full[32];
3204    UChar32 c, prev, simple;
3205    int32_t count;
3206    int which;
3207    char status;
3208
3209    /* get code point */
3210    c=(UChar32)strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
3211    end=(char *)u_skipWhitespace(end);
3212    if(end<=fields[0][0] || end!=fields[0][1]) {
3213        log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3214        *pErrorCode=U_PARSE_ERROR;
3215        return;
3216    }
3217
3218    /* get the status of this mapping */
3219    status=*u_skipWhitespace(fields[1][0]);
3220    if(status!='C' && status!='S' && status!='F' && status!='T') {
3221        log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3222        *pErrorCode=U_PARSE_ERROR;
3223        return;
3224    }
3225
3226    /* get the mapping */
3227    count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3228    if(U_FAILURE(*pErrorCode)) {
3229        log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3230        return;
3231    }
3232
3233    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3234    if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3235        simple=c;
3236    }
3237
3238    if(c!=(prev=pData->prev)) {
3239        /*
3240         * Test remaining mappings for the previous code point.
3241         * If a turkic folding was not mentioned, then it should fold the same
3242         * as the regular simple case folding.
3243         */
3244        UChar s[2];
3245        int32_t length;
3246
3247        length=0;
3248        U16_APPEND_UNSAFE(s, length, prev);
3249        testFold(prev, (~pData->which)&CF_ALL,
3250                 prev, pData->prevSimple,
3251                 s, length,
3252                 pData->prevFull, pData->prevFullLength);
3253        pData->prev=pData->prevSimple=c;
3254        length=0;
3255        U16_APPEND_UNSAFE(pData->prevFull, length, c);
3256        pData->prevFullLength=length;
3257        pData->which=0;
3258    }
3259
3260    /*
3261     * Turn the status into a bit set of case foldings to test.
3262     * Remember non-Turkic case foldings as defaults for Turkic mode.
3263     */
3264    switch(status) {
3265    case 'C':
3266        which=CF_SIMPLE|CF_FULL;
3267        pData->prevSimple=simple;
3268        u_memcpy(pData->prevFull, full, count);
3269        pData->prevFullLength=count;
3270        break;
3271    case 'S':
3272        which=CF_SIMPLE;
3273        pData->prevSimple=simple;
3274        break;
3275    case 'F':
3276        which=CF_FULL;
3277        u_memcpy(pData->prevFull, full, count);
3278        pData->prevFullLength=count;
3279        break;
3280    case 'T':
3281        which=CF_TURKIC;
3282        break;
3283    default:
3284        which=0;
3285        break; /* won't happen because of test above */
3286    }
3287
3288    testFold(c, which, simple, simple, full, count, full, count);
3289
3290    /* remember which case foldings of c have been tested */
3291    pData->which|=which;
3292
3293    /* remove c from the set of ones not mentioned in CaseFolding.txt */
3294    uset_remove(pData->notSeen, c);
3295}
3296
3297static void
3298TestCaseFolding() {
3299    CaseFoldingData data={ NULL };
3300    char *fields[3][2];
3301    UErrorCode errorCode;
3302
3303    static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3304
3305    errorCode=U_ZERO_ERROR;
3306    /* test BMP & plane 1 - nothing interesting above */
3307    data.notSeen=uset_open(0, 0x1ffff);
3308    data.prevFullLength=1; /* length of full case folding of U+0000 */
3309
3310    parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3311    if(U_SUCCESS(errorCode)) {
3312        int32_t i, start, end;
3313
3314        /* add a pseudo-last line to finish testing of the actual last one */
3315        fields[0][0]=lastLine;
3316        fields[0][1]=lastLine+6;
3317        fields[1][0]=lastLine+7;
3318        fields[1][1]=lastLine+9;
3319        fields[2][0]=lastLine+10;
3320        fields[2][1]=lastLine+17;
3321        caseFoldingLineFn(&data, fields, 3, &errorCode);
3322
3323        /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3324        for(i=0;
3325            0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3326                U_SUCCESS(errorCode);
3327            ++i
3328        ) {
3329            do {
3330                testFoldToSelf(start, CF_ALL);
3331            } while(++start<=end);
3332        }
3333    }
3334
3335    uset_close(data.notSeen);
3336}
3337