tstnorm.cpp revision 50294ead5e5d23f5bbfed76e00e6b510bd41eee1
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1997-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7#include "unicode/utypes.h"
8
9#if !UCONFIG_NO_NORMALIZATION
10
11#include "unicode/uchar.h"
12#include "unicode/errorcode.h"
13#include "unicode/normlzr.h"
14#include "unicode/uniset.h"
15#include "unicode/usetiter.h"
16#include "unicode/schriter.h"
17#include "cstring.h"
18#include "unormimp.h"
19#include "tstnorm.h"
20
// Element count of a C array, as a signed 32-bit value (matches ICU's int32_t lengths).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof(*(array))))
// Alternative spelling of LENGTHOF; both names are used by the tests in this file.
#define ARRAY_LENGTH(array) LENGTHOF(array)
23
24#define CASE(id,test) case id:                          \
25                          name = #test;                 \
26                          if (exec) {                   \
27                              logln(#test "---");       \
28                              logln((UnicodeString)""); \
29                              test();                   \
30                          }                             \
31                          break
32
33static UErrorCode status = U_ZERO_ERROR;
34
35void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
36                                         const char* &name, char* /*par*/) {
37    switch (index) {
38        CASE(0,TestDecomp);
39        CASE(1,TestCompatDecomp);
40        CASE(2,TestCanonCompose);
41        CASE(3,TestCompatCompose);
42        CASE(4,TestPrevious);
43        CASE(5,TestHangulDecomp);
44        CASE(6,TestHangulCompose);
45        CASE(7,TestTibetan);
46        CASE(8,TestCompositionExclusion);
47        CASE(9,TestZeroIndex);
48        CASE(10,TestVerisign);
49        CASE(11,TestPreviousNext);
50        CASE(12,TestNormalizerAPI);
51        CASE(13,TestConcatenate);
52        CASE(14,FindFoldFCDExceptions);
53        CASE(15,TestCompare);
54        CASE(16,TestSkippable);
55        CASE(17,TestCustomComp);
56        CASE(18,TestCustomFCC);
57        default: name = ""; break;
58    }
59}
60
61/**
62 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
63 */
64static UnicodeString str(const char *input)
65{
66    UnicodeString str(input, ""); // Invariant conversion
67    return str.unescape();
68}
69
70
71BasicNormalizerTest::BasicNormalizerTest()
72{
73  // canonTest
74  // Input                    Decomposed                    Composed
75
76    canonTests[0][0] = str("cat");  canonTests[0][1] = str("cat"); canonTests[0][2] =  str("cat");
77
78    canonTests[1][0] = str("\\u00e0ardvark");    canonTests[1][1] = str("a\\u0300ardvark");  canonTests[1][2] = str("\\u00e0ardvark");
79
80    canonTests[2][0] = str("\\u1e0a"); canonTests[2][1] = str("D\\u0307"); canonTests[2][2] = str("\\u1e0a");                 // D-dot_above
81
82    canonTests[3][0] = str("D\\u0307");  canonTests[3][1] = str("D\\u0307"); canonTests[3][2] = str("\\u1e0a");            // D dot_above
83
84    canonTests[4][0] = str("\\u1e0c\\u0307"); canonTests[4][1] = str("D\\u0323\\u0307");  canonTests[4][2] = str("\\u1e0c\\u0307");         // D-dot_below dot_above
85
86    canonTests[5][0] = str("\\u1e0a\\u0323"); canonTests[5][1] = str("D\\u0323\\u0307");  canonTests[5][2] = str("\\u1e0c\\u0307");        // D-dot_above dot_below
87
88    canonTests[6][0] = str("D\\u0307\\u0323"); canonTests[6][1] = str("D\\u0323\\u0307");  canonTests[6][2] = str("\\u1e0c\\u0307");         // D dot_below dot_above
89
90    canonTests[7][0] = str("\\u1e10\\u0307\\u0323");  canonTests[7][1] = str("D\\u0327\\u0323\\u0307"); canonTests[7][2] = str("\\u1e10\\u0323\\u0307");     // D dot_below cedilla dot_above
91
92    canonTests[8][0] = str("D\\u0307\\u0328\\u0323"); canonTests[8][1] = str("D\\u0328\\u0323\\u0307"); canonTests[8][2] = str("\\u1e0c\\u0328\\u0307");     // D dot_above ogonek dot_below
93
94    canonTests[9][0] = str("\\u1E14"); canonTests[9][1] = str("E\\u0304\\u0300"); canonTests[9][2] = str("\\u1E14");         // E-macron-grave
95
96    canonTests[10][0] = str("\\u0112\\u0300"); canonTests[10][1] = str("E\\u0304\\u0300");  canonTests[10][2] = str("\\u1E14");            // E-macron + grave
97
98    canonTests[11][0] = str("\\u00c8\\u0304"); canonTests[11][1] = str("E\\u0300\\u0304");  canonTests[11][2] = str("\\u00c8\\u0304");         // E-grave + macron
99
100    canonTests[12][0] = str("\\u212b"); canonTests[12][1] = str("A\\u030a"); canonTests[12][2] = str("\\u00c5");             // angstrom_sign
101
102    canonTests[13][0] = str("\\u00c5");      canonTests[13][1] = str("A\\u030a");  canonTests[13][2] = str("\\u00c5");            // A-ring
103
104    canonTests[14][0] = str("\\u00C4ffin");  canonTests[14][1] = str("A\\u0308ffin");  canonTests[14][2] = str("\\u00C4ffin");
105
106    canonTests[15][0] = str("\\u00C4\\uFB03n"); canonTests[15][1] = str("A\\u0308\\uFB03n"); canonTests[15][2] = str("\\u00C4\\uFB03n");
107
108    canonTests[16][0] = str("Henry IV"); canonTests[16][1] = str("Henry IV"); canonTests[16][2] = str("Henry IV");
109
110    canonTests[17][0] = str("Henry \\u2163");  canonTests[17][1] = str("Henry \\u2163");  canonTests[17][2] = str("Henry \\u2163");
111
112    canonTests[18][0] = str("\\u30AC");  canonTests[18][1] = str("\\u30AB\\u3099");  canonTests[18][2] = str("\\u30AC");              // ga (Katakana)
113
114    canonTests[19][0] = str("\\u30AB\\u3099"); canonTests[19][1] = str("\\u30AB\\u3099");  canonTests[19][2] = str("\\u30AC");            // ka + ten
115
116    canonTests[20][0] = str("\\uFF76\\uFF9E"); canonTests[20][1] = str("\\uFF76\\uFF9E");  canonTests[20][2] = str("\\uFF76\\uFF9E");       // hw_ka + hw_ten
117
118    canonTests[21][0] = str("\\u30AB\\uFF9E"); canonTests[21][1] = str("\\u30AB\\uFF9E");  canonTests[21][2] = str("\\u30AB\\uFF9E");         // ka + hw_ten
119
120    canonTests[22][0] = str("\\uFF76\\u3099"); canonTests[22][1] = str("\\uFF76\\u3099");  canonTests[22][2] = str("\\uFF76\\u3099");         // hw_ka + ten
121
122    canonTests[23][0] = str("A\\u0300\\u0316"); canonTests[23][1] = str("A\\u0316\\u0300");  canonTests[23][2] = str("\\u00C0\\u0316");
123
124    /* compatTest */
125  // Input                        Decomposed                        Composed
126  compatTests[0][0] = str("cat"); compatTests[0][1] = str("cat"); compatTests[0][2] = str("cat") ;
127
128  compatTests[1][0] = str("\\uFB4f");  compatTests[1][1] = str("\\u05D0\\u05DC"); compatTests[1][2] = str("\\u05D0\\u05DC");  // Alef-Lamed vs. Alef, Lamed
129
130  compatTests[2][0] = str("\\u00C4ffin"); compatTests[2][1] = str("A\\u0308ffin"); compatTests[2][2] = str("\\u00C4ffin") ;
131
132  compatTests[3][0] = str("\\u00C4\\uFB03n"); compatTests[3][1] = str("A\\u0308ffin"); compatTests[3][2] = str("\\u00C4ffin") ; // ffi ligature -> f + f + i
133
134  compatTests[4][0] = str("Henry IV"); compatTests[4][1] = str("Henry IV"); compatTests[4][2] = str("Henry IV") ;
135
136  compatTests[5][0] = str("Henry \\u2163"); compatTests[5][1] = str("Henry IV");  compatTests[5][2] = str("Henry IV") ;
137
138  compatTests[6][0] = str("\\u30AC"); compatTests[6][1] = str("\\u30AB\\u3099"); compatTests[6][2] = str("\\u30AC") ; // ga (Katakana)
139
140  compatTests[7][0] = str("\\u30AB\\u3099"); compatTests[7][1] = str("\\u30AB\\u3099"); compatTests[7][2] = str("\\u30AC") ; // ka + ten
141
142  compatTests[8][0] = str("\\uFF76\\u3099"); compatTests[8][1] = str("\\u30AB\\u3099"); compatTests[8][2] = str("\\u30AC") ; // hw_ka + ten
143
144  /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later */
145  compatTests[9][0] = str("\\uFF76\\uFF9E"); compatTests[9][1] = str("\\u30AB\\u3099"); compatTests[9][2] = str("\\u30AC") ; // hw_ka + hw_ten
146
147  compatTests[10][0] = str("\\u30AB\\uFF9E"); compatTests[10][1] = str("\\u30AB\\u3099"); compatTests[10][2] = str("\\u30AC") ; // ka + hw_ten
148
149  /* Hangul Canonical */
150  // Input                        Decomposed                        Composed
151  hangulCanon[0][0] = str("\\ud4db"); hangulCanon[0][1] = str("\\u1111\\u1171\\u11b6"); hangulCanon[0][2] = str("\\ud4db") ;
152
153  hangulCanon[1][0] = str("\\u1111\\u1171\\u11b6"), hangulCanon[1][1] = str("\\u1111\\u1171\\u11b6"),   hangulCanon[1][2] = str("\\ud4db");
154}
155
156BasicNormalizerTest::~BasicNormalizerTest()
157{
158}
159
160void BasicNormalizerTest::TestPrevious()
161{
162  Normalizer* norm = new Normalizer("", UNORM_NFD);
163
164  logln("testing decomp...");
165  uint32_t i;
166  for (i = 0; i < ARRAY_LENGTH(canonTests); i++) {
167    backAndForth(norm, canonTests[i][0]);
168  }
169
170  logln("testing compose...");
171  norm->setMode(UNORM_NFC);
172  for (i = 0; i < ARRAY_LENGTH(canonTests); i++) {
173    backAndForth(norm, canonTests[i][0]);
174  }
175
176  delete norm;
177}
178
179void BasicNormalizerTest::TestDecomp()
180{
181  Normalizer* norm = new Normalizer("", UNORM_NFD);
182  iterateTest(norm, canonTests, ARRAY_LENGTH(canonTests), 1);
183  staticTest(UNORM_NFD, 0, canonTests, ARRAY_LENGTH(canonTests), 1);
184  delete norm;
185}
186
187void BasicNormalizerTest::TestCompatDecomp()
188{
189  Normalizer* norm = new Normalizer("", UNORM_NFKD);
190  iterateTest(norm, compatTests, ARRAY_LENGTH(compatTests), 1);
191
192  staticTest(UNORM_NFKD, 0,
193         compatTests, ARRAY_LENGTH(compatTests), 1);
194  delete norm;
195}
196
197void BasicNormalizerTest::TestCanonCompose()
198{
199  Normalizer* norm = new Normalizer("", UNORM_NFC);
200  iterateTest(norm, canonTests, ARRAY_LENGTH(canonTests), 2);
201
202  staticTest(UNORM_NFC, 0, canonTests,
203         ARRAY_LENGTH(canonTests), 2);
204  delete norm;
205}
206
207void BasicNormalizerTest::TestCompatCompose()
208{
209  Normalizer* norm = new Normalizer("", UNORM_NFKC);
210  iterateTest(norm, compatTests, ARRAY_LENGTH(compatTests), 2);
211
212  staticTest(UNORM_NFKC, 0,
213         compatTests, ARRAY_LENGTH(compatTests), 2);
214  delete norm;
215}
216
217
218//-------------------------------------------------------------------------------
219
220void BasicNormalizerTest::TestHangulCompose()
221{
222  // Make sure that the static composition methods work
223  logln("Canonical composition...");
224  staticTest(UNORM_NFC, 0,                    hangulCanon,  ARRAY_LENGTH(hangulCanon),  2);
225  logln("Compatibility composition...");
226
227  // Now try iterative composition....
228  logln("Static composition...");
229  Normalizer* norm = new Normalizer("", UNORM_NFC);
230  iterateTest(norm, hangulCanon, ARRAY_LENGTH(hangulCanon), 2);
231  norm->setMode(UNORM_NFKC);
232
233  // And finally, make sure you can do it in reverse too
234  logln("Reverse iteration...");
235  norm->setMode(UNORM_NFC);
236  for (uint32_t i = 0; i < ARRAY_LENGTH(hangulCanon); i++) {
237    backAndForth(norm, hangulCanon[i][0]);
238  }
239  delete norm;
240}
241
242void BasicNormalizerTest::TestHangulDecomp()
243{
244  // Make sure that the static decomposition methods work
245  logln("Canonical decomposition...");
246  staticTest(UNORM_NFD, 0,                     hangulCanon,  ARRAY_LENGTH(hangulCanon),  1);
247  logln("Compatibility decomposition...");
248
249  // Now the iterative decomposition methods...
250  logln("Iterative decomposition...");
251  Normalizer* norm = new Normalizer("", UNORM_NFD);
252  iterateTest(norm, hangulCanon, ARRAY_LENGTH(hangulCanon), 1);
253  norm->setMode(UNORM_NFKD);
254
255  // And finally, make sure you can do it in reverse too
256  logln("Reverse iteration...");
257  norm->setMode(UNORM_NFD);
258  for (uint32_t i = 0; i < ARRAY_LENGTH(hangulCanon); i++) {
259    backAndForth(norm, hangulCanon[i][0]);
260  }
261  delete norm;
262}
263
264/**
265 * The Tibetan vowel sign AA, 0f71, was messed up prior to Unicode version 2.1.9.
266 */
267void BasicNormalizerTest::TestTibetan(void) {
268    UnicodeString decomp[1][3];
269    decomp[0][0] = str("\\u0f77");
270    decomp[0][1] = str("\\u0f77");
271    decomp[0][2] = str("\\u0fb2\\u0f71\\u0f80");
272
273    UnicodeString compose[1][3];
274    compose[0][0] = str("\\u0fb2\\u0f71\\u0f80");
275    compose[0][1] = str("\\u0fb2\\u0f71\\u0f80");
276    compose[0][2] = str("\\u0fb2\\u0f71\\u0f80");
277
278    staticTest(UNORM_NFD,         0, decomp, ARRAY_LENGTH(decomp), 1);
279    staticTest(UNORM_NFKD,  0, decomp, ARRAY_LENGTH(decomp), 2);
280    staticTest(UNORM_NFC,        0, compose, ARRAY_LENGTH(compose), 1);
281    staticTest(UNORM_NFKC, 0, compose, ARRAY_LENGTH(compose), 2);
282}
283
284/**
285 * Make sure characters in the CompositionExclusion.txt list do not get
286 * composed to.
287 */
288void BasicNormalizerTest::TestCompositionExclusion(void) {
289    // This list is generated from CompositionExclusion.txt.
290    // Update whenever the normalizer tables are updated.  Note
291    // that we test all characters listed, even those that can be
292    // derived from the Unicode DB and are therefore commented
293    // out.
294    // ### TODO read composition exclusion from source/data/unidata file
295    // and test against that
296    UnicodeString EXCLUDED = str(
297        "\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958"
298        "\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC"
299        "\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E"
300        "\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69"
301        "\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2"
302        "\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79"
303        "\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u1FDB"
304        "\\u1FE3\\u1FEB\\u1FEE\\u1FEF\\u1FF9\\u1FFB\\u1FFD\\u2000"
305        "\\u2001\\u2126\\u212A\\u212B\\u2329\\u232A\\uF900\\uFA10"
306        "\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A\\uFB1F"
307        "\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E\\uFB2F\\uFB30\\uFB31"
308        "\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36\\uFB38\\uFB39\\uFB3A"
309        "\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
310        "\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B\\uFB4C\\uFB4D\\uFB4E"
311        );
312    for (int32_t i=0; i<EXCLUDED.length(); ++i) {
313        UnicodeString a(EXCLUDED.charAt(i));
314        UnicodeString b;
315        UnicodeString c;
316        Normalizer::normalize(a, UNORM_NFKD, 0, b, status);
317        Normalizer::normalize(b, UNORM_NFC, 0, c, status);
318        if (c == a) {
319            errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
320                  hex(b) + " x COMPOSE => " +
321                  hex(c));
322        } else if (verbose) {
323            logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
324                  hex(b) + " x COMPOSE => " +
325                  hex(c));
326        }
327    }
328}
329
330/**
331 * Test for a problem that showed up just before ICU 1.6 release
332 * having to do with combining characters with an index of zero.
333 * Such characters do not participate in any canonical
334 * decompositions.  However, having an index of zero means that
335 * they all share one typeMask[] entry, that is, they all have to
336 * map to the same canonical class, which is not the case, in
337 * reality.
338 */
339void BasicNormalizerTest::TestZeroIndex(void) {
340    const char* DATA[] = {
341        // Expect col1 x COMPOSE_COMPAT => col2
342        // Expect col2 x DECOMP => col3
343        "A\\u0316\\u0300", "\\u00C0\\u0316", "A\\u0316\\u0300",
344        "A\\u0300\\u0316", "\\u00C0\\u0316", "A\\u0316\\u0300",
345        "A\\u0327\\u0300", "\\u00C0\\u0327", "A\\u0327\\u0300",
346        "c\\u0321\\u0327", "c\\u0321\\u0327", "c\\u0321\\u0327",
347        "c\\u0327\\u0321", "\\u00E7\\u0321", "c\\u0327\\u0321",
348    };
349    int32_t DATA_length = (int32_t)(sizeof(DATA) / sizeof(DATA[0]));
350
351    for (int32_t i=0; i<DATA_length; i+=3) {
352        UErrorCode status = U_ZERO_ERROR;
353        UnicodeString a(DATA[i], "");
354        a = a.unescape();
355        UnicodeString b;
356        Normalizer::normalize(a, UNORM_NFKC, 0, b, status);
357        if (U_FAILURE(status)) {
358            dataerrln("Error calling normalize UNORM_NFKC: %s", u_errorName(status));
359        } else {
360            UnicodeString exp(DATA[i+1], "");
361            exp = exp.unescape();
362            if (b == exp) {
363                logln((UnicodeString)"Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b));
364            } else {
365                errln((UnicodeString)"FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) +
366                      ", expect " + hex(exp));
367            }
368        }
369        Normalizer::normalize(b, UNORM_NFD, 0, a, status);
370        if (U_FAILURE(status)) {
371            dataerrln("Error calling normalize UNORM_NFD: %s", u_errorName(status));
372        } else {
373            UnicodeString exp = UnicodeString(DATA[i+2], "").unescape();
374            if (a == exp) {
375                logln((UnicodeString)"Ok: " + hex(b) + " x DECOMP => " + hex(a));
376            } else {
377                errln((UnicodeString)"FAIL: " + hex(b) + " x DECOMP => " + hex(a) +
378                      ", expect " + hex(exp));
379            }
380        }
381    }
382}
383
384/**
385 * Run a few specific cases that are failing for Verisign.
386 */
387void BasicNormalizerTest::TestVerisign(void) {
388    /*
389      > Their input:
390      > 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F
391      > Their output (supposedly from ICU):
392      > 05B8 05B1 05B9 0591 05C3 05B0 05AC 059F
393      > My output from charlint:
394      > 05B1 05B8 05B9 0591 05C3 05B0 05AC 059F
395
396      05B8 05B9 05B1 0591 05C3 05B0 05AC 059F => 05B1 05B8 05B9 0591 05C3 05B0
397      05AC 059F
398
399      U+05B8  18  E HEBREW POINT QAMATS
400      U+05B9  19  F HEBREW POINT HOLAM
401      U+05B1  11 HEBREW POINT HATAF SEGOL
402      U+0591 220 HEBREW ACCENT ETNAHTA
403      U+05C3   0 HEBREW PUNCTUATION SOF PASUQ
404      U+05B0  10 HEBREW POINT SHEVA
405      U+05AC 230 HEBREW ACCENT ILUY
406      U+059F 230 HEBREW ACCENT QARNEY PARA
407
408      U+05B1  11 HEBREW POINT HATAF SEGOL
409      U+05B8  18 HEBREW POINT QAMATS
410      U+05B9  19 HEBREW POINT HOLAM
411      U+0591 220 HEBREW ACCENT ETNAHTA
412      U+05C3   0 HEBREW PUNCTUATION SOF PASUQ
413      U+05B0  10 HEBREW POINT SHEVA
414      U+05AC 230 HEBREW ACCENT ILUY
415      U+059F 230 HEBREW ACCENT QARNEY PARA
416
417      Wrong result:
418      U+05B8  18 HEBREW POINT QAMATS
419      U+05B1  11 HEBREW POINT HATAF SEGOL
420      U+05B9  19 HEBREW POINT HOLAM
421      U+0591 220 HEBREW ACCENT ETNAHTA
422      U+05C3   0 HEBREW PUNCTUATION SOF PASUQ
423      U+05B0  10 HEBREW POINT SHEVA
424      U+05AC 230 HEBREW ACCENT ILUY
425      U+059F 230 HEBREW ACCENT QARNEY PARA
426
427
428      > Their input:
429      >0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD
430      >Their output (supposedly from ICU):
431      >0592 05B0 05B7 05BC 05A5 05C0 05AD 05C4
432      >My output from charlint:
433      >05B0 05B7 05BC 05A5 0592 05C0 05AD 05C4
434
435      0592 05B7 05BC 05A5 05B0 05C0 05C4 05AD => 05B0 05B7 05BC 05A5 0592 05C0
436      05AD 05C4
437
438      U+0592 230 HEBREW ACCENT SEGOL
439      U+05B7  17 HEBREW POINT PATAH
440      U+05BC  21 HEBREW POINT DAGESH OR MAPIQ
441      U+05A5 220 HEBREW ACCENT MERKHA
442      U+05B0  10 HEBREW POINT SHEVA
443      U+05C0   0 HEBREW PUNCTUATION PASEQ
444      U+05C4 230 HEBREW MARK UPPER DOT
445      U+05AD 222 HEBREW ACCENT DEHI
446
447      U+05B0  10 HEBREW POINT SHEVA
448      U+05B7  17 HEBREW POINT PATAH
449      U+05BC  21 HEBREW POINT DAGESH OR MAPIQ
450      U+05A5 220 HEBREW ACCENT MERKHA
451      U+0592 230 HEBREW ACCENT SEGOL
452      U+05C0   0 HEBREW PUNCTUATION PASEQ
453      U+05AD 222 HEBREW ACCENT DEHI
454      U+05C4 230 HEBREW MARK UPPER DOT
455
456      Wrong result:
457      U+0592 230 HEBREW ACCENT SEGOL
458      U+05B0  10 HEBREW POINT SHEVA
459      U+05B7  17 HEBREW POINT PATAH
460      U+05BC  21 HEBREW POINT DAGESH OR MAPIQ
461      U+05A5 220 HEBREW ACCENT MERKHA
462      U+05C0   0 HEBREW PUNCTUATION PASEQ
463      U+05AD 222 HEBREW ACCENT DEHI
464      U+05C4 230 HEBREW MARK UPPER DOT
465    */
466    UnicodeString data[2][3];
467    data[0][0] = str("\\u05B8\\u05B9\\u05B1\\u0591\\u05C3\\u05B0\\u05AC\\u059F");
468    data[0][1] = str("\\u05B1\\u05B8\\u05B9\\u0591\\u05C3\\u05B0\\u05AC\\u059F");
469    data[0][2] = str("");
470    data[1][0] = str("\\u0592\\u05B7\\u05BC\\u05A5\\u05B0\\u05C0\\u05C4\\u05AD");
471    data[1][1] = str("\\u05B0\\u05B7\\u05BC\\u05A5\\u0592\\u05C0\\u05AD\\u05C4");
472    data[1][2] = str("");
473
474    staticTest(UNORM_NFD, 0, data, ARRAY_LENGTH(data), 1);
475    staticTest(UNORM_NFC, 0, data, ARRAY_LENGTH(data), 1);
476}
477
478//------------------------------------------------------------------------
479// Internal utilities
480//
481
482UnicodeString BasicNormalizerTest::hex(UChar ch) {
483    UnicodeString result;
484    return appendHex(ch, 4, result);
485}
486
487UnicodeString BasicNormalizerTest::hex(const UnicodeString& s) {
488    UnicodeString result;
489    for (int i = 0; i < s.length(); ++i) {
490        if (i != 0) result += (UChar)0x2c/*,*/;
491        appendHex(s[i], 4, result);
492    }
493    return result;
494}
495
496
497inline static void insert(UnicodeString& dest, int pos, UChar32 ch)
498{
499    dest.replace(pos, 0, ch);
500}
501
502void BasicNormalizerTest::backAndForth(Normalizer* iter, const UnicodeString& input)
503{
504    UChar32 ch;
505    iter->setText(input, status);
506
507    // Run through the iterator forwards and stick it into a StringBuffer
508    UnicodeString forward;
509    for (ch = iter->first(); ch != iter->DONE; ch = iter->next()) {
510        forward += ch;
511    }
512
513    // Now do it backwards
514    UnicodeString reverse;
515    for (ch = iter->last(); ch != iter->DONE; ch = iter->previous()) {
516        insert(reverse, 0, ch);
517    }
518
519    if (forward != reverse) {
520        errln("Forward/reverse mismatch for input " + hex(input)
521              + ", forward: " + hex(forward) + ", backward: " + hex(reverse));
522    }
523}
524
525void BasicNormalizerTest::staticTest(UNormalizationMode mode, int options,
526                     UnicodeString tests[][3], int length,
527                     int outCol)
528{
529    for (int i = 0; i < length; i++)
530    {
531        UnicodeString& input = tests[i][0];
532        UnicodeString& expect = tests[i][outCol];
533
534        logln("Normalizing '" + input + "' (" + hex(input) + ")" );
535
536        UnicodeString output;
537        Normalizer::normalize(input, mode, options, output, status);
538
539        if (output != expect) {
540            dataerrln(UnicodeString("ERROR: case ") + i + " normalized " + hex(input) + "\n"
541                + "                expected " + hex(expect) + "\n"
542                + "              static got " + hex(output) );
543        }
544    }
545}
546
547void BasicNormalizerTest::iterateTest(Normalizer* iter,
548                                      UnicodeString tests[][3], int length,
549                                      int outCol)
550{
551    for (int i = 0; i < length; i++)
552    {
553        UnicodeString& input = tests[i][0];
554        UnicodeString& expect = tests[i][outCol];
555
556        logln("Normalizing '" + input + "' (" + hex(input) + ")" );
557
558        iter->setText(input, status);
559        assertEqual(input, expect, iter, UnicodeString("ERROR: case ") + i + " ");
560    }
561}
562
563void BasicNormalizerTest::assertEqual(const UnicodeString&    input,
564                      const UnicodeString&    expected,
565                      Normalizer*        iter,
566                      const UnicodeString&    errPrefix)
567{
568    UnicodeString result;
569
570    for (UChar32 ch = iter->first(); ch != iter->DONE; ch = iter->next()) {
571        result += ch;
572    }
573    if (result != expected) {
574        dataerrln(errPrefix + "normalized " + hex(input) + "\n"
575            + "                expected " + hex(expected) + "\n"
576            + "             iterate got " + hex(result) );
577    }
578}
579
// helper class for TestPreviousNext()
// simple UTF-32 character iterator over a caller-owned array; it supplies
// the expected code points for comparison with a Normalizer
class UChar32Iterator {
public:
    // Value returned when moving or reading outside the text
    // (0xffff, as in the original code).
    enum { DONE = 0xffff };

    // text:  array of code points; not copied, must outlive the iterator
    // len:   number of code points in text (negative is treated as 0)
    // index: starting position; clamped to [0, len] so that previous()
    //        can never read past the end of the array
    UChar32Iterator(const UChar32 *text, int32_t len, int32_t index) :
        s(text), length(len < 0 ? 0 : len), i(index) {
        if (i < 0) {
            i = 0;
        } else if (i > length) {
            i = length;
        }
    }

    // Code point at the current position, or DONE at the end of the text.
    UChar32 current() const {
        return i < length ? s[i] : (UChar32)DONE;
    }

    // Returns the code point at the current position and advances past it;
    // DONE (and no movement) at the end of the text.
    UChar32 next() {
        return i < length ? s[i++] : (UChar32)DONE;
    }

    // Steps back one position and returns the code point there;
    // DONE (and no movement) at the start of the text.
    UChar32 previous() {
        return i > 0 ? s[--i] : (UChar32)DONE;
    }

    // Current position, in the range [0, length].
    int32_t getIndex() const {
        return i;
    }
private:
    const UChar32 *s;
    int32_t length, i;
};
618
619void
620BasicNormalizerTest::TestPreviousNext(const UChar *src, int32_t srcLength,
621                                      const UChar32 *expect, int32_t expectLength,
622                                      const int32_t *expectIndex, // its length=expectLength+1
623                                      int32_t srcMiddle, int32_t expectMiddle,
624                                      const char *moves,
625                                      UNormalizationMode mode,
626                                      const char *name) {
627    // iterators
628    Normalizer iter(src, srcLength, mode);
629
630    // test getStaticClassID and getDynamicClassID
631    if(iter.getDynamicClassID() != Normalizer::getStaticClassID()) {
632        errln("getStaticClassID != getDynamicClassID for Normalizer.");
633    }
634
635    UChar32Iterator iter32(expect, expectLength, expectMiddle);
636
637    UChar32 c1, c2;
638    char m;
639
640    // initially set the indexes into the middle of the strings
641    iter.setIndexOnly(srcMiddle);
642
643    // move around and compare the iteration code points with
644    // the expected ones
645    const char *move=moves;
646    while((m=*move++)!=0) {
647        if(m=='-') {
648            c1=iter.previous();
649            c2=iter32.previous();
650        } else if(m=='0') {
651            c1=iter.current();
652            c2=iter32.current();
653        } else /* m=='+' */ {
654            c1=iter.next();
655            c2=iter32.next();
656        }
657
658        // compare results
659        if(c1!=c2) {
660            // copy the moves until the current (m) move, and terminate
661            char history[64];
662            uprv_strcpy(history, moves);
663            history[move-moves]=0;
664            dataerrln("error: mismatch in Normalizer iteration (%s) at %s: "
665                  "got c1=U+%04lx != expected c2=U+%04lx",
666                  name, history, c1, c2);
667            break;
668        }
669
670        // compare indexes
671        if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
672            // copy the moves until the current (m) move, and terminate
673            char history[64];
674            uprv_strcpy(history, moves);
675            history[move-moves]=0;
676            errln("error: index mismatch in Normalizer iteration (%s) at %s: "
677                  "Normalizer index %ld expected %ld\n",
678                  name, history, iter.getIndex(), expectIndex[iter32.getIndex()]);
679            break;
680        }
681    }
682}
683
684void
685BasicNormalizerTest::TestPreviousNext() {
686    // src and expect strings
687    static const UChar src[]={
688        UTF16_LEAD(0x2f999), UTF16_TRAIL(0x2f999),
689        UTF16_LEAD(0x1d15f), UTF16_TRAIL(0x1d15f),
690        0xc4,
691        0x1ed0
692    };
693    static const UChar32 expect[]={
694        0x831d,
695        0x1d158, 0x1d165,
696        0x41, 0x308,
697        0x4f, 0x302, 0x301
698    };
699
700    // expected src indexes corresponding to expect indexes
701    static const int32_t expectIndex[]={
702        0,
703        2, 2,
704        4, 4,
705        5, 5, 5,
706        6 // behind last character
707    };
708
709    // src and expect strings for regression test for j2911
710    static const UChar src_j2911[]={
711        UTF16_LEAD(0x2f999), UTF16_TRAIL(0x2f999),
712        0xdd00, 0xd900, // unpaired surrogates - regression test for j2911
713        0xc4,
714        0x4f, 0x302, 0x301
715    };
716    static const UChar32 expect_j2911[]={
717        0x831d,
718        0xdd00, 0xd900, // unpaired surrogates - regression test for j2911
719        0xc4,
720        0x1ed0
721    };
722
723    // expected src indexes corresponding to expect indexes
724    static const int32_t expectIndex_j2911[]={
725        0,
726        2, 3,
727        4,
728        5,
729        8 // behind last character
730    };
731
732    // initial indexes into the src and expect strings
733    // for both sets of test data
734    enum {
735        SRC_MIDDLE=4,
736        EXPECT_MIDDLE=3,
737        SRC_MIDDLE_2=2,
738        EXPECT_MIDDLE_2=1
739    };
740
741    // movement vector
742    // - for previous(), 0 for current(), + for next()
743    // for both sets of test data
744    static const char *const moves="0+0+0--0-0-+++0--+++++++0--------";
745
746    TestPreviousNext(src, LENGTHOF(src),
747                     expect, LENGTHOF(expect),
748                     expectIndex,
749                     SRC_MIDDLE, EXPECT_MIDDLE,
750                     moves, UNORM_NFD, "basic");
751
752    TestPreviousNext(src_j2911, LENGTHOF(src_j2911),
753                     expect_j2911, LENGTHOF(expect_j2911),
754                     expectIndex_j2911,
755                     SRC_MIDDLE, EXPECT_MIDDLE,
756                     moves, UNORM_NFKC, "j2911");
757
758    // try again from different "middle" indexes
759    TestPreviousNext(src, LENGTHOF(src),
760                     expect, LENGTHOF(expect),
761                     expectIndex,
762                     SRC_MIDDLE_2, EXPECT_MIDDLE_2,
763                     moves, UNORM_NFD, "basic_2");
764
765    TestPreviousNext(src_j2911, LENGTHOF(src_j2911),
766                     expect_j2911, LENGTHOF(expect_j2911),
767                     expectIndex_j2911,
768                     SRC_MIDDLE_2, EXPECT_MIDDLE_2,
769                     moves, UNORM_NFKC, "j2911_2");
770}
771
772void BasicNormalizerTest::TestConcatenate() {
773    static const char *const
774    cases[][4]={
775        /* mode, left, right, result */
776        {
777            "C",
778            "re",
779            "\\u0301sum\\u00e9",
780            "r\\u00e9sum\\u00e9"
781        },
782        {
783            "C",
784            "a\\u1100",
785            "\\u1161bcdefghijk",
786            "a\\uac00bcdefghijk"
787        },
788        /* ### TODO: add more interesting cases */
789        {
790            "D",
791            "\\u03B1\\u0345",
792            "\\u0C4D\\U000110BA\\U0001D169",
793            "\\u03B1\\U0001D169\\U000110BA\\u0C4D\\u0345"
794        }
795    };
796
797    UnicodeString left, right, expect, result, r;
798    UErrorCode errorCode;
799    UNormalizationMode mode;
800    int32_t i;
801
802    /* test concatenation */
803    for(i=0; i<(int32_t)(sizeof(cases)/sizeof(cases[0])); ++i) {
804        switch(*cases[i][0]) {
805        case 'C': mode=UNORM_NFC; break;
806        case 'D': mode=UNORM_NFD; break;
807        case 'c': mode=UNORM_NFKC; break;
808        case 'd': mode=UNORM_NFKD; break;
809        default: mode=UNORM_NONE; break;
810        }
811
812        left=UnicodeString(cases[i][1], "").unescape();
813        right=UnicodeString(cases[i][2], "").unescape();
814        expect=UnicodeString(cases[i][3], "").unescape();
815
816        //result=r=UnicodeString();
817        errorCode=U_ZERO_ERROR;
818
819        r=Normalizer::concatenate(left, right, result, mode, 0, errorCode);
820        if(U_FAILURE(errorCode) || /*result!=r ||*/ result!=expect) {
821            dataerrln("error in Normalizer::concatenate(), cases[] fails with "+
822                UnicodeString(u_errorName(errorCode))+", result==expect: expected: "+
823                hex(expect)+" =========> got: " + hex(result));
824        }
825    }
826
827    /* test error cases */
828
829    /* left.getBuffer()==result.getBuffer() */
830    result=r=expect=UnicodeString("zz", "");
831    errorCode=U_UNEXPECTED_TOKEN;
832    r=Normalizer::concatenate(left, right, result, mode, 0, errorCode);
833    if(errorCode!=U_UNEXPECTED_TOKEN || result!=r || !result.isBogus()) {
834        errln("error in Normalizer::concatenate(), violates UErrorCode protocol");
835    }
836
837    left.setToBogus();
838    errorCode=U_ZERO_ERROR;
839    r=Normalizer::concatenate(left, right, result, mode, 0, errorCode);
840    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || result!=r || !result.isBogus()) {
841        errln("error in Normalizer::concatenate(), does not detect left.isBogus()");
842    }
843}
844
845// reference implementation of Normalizer::compare
846static int32_t
847ref_norm_compare(const UnicodeString &s1, const UnicodeString &s2, uint32_t options, UErrorCode &errorCode) {
848    UnicodeString r1, r2, t1, t2;
849    int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
850
851    if(options&U_COMPARE_IGNORE_CASE) {
852        Normalizer::decompose(s1, FALSE, normOptions, r1, errorCode);
853        Normalizer::decompose(s2, FALSE, normOptions, r2, errorCode);
854
855        r1.foldCase(options);
856        r2.foldCase(options);
857    } else {
858        r1=s1;
859        r2=s2;
860    }
861
862    Normalizer::decompose(r1, FALSE, normOptions, t1, errorCode);
863    Normalizer::decompose(r2, FALSE, normOptions, t2, errorCode);
864
865    if(options&U_COMPARE_CODE_POINT_ORDER) {
866        return t1.compareCodePointOrder(t2);
867    } else {
868        return t1.compare(t2);
869    }
870}
871
872// test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately
873static int32_t
874_norm_compare(const UnicodeString &s1, const UnicodeString &s2, uint32_t options, UErrorCode &errorCode) {
875    int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
876
877    if( UNORM_YES==Normalizer::quickCheck(s1, UNORM_FCD, normOptions, errorCode) &&
878        UNORM_YES==Normalizer::quickCheck(s2, UNORM_FCD, normOptions, errorCode)) {
879        options|=UNORM_INPUT_IS_FCD;
880    }
881
882    return Normalizer::compare(s1, s2, options, errorCode);
883}
884
885// reference implementation of UnicodeString::caseCompare
886static int32_t
887ref_case_compare(const UnicodeString &s1, const UnicodeString &s2, uint32_t options) {
888    UnicodeString t1, t2;
889
890    t1=s1;
891    t2=s2;
892
893    t1.foldCase(options);
894    t2.foldCase(options);
895
896    if(options&U_COMPARE_CODE_POINT_ORDER) {
897        return t1.compareCodePointOrder(t2);
898    } else {
899        return t1.compare(t2);
900    }
901}
902
// Reduce an integer to -1/0/1 according to its sign.
// Written with plain comparisons: right-shifting a negative signed value
// (the former "(value>>31)|1") is implementation-defined before C++20,
// so the result was not guaranteed on every compiler.
static inline int32_t
_sign(int32_t value) {
    if(value==0) {
        return 0;
    } else if(value<0) {
        return -1;
    } else /* value>0 */ {
        return 1;
    }
}
912
// Map a comparison result to a short label for test messages:
// "<0" for negative values, "=0" for zero, ">0" for positive values.
static const char *
_signString(int32_t value) {
    if(value==0) {
        return "=0";
    }
    return value<0 ? "<0" : ">0";
}
923
924void
925BasicNormalizerTest::TestCompare() {
926    // test Normalizer::compare and unorm_compare (thinly wrapped by the former)
927    // by comparing it with its semantic equivalent
928    // since we trust the pieces, this is sufficient
929
930    // test each string with itself and each other
931    // each time with all options
932    static const char *const
933    strings[]={
934        // some cases from NormalizationTest.txt
935        // 0..3
936        "D\\u031B\\u0307\\u0323",
937        "\\u1E0C\\u031B\\u0307",
938        "D\\u031B\\u0323\\u0307",
939        "d\\u031B\\u0323\\u0307",
940
941        // 4..6
942        "\\u00E4",
943        "a\\u0308",
944        "A\\u0308",
945
946        // Angstrom sign = A ring
947        // 7..10
948        "\\u212B",
949        "\\u00C5",
950        "A\\u030A",
951        "a\\u030A",
952
953        // 11.14
954        "a\\u059A\\u0316\\u302A\\u032Fb",
955        "a\\u302A\\u0316\\u032F\\u059Ab",
956        "a\\u302A\\u0316\\u032F\\u059Ab",
957        "A\\u059A\\u0316\\u302A\\u032Fb",
958
959        // from ICU case folding tests
960        // 15..20
961        "A\\u00df\\u00b5\\ufb03\\U0001040c\\u0131",
962        "ass\\u03bcffi\\U00010434i",
963        "\\u0061\\u0042\\u0131\\u03a3\\u00df\\ufb03\\ud93f\\udfff",
964        "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udfff",
965        "\\u0041\\u0062\\u0131\\u03c3\\u0053\\u0073\\u0066\\u0046\\u0069\\ud93f\\udfff",
966        "\\u0041\\u0062\\u0069\\u03c3\\u0073\\u0053\\u0046\\u0066\\u0049\\ud93f\\udffd",
967
968        //     U+d800 U+10001   see implementation comment in unorm_cmpEquivFold
969        // vs. U+10000          at bottom - code point order
970        // 21..22
971        "\\ud800\\ud800\\udc01",
972        "\\ud800\\udc00",
973
974        // other code point order tests from ustrtest.cpp
975        // 23..31
976        "\\u20ac\\ud801",
977        "\\u20ac\\ud800\\udc00",
978        "\\ud800",
979        "\\ud800\\uff61",
980        "\\udfff",
981        "\\uff61\\udfff",
982        "\\uff61\\ud800\\udc02",
983        "\\ud800\\udc02",
984        "\\ud84d\\udc56",
985
986        // long strings, see cnormtst.c/TestNormCoverage()
987        // equivalent if case-insensitive
988        // 32..33
989        "\\uAD8B\\uAD8B\\uAD8B\\uAD8B"
990        "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
991        "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
992        "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
993        "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
994        "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
995        "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
996        "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
997        "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"
998        "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
999        "\\uAD8B\\uAD8B\\uAD8B\\uAD8B"
1000        "d\\u031B\\u0307\\u0323",
1001
1002        "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa"
1003        "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1004        "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1005        "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1006        "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1007        "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"
1008        "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
1009        "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
1010        "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"
1011        "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
1012        "\\u1100\\u116f\\u11aa\\uAD8B\\uAD8B\\u1100\\u116f\\u11aa"
1013        "\\u1E0C\\u031B\\u0307",
1014
1015        // some strings that may make a difference whether the compare function
1016        // case-folds or decomposes first
1017        // 34..41
1018        "\\u0360\\u0345\\u0334",
1019        "\\u0360\\u03b9\\u0334",
1020
1021        "\\u0360\\u1f80\\u0334",
1022        "\\u0360\\u03b1\\u0313\\u03b9\\u0334",
1023
1024        "\\u0360\\u1ffc\\u0334",
1025        "\\u0360\\u03c9\\u03b9\\u0334",
1026
1027        "a\\u0360\\u0345\\u0360\\u0345b",
1028        "a\\u0345\\u0360\\u0345\\u0360b",
1029
1030        // interesting cases for canonical caseless match with turkic i handling
1031        // 42..43
1032        "\\u00cc",
1033        "\\u0069\\u0300",
1034
1035        // strings with post-Unicode 3.2 normalization or normalization corrections
1036        // 44..45
1037        "\\u00e4\\u193b\\U0002f868",
1038        "\\u0061\\u193b\\u0308\\u36fc",
1039
1040        // empty string
1041        // 46
1042        ""
1043    };
1044
1045    UnicodeString s[100]; // at least as many items as in strings[] !
1046
1047    // all combinations of options
1048    // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions
1049    // set UNORM_UNICODE_3_2 in one additional combination
1050    static const struct {
1051        uint32_t options;
1052        const char *name;
1053    } opt[]={
1054        { 0, "default" },
1055        { U_COMPARE_CODE_POINT_ORDER, "c.p. order" },
1056        { U_COMPARE_IGNORE_CASE, "ignore case" },
1057        { U_COMPARE_CODE_POINT_ORDER|U_COMPARE_IGNORE_CASE, "c.p. order & ignore case" },
1058        { U_COMPARE_IGNORE_CASE|U_FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i" },
1059        { U_COMPARE_CODE_POINT_ORDER|U_COMPARE_IGNORE_CASE|U_FOLD_CASE_EXCLUDE_SPECIAL_I, "c.p. order & ignore case & special i" },
1060        { UNORM_UNICODE_3_2<<UNORM_COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2" }
1061    };
1062
1063    int32_t i, j, k, count=LENGTHOF(strings);
1064    int32_t result, refResult;
1065
1066    UErrorCode errorCode;
1067
1068    // create the UnicodeStrings
1069    for(i=0; i<count; ++i) {
1070        s[i]=UnicodeString(strings[i], "").unescape();
1071    }
1072
1073    // test them each with each other
1074    for(i=0; i<count; ++i) {
1075        for(j=i; j<count; ++j) {
1076            for(k=0; k<LENGTHOF(opt); ++k) {
1077                // test Normalizer::compare
1078                errorCode=U_ZERO_ERROR;
1079                result=_norm_compare(s[i], s[j], opt[k].options, errorCode);
1080                refResult=ref_norm_compare(s[i], s[j], opt[k].options, errorCode);
1081                if(_sign(result)!=_sign(refResult)) {
1082                    errln("Normalizer::compare(%d, %d, %s)%s should be %s %s",
1083                        i, j, opt[k].name, _signString(result), _signString(refResult),
1084                        U_SUCCESS(errorCode) ? "" : u_errorName(errorCode));
1085                }
1086
1087                // test UnicodeString::caseCompare - same internal implementation function
1088                if(opt[k].options&U_COMPARE_IGNORE_CASE) {
1089                    errorCode=U_ZERO_ERROR;
1090                    result=s[i].caseCompare(s[j], opt[k].options);
1091                    refResult=ref_case_compare(s[i], s[j], opt[k].options);
1092                    if(_sign(result)!=_sign(refResult)) {
1093                        errln("UniStr::caseCompare(%d, %d, %s)%s should be %s %s",
1094                            i, j, opt[k].name, _signString(result), _signString(refResult),
1095                            U_SUCCESS(errorCode) ? "" : u_errorName(errorCode));
1096                    }
1097                }
1098            }
1099        }
1100    }
1101
1102    // test cases with i and I to make sure Turkic works
1103    static const UChar iI[]={ 0x49, 0x69, 0x130, 0x131 };
1104    USerializedSet sset;
1105    UnicodeSet set;
1106
1107    UnicodeString s1, s2;
1108    UChar32 start, end;
1109
1110    // collect all sets into one for contiguous output
1111    for(i=0; i<LENGTHOF(iI); ++i) {
1112        if(unorm_getCanonStartSet(iI[i], &sset)) {
1113            count=uset_getSerializedRangeCount(&sset);
1114            for(j=0; j<count; ++j) {
1115                uset_getSerializedRange(&sset, j, &start, &end);
1116                set.add(start, end);
1117            }
1118        }
1119    }
1120
1121    // test all of these precomposed characters
1122    UnicodeSetIterator it(set);
1123    while(it.nextRange() && !it.isString()) {
1124        start=it.getCodepoint();
1125        end=it.getCodepointEnd();
1126        while(start<=end) {
1127            s1.setTo(start);
1128            errorCode=U_ZERO_ERROR;
1129            Normalizer::decompose(s1, FALSE, 0, s2, errorCode);
1130            if(U_FAILURE(errorCode)) {
1131                dataerrln("Normalizer::decompose(U+%04x) failed: %s", start, u_errorName(errorCode));
1132                return;
1133            }
1134
1135            for(k=0; k<LENGTHOF(opt); ++k) {
1136                // test Normalizer::compare
1137                errorCode=U_ZERO_ERROR;
1138                result=_norm_compare(s1, s2, opt[k].options, errorCode);
1139                refResult=ref_norm_compare(s1, s2, opt[k].options, errorCode);
1140                if(_sign(result)!=_sign(refResult)) {
1141                    errln("Normalizer::compare(U+%04x with its NFD, %s)%s should be %s %s",
1142                        start, opt[k].name, _signString(result), _signString(refResult),
1143                        U_SUCCESS(errorCode) ? "" : u_errorName(errorCode));
1144                }
1145
1146                // test UnicodeString::caseCompare - same internal implementation function
1147                if(opt[k].options&U_COMPARE_IGNORE_CASE) {
1148                    errorCode=U_ZERO_ERROR;
1149                    result=s1.caseCompare(s2, opt[k].options);
1150                    refResult=ref_case_compare(s1, s2, opt[k].options);
1151                    if(_sign(result)!=_sign(refResult)) {
1152                        errln("UniStr::caseCompare(U+%04x with its NFD, %s)%s should be %s %s",
1153                            start, opt[k].name, _signString(result), _signString(refResult),
1154                            U_SUCCESS(errorCode) ? "" : u_errorName(errorCode));
1155                    }
1156                }
1157            }
1158
1159            ++start;
1160        }
1161    }
1162}
1163
1164// verify that case-folding does not un-FCD strings
1165int32_t
1166BasicNormalizerTest::countFoldFCDExceptions(uint32_t foldingOptions) {
1167    UnicodeString s, fold, d;
1168    UChar32 c;
1169    int32_t count;
1170    uint8_t cc, trailCC, foldCC, foldTrailCC;
1171    UNormalizationCheckResult qcResult;
1172    int8_t category;
1173    UBool isNFD;
1174    UErrorCode errorCode;
1175
1176    logln("Test if case folding may un-FCD a string (folding options %04lx)", foldingOptions);
1177
1178    count=0;
1179    for(c=0; c<=0x10ffff; ++c) {
1180        errorCode = U_ZERO_ERROR;
1181        category=u_charType(c);
1182        if(category==U_UNASSIGNED) {
1183            continue; // skip unassigned code points
1184        }
1185        if(c==0xac00) {
1186            c=0xd7a3; // skip Hangul - no case folding there
1187            continue;
1188        }
1189        // skip Han blocks - no case folding there either
1190        if(c==0x3400) {
1191            c=0x4db5;
1192            continue;
1193        }
1194        if(c==0x4e00) {
1195            c=0x9fa5;
1196            continue;
1197        }
1198        if(c==0x20000) {
1199            c=0x2a6d6;
1200            continue;
1201        }
1202
1203        s.setTo(c);
1204
1205        // get leading and trailing cc for c
1206        Normalizer::decompose(s, FALSE, 0, d, errorCode);
1207        isNFD= s==d;
1208        cc=u_getCombiningClass(d.char32At(0));
1209        trailCC=u_getCombiningClass(d.char32At(d.length()-1));
1210
1211        // get leading and trailing cc for the case-folding of c
1212        s.foldCase(foldingOptions);
1213        Normalizer::decompose(s, FALSE, 0, d, errorCode);
1214        foldCC=u_getCombiningClass(d.char32At(0));
1215        foldTrailCC=u_getCombiningClass(d.char32At(d.length()-1));
1216
1217        qcResult=Normalizer::quickCheck(s, UNORM_FCD, errorCode);
1218
1219        if (U_FAILURE(errorCode)) {
1220            ++count;
1221            dataerrln("U+%04lx: Failed with error %s", u_errorName(errorCode));
1222        }
1223
1224        // bad:
1225        // - character maps to empty string: adjacent characters may then need reordering
1226        // - folding has different leading/trailing cc's, and they don't become just 0
1227        // - folding itself is not FCD
1228        if( qcResult!=UNORM_YES ||
1229            s.isEmpty() ||
1230            (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)
1231        ) {
1232            ++count;
1233            dataerrln("U+%04lx: case-folding may un-FCD a string (folding options %04lx)", c, foldingOptions);
1234            dataerrln("  cc %02x trailCC %02x    foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x   quickCheck(folded)=%d", cc, trailCC, d.char32At(0), foldCC, d.char32At(d.length()-1), foldTrailCC, qcResult);
1235            continue;
1236        }
1237
1238        // also bad:
1239        // if a code point is in NFD but its case folding is not, then
1240        // unorm_compare will also fail
1241        if(isNFD && UNORM_YES!=Normalizer::quickCheck(s, UNORM_NFD, errorCode)) {
1242            ++count;
1243            errln("U+%04lx: case-folding un-NFDs this character (folding options %04lx)", c, foldingOptions);
1244        }
1245    }
1246
1247    logln("There are %ld code points for which case-folding may un-FCD a string (folding options %04lx)", count, foldingOptions);
1248    return count;
1249}
1250
1251void
1252BasicNormalizerTest::FindFoldFCDExceptions() {
1253    int32_t count;
1254
1255    count=countFoldFCDExceptions(0);
1256    count+=countFoldFCDExceptions(U_FOLD_CASE_EXCLUDE_SPECIAL_I);
1257    if(count>0) {
1258        /*
1259         * If case-folding un-FCDs any strings, then unorm_compare() must be
1260         * re-implemented.
1261         * It currently assumes that one can check for FCD then case-fold
1262         * and then still have FCD strings for raw decomposition without reordering.
1263         */
1264        dataerrln("error: There are %ld code points for which case-folding may un-FCD a string for all folding options.\n"
1265              "See comment in BasicNormalizerTest::FindFoldFCDExceptions()!", count);
1266    }
1267}
1268
1269/*
1270 * Hardcoded "NF* Skippable" sets, generated from
1271 * Mark Davis' com.ibm.text.UCD.NFSkippable (see ICU4J CVS, module unicodetools).
1272 * Run com.ibm.text.UCD.Main with the option NFSkippable.
1273 *
1274 * Must be updated for each Unicode version.
1275 */
1276static void
1277initExpectedSkippables(UnicodeSet skipSets[UNORM_MODE_COUNT]) {
1278    UErrorCode errorCode=U_ZERO_ERROR;
1279
1280    skipSets[UNORM_NFD].applyPattern(UnicodeString(
1281        "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
1282        "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD"
1283        "\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137"
1284        "\\u0139-\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165"
1285        "\\u0168-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC"
1286        "\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B"
1287        "\\u021E\\u021F\\u0226-\\u0233\\u0300-\\u034E\\u0350-\\u036F"
1288        "\\u0374\\u037E\\u0385-\\u038A\\u038C\\u038E-\\u0390\\u03AA-"
1289        "\\u03B0\\u03CA-\\u03CE\\u03D3\\u03D4\\u0400\\u0401\\u0403\\u0407"
1290        "\\u040C-\\u040E\\u0419\\u0439\\u0450\\u0451\\u0453\\u0457\\u045C"
1291        "-\\u045E\\u0476\\u0477\\u0483-\\u0487\\u04C1\\u04C2\\u04D0-"
1292        "\\u04D3\\u04D6\\u04D7\\u04DA-\\u04DF\\u04E2-\\u04E7\\u04EA-"
1293        "\\u04F5\\u04F8\\u04F9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4"
1294        "\\u05C5\\u05C7\\u0610-\\u061A\\u0622-\\u0626\\u064B-\\u065E"
1295        "\\u0670\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4"
1296        "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
1297        "\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"
1298        "\\u082D\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958"
1299        "-\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33"
1300        "\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C"
1301        "\\u0B48\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD"
1302        "\\u0C48\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA"
1303        "\\u0CCB\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE"
1304        "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
1305        "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
1306        "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
1307        "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
1308        "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u135F\\u1714\\u1734"
1309        "\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75"
1310        "-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34"
1311        "\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA"
1312        "\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8\\u1CED"
1313        "\\u1DC0-\\u1DE6\\u1DFD-\\u1E99\\u1E9B\\u1EA0-\\u1EF9\\u1F00-"
1314        "\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-"
1315        "\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4"
1316        "\\u1FB6-\\u1FBC\\u1FBE\\u1FC1-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-"
1317        "\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFD\\u2000"
1318        "\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"
1319        "\\u212B\\u219A\\u219B\\u21AE\\u21CD-\\u21CF\\u2204\\u2209\\u220C"
1320        "\\u2224\\u2226\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"
1321        "\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"
1322        "\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"
1323        "\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-\\u2DFF\\u302A-"
1324        "\\u302F\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"
1325        "\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"
1326        "\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"
1327        "\\u3099\\u309A\\u309E\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4\\u30B6"
1328        "\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7\\u30C9"
1329        "\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA\\u30DC"
1330        "\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\uA66F\\uA67C\\uA67D\\uA6F0"
1331        "\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D\\uA953"
1332        "\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF"
1333        "\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12"
1334        "\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D"
1335        "\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36"
1336        "\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-"
1337        "\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"
1338        "38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"
1339        "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
1340        "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
1341        "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"
1342        "F800-\\U0002FA1D]"
1343        , ""), errorCode);
1344
1345    skipSets[UNORM_NFC].applyPattern(UnicodeString(
1346        "[^<->A-PR-Za-pr-z\\u00A8\\u00C0-\\u00CF\\u00D1-\\u00D6\\u00D8-"
1347        "\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD\\u00FF-"
1348        "\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121\\u0124"
1349        "\\u0125\\u0128-\\u012D\\u0130\\u0139\\u013A\\u013D\\u013E\\u0143"
1350        "\\u0144\\u0147\\u0148\\u014C-\\u0151\\u0154\\u0155\\u0158-"
1351        "\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168-\\u0171\\u0174-"
1352        "\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7\\u01CD-\\u01DC\\u01DE"
1353        "-\\u01E1\\u01E6-\\u01EB\\u01F4\\u01F5\\u01F8-\\u01FB\\u0200-"
1354        "\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0292\\u0300-\\u034E"
1355        "\\u0350-\\u036F\\u0374\\u037E\\u0387\\u0391\\u0395\\u0397\\u0399"
1356        "\\u039F\\u03A1\\u03A5\\u03A9\\u03AC\\u03AE\\u03B1\\u03B5\\u03B7"
1357        "\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-\\u03CB\\u03CE\\u03D2\\u0406"
1358        "\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423\\u0427\\u042B"
1359        "\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E\\u0443\\u0447"
1360        "\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487\\u04D8\\u04D9"
1361        "\\u04E8\\u04E9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5"
1362        "\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627\\u0648\\u064A-"
1363        "\\u065E\\u0670\\u06C1\\u06D2\\u06D5-\\u06DC\\u06DF-\\u06E4"
1364        "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
1365        "\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"
1366        "\\u082D\\u0928\\u0930\\u0933\\u093C\\u094D\\u0951-\\u0954\\u0958"
1367        "-\\u095F\\u09BC\\u09BE\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF"
1368        "\\u0A33\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD"
1369        "\\u0B3C\\u0B3E\\u0B47\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92"
1370        "\\u0BBE\\u0BC6\\u0BC7\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56"
1371        "\\u0CBC\\u0CBF\\u0CC2\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E"
1372        "\\u0D46\\u0D47\\u0D4D\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF"
1373        "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
1374        "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
1375        "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
1376        "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
1377        "\\u0FC6\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u1100-\\u1112"
1378        "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
1379        "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"
1380        "\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"
1381        "\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"
1382        "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
1383        "\\u1CED\\u1DC0-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F\\u1E12-"
1384        "\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-"
1385        "\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E99\\u1EA0-"
1386        "\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-"
1387        "\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51"
1388        "\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-"
1389        "\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99"
1390        "\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB\\u1FBC\\u1FBE"
1391        "\\u1FBF\\u1FC3\\u1FC6\\u1FC9\\u1FCB\\u1FCC\\u1FD3\\u1FDB\\u1FE3"
1392        "\\u1FEB\\u1FEE\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000"
1393        "\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"
1394        "\\u212B\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"
1395        "\\u220B\\u2223\\u2225\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261"
1396        "\\u2264\\u2265\\u2272\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282"
1397        "\\u2283\\u2286\\u2287\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB"
1398        "\\u22B2-\\u22B5\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-"
1399        "\\u2DFF\\u302A-\\u302F\\u3046\\u304B\\u304D\\u304F\\u3051\\u3053"
1400        "\\u3055\\u3057\\u3059\\u305B\\u305D\\u305F\\u3061\\u3064\\u3066"
1401        "\\u3068\\u306F\\u3072\\u3075\\u3078\\u307B\\u3099\\u309A\\u309D"
1402        "\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1\\u30B3\\u30B5\\u30B7\\u30B9"
1403        "\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4\\u30C6\\u30C8\\u30CF\\u30D2"
1404        "\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2\\u30FD\\uA66F\\uA67C\\uA67D"
1405        "\\uA6F0\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D"
1406        "\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE"
1407        "\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C"
1408        "\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88"
1409        "\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84"
1410        "\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80"
1411        "\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C"
1412        "\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178"
1413        "\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274"
1414        "\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370"
1415        "\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C"
1416        "\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568"
1417        "\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664"
1418        "\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760"
1419        "\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C"
1420        "\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958"
1421        "\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54"
1422        "\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50"
1423        "\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C"
1424        "\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48"
1425        "\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44"
1426        "\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40"
1427        "\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C"
1428        "\\uC058\\uC074\\uC090\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138"
1429        "\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234"
1430        "\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330"
1431        "\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C"
1432        "\\uC448\\uC464\\uC480\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528"
1433        "\\uC544\\uC560\\uC57C\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624"
1434        "\\uC640\\uC65C\\uC678\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720"
1435        "\\uC73C\\uC758\\uC774\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C"
1436        "\\uC838\\uC854\\uC870\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918"
1437        "\\uC934\\uC950\\uC96C\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14"
1438        "\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10"
1439        "\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C"
1440        "\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08"
1441        "\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04"
1442        "\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00"
1443        "\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC"
1444        "\\uD018\\uD034\\uD050\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8"
1445        "\\uD114\\uD130\\uD14C\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4"
1446        "\\uD210\\uD22C\\uD248\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0"
1447        "\\uD30C\\uD328\\uD344\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC"
1448        "\\uD408\\uD424\\uD440\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8"
1449        "\\uD504\\uD520\\uD53C\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4"
1450        "\\uD600\\uD61C\\uD638\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0"
1451        "\\uD6FC\\uD718\\uD734\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10"
1452        "\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-"
1453        "\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-"
1454        "\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
1455        "-\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010"
1456        "A38-\\U00010A3A\\U00010A3F\\U00011099\\U0001109B\\U000110A5"
1457        "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
1458        "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
1459        "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"
1460        "F800-\\U0002FA1D]"
1461        , ""), errorCode);
1462
1463    skipSets[UNORM_NFKD].applyPattern(UnicodeString(
1464        "[^\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5\\u00B8-\\u00BA"
1465        "\\u00BC-\\u00BE\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6"
1466        "\\u00D9-\\u00DD\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6"
1467        "\\u00F9-\\u00FD\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130"
1468        "\\u0132-\\u0137\\u0139-\\u0140\\u0143-\\u0149\\u014C-\\u0151"
1469        "\\u0154-\\u0165\\u0168-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0"
1470        "\\u01C4-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B"
1471        "\\u021E\\u021F\\u0226-\\u0233\\u02B0-\\u02B8\\u02D8-\\u02DD"
1472        "\\u02E0-\\u02E4\\u0300-\\u034E\\u0350-\\u036F\\u0374\\u037A"
1473        "\\u037E\\u0384-\\u038A\\u038C\\u038E-\\u0390\\u03AA-\\u03B0"
1474        "\\u03CA-\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
1475        "\\u03F9\\u0400\\u0401\\u0403\\u0407\\u040C-\\u040E\\u0419\\u0439"
1476        "\\u0450\\u0451\\u0453\\u0457\\u045C-\\u045E\\u0476\\u0477\\u0483"
1477        "-\\u0487\\u04C1\\u04C2\\u04D0-\\u04D3\\u04D6\\u04D7\\u04DA-"
1478        "\\u04DF\\u04E2-\\u04E7\\u04EA-\\u04F5\\u04F8\\u04F9\\u0587"
1479        "\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610"
1480        "-\\u061A\\u0622-\\u0626\\u064B-\\u065E\\u0670\\u0675-\\u0678"
1481        "\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7"
1482        "\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-\\u07F3"
1483        "\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-\\u082D"
1484        "\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958-"
1485        "\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36"
1486        "\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B48"
1487        "\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD\\u0C48"
1488        "\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA\\u0CCB"
1489        "\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE\\u0E33"
1490        "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-"
1491        "\\u0ECB\\u0EDC\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39"
1492        "\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80"
1493        "-\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
1494        "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u10FC\\u135F\\u1714"
1495        "\\u1734\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60"
1496        "\\u1A75-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12"
1497        "\\u1B34\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73"
1498        "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
1499        "\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"
1500        "\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E9B\\u1EA0-\\u1EF9"
1501        "\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
1502        "\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-"
1503        "\\u1FB4\\u1FB6-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-"
1504        "\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFE\\u2000-\\u200A\\u2011"
1505        "\\u2017\\u2024-\\u2026\\u202F\\u2033\\u2034\\u2036\\u2037\\u203C"
1506        "\\u203E\\u2047-\\u2049\\u2057\\u205F\\u2070\\u2071\\u2074-"
1507        "\\u208E\\u2090-\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1\\u20E5-"
1508        "\\u20F0\\u2100-\\u2103\\u2105-\\u2107\\u2109-\\u2113\\u2115"
1509        "\\u2116\\u2119-\\u211D\\u2120-\\u2122\\u2124\\u2126\\u2128"
1510        "\\u212A-\\u212D\\u212F-\\u2131\\u2133-\\u2139\\u213B-\\u2140"
1511        "\\u2145-\\u2149\\u2150-\\u217F\\u2189\\u219A\\u219B\\u21AE"
1512        "\\u21CD-\\u21CF\\u2204\\u2209\\u220C\\u2224\\u2226\\u222C\\u222D"
1513        "\\u222F\\u2230\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"
1514        "\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"
1515        "\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"
1516        "\\u2329\\u232A\\u2460-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC"
1517        "\\u2C7C\\u2C7D\\u2CEF-\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F"
1518        "\\u2EF3\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F\\u3036\\u3038-"
1519        "\\u303A\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"
1520        "\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"
1521        "\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"
1522        "\\u3099-\\u309C\\u309E\\u309F\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4"
1523        "\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7"
1524        "\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA"
1525        "\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\u30FF\\u3131-"
1526        "\\u318E\\u3192-\\u319F\\u3200-\\u321E\\u3220-\\u3247\\u3250-"
1527        "\\u327E\\u3280-\\u32FE\\u3300-\\u33FF\\uA66F\\uA67C\\uA67D"
1528        "\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-"
1529        "\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8"
1530        "\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D"
1531        "\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A"
1532        "-\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-"
1533        "\\uFB17\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41"
1534        "\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F"
1535        "\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-\\uFE19\\uFE20-\\uFE26"
1536        "\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B"
1537        "\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC\\uFF01-\\uFFBE\\uFFC2-"
1538        "\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC\\uFFE0-"
1539        "\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"
1540        "38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"
1541        "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
1542        "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
1543        "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0001"
1544        "D400-\\U0001D454\\U0001D456-\\U0001D49C\\U0001D49E\\U0001D49F"
1545        "\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4A9-\\U0001D4AC\\U0001D"
1546        "4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-\\U0001D4C3\\U0001D4C5-"
1547        "\\U0001D505\\U0001D507-\\U0001D50A\\U0001D50D-\\U0001D514\\U0001"
1548        "D516-\\U0001D51C\\U0001D51E-\\U0001D539\\U0001D53B-\\U0001D53E"
1549        "\\U0001D540-\\U0001D544\\U0001D546\\U0001D54A-\\U0001D550\\U0001"
1550        "D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF"
1551        "\\U0001F100-\\U0001F10A\\U0001F110-\\U0001F12E\\U0001F131\\U0001"
1552        "F13D\\U0001F13F\\U0001F142\\U0001F146\\U0001F14A-\\U0001F14E"
1553        "\\U0001F190\\U0001F200\\U0001F210-\\U0001F231\\U0001F240-\\U0001"
1554        "F248\\U0002F800-\\U0002FA1D]"
1555        , ""), errorCode);
1556
1557    skipSets[UNORM_NFKC].applyPattern(UnicodeString(
1558        "[^<->A-PR-Za-pr-z\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5"
1559        "\\u00B8-\\u00BA\\u00BC-\\u00BE\\u00C0-\\u00CF\\u00D1-\\u00D6"
1560        "\\u00D8-\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD"
1561        "\\u00FF-\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121"
1562        "\\u0124\\u0125\\u0128-\\u012D\\u0130\\u0132\\u0133\\u0139\\u013A"
1563        "\\u013D-\\u0140\\u0143\\u0144\\u0147-\\u0149\\u014C-\\u0151"
1564        "\\u0154\\u0155\\u0158-\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168"
1565        "-\\u0171\\u0174-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7"
1566        "\\u01C4-\\u01DC\\u01DE-\\u01E1\\u01E6-\\u01EB\\u01F1-\\u01F5"
1567        "\\u01F8-\\u01FB\\u0200-\\u021B\\u021E\\u021F\\u0226-\\u0233"
1568        "\\u0292\\u02B0-\\u02B8\\u02D8-\\u02DD\\u02E0-\\u02E4\\u0300-"
1569        "\\u034E\\u0350-\\u036F\\u0374\\u037A\\u037E\\u0384\\u0385\\u0387"
1570        "\\u0391\\u0395\\u0397\\u0399\\u039F\\u03A1\\u03A5\\u03A9\\u03AC"
1571        "\\u03AE\\u03B1\\u03B5\\u03B7\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-"
1572        "\\u03CB\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
1573        "\\u03F9\\u0406\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423"
1574        "\\u0427\\u042B\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E"
1575        "\\u0443\\u0447\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487"
1576        "\\u04D8\\u04D9\\u04E8\\u04E9\\u0587\\u0591-\\u05BD\\u05BF\\u05C1"
1577        "\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627"
1578        "\\u0648\\u064A-\\u065E\\u0670\\u0675-\\u0678\\u06C1\\u06D2"
1579        "\\u06D5-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED"
1580        "\\u0711\\u0730-\\u074A\\u07EB-\\u07F3\\u0816-\\u0819\\u081B-"
1581        "\\u0823\\u0825-\\u0827\\u0829-\\u082D\\u0928\\u0930\\u0933"
1582        "\\u093C\\u094D\\u0951-\\u0954\\u0958-\\u095F\\u09BC\\u09BE"
1583        "\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36\\u0A3C"
1584        "\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B3E\\u0B47"
1585        "\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92\\u0BBE\\u0BC6\\u0BC7"
1586        "\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CBF\\u0CC2"
1587        "\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E\\u0D46\\u0D47\\u0D4D"
1588        "\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF\\u0E33\\u0E38-\\u0E3A"
1589        "\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-\\u0ECB\\u0EDC"
1590        "\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D"
1591        "\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80-\\u0F84"
1592        "\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9\\u0FC6"
1593        "\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u10FC\\u1100-\\u1112"
1594        "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
1595        "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"
1596        "\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"
1597        "\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"
1598        "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
1599        "\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"
1600        "\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F"
1601        "\\u1E12-\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53"
1602        "\\u1E58-\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E9B"
1603        "\\u1EA0-\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19"
1604        "\\u1F20-\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50"
1605        "\\u1F51\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79"
1606        "\\u1F7B-\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98"
1607        "\\u1F99\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB-\\u1FC1"
1608        "\\u1FC3\\u1FC6\\u1FC9\\u1FCB-\\u1FCF\\u1FD3\\u1FDB\\u1FDD-"
1609        "\\u1FDF\\u1FE3\\u1FEB\\u1FED-\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB"
1610        "-\\u1FFE\\u2000-\\u200A\\u2011\\u2017\\u2024-\\u2026\\u202F"
1611        "\\u2033\\u2034\\u2036\\u2037\\u203C\\u203E\\u2047-\\u2049\\u2057"
1612        "\\u205F\\u2070\\u2071\\u2074-\\u208E\\u2090-\\u2094\\u20A8"
1613        "\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2100-\\u2103\\u2105-"
1614        "\\u2107\\u2109-\\u2113\\u2115\\u2116\\u2119-\\u211D\\u2120-"
1615        "\\u2122\\u2124\\u2126\\u2128\\u212A-\\u212D\\u212F-\\u2131"
1616        "\\u2133-\\u2139\\u213B-\\u2140\\u2145-\\u2149\\u2150-\\u217F"
1617        "\\u2189\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"
1618        "\\u220B\\u2223\\u2225\\u222C\\u222D\\u222F\\u2230\\u223C\\u2243"
1619        "\\u2245\\u2248\\u224D\\u2261\\u2264\\u2265\\u2272\\u2273\\u2276"
1620        "\\u2277\\u227A-\\u227D\\u2282\\u2283\\u2286\\u2287\\u2291\\u2292"
1621        "\\u22A2\\u22A8\\u22A9\\u22AB\\u22B2-\\u22B5\\u2329\\u232A\\u2460"
1622        "-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2CEF-"
1623        "\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F\\u2EF3\\u2F00-\\u2FD5"
1624        "\\u3000\\u302A-\\u302F\\u3036\\u3038-\\u303A\\u3046\\u304B"
1625        "\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059\\u305B\\u305D"
1626        "\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072\\u3075\\u3078"
1627        "\\u307B\\u3099-\\u309D\\u309F\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1"
1628        "\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4"
1629        "\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2"
1630        "\\u30FD\\u30FF\\u3131-\\u318E\\u3192-\\u319F\\u3200-\\u321E"
1631        "\\u3220-\\u3247\\u3250-\\u327E\\u3280-\\u32FE\\u3300-\\u33FF"
1632        "\\uA66F\\uA67C\\uA67D\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-"
1633        "\\uA8F1\\uA92B-\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-"
1634        "\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C"
1635        "\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18"
1636        "\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14"
1637        "\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10"
1638        "\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C"
1639        "\\uB028\\uB044\\uB060\\uB07C\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108"
1640        "\\uB124\\uB140\\uB15C\\uB178\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204"
1641        "\\uB220\\uB23C\\uB258\\uB274\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300"
1642        "\\uB31C\\uB338\\uB354\\uB370\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC"
1643        "\\uB418\\uB434\\uB450\\uB46C\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8"
1644        "\\uB514\\uB530\\uB54C\\uB568\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4"
1645        "\\uB610\\uB62C\\uB648\\uB664\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0"
1646        "\\uB70C\\uB728\\uB744\\uB760\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC"
1647        "\\uB808\\uB824\\uB840\\uB85C\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8"
1648        "\\uB904\\uB920\\uB93C\\uB958\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4"
1649        "\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0"
1650        "\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC"
1651        "\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8"
1652        "\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4"
1653        "\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0"
1654        "\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC"
1655        "\\uBFE8\\uC004\\uC020\\uC03C\\uC058\\uC074\\uC090\\uC0AC\\uC0C8"
1656        "\\uC0E4\\uC100\\uC11C\\uC138\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4"
1657        "\\uC1E0\\uC1FC\\uC218\\uC234\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0"
1658        "\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC"
1659        "\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448\\uC464\\uC480\\uC49C\\uC4B8"
1660        "\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544\\uC560\\uC57C\\uC598\\uC5B4"
1661        "\\uC5D0\\uC5EC\\uC608\\uC624\\uC640\\uC65C\\uC678\\uC694\\uC6B0"
1662        "\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C\\uC758\\uC774\\uC790\\uC7AC"
1663        "\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838\\uC854\\uC870\\uC88C\\uC8A8"
1664        "\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934\\uC950\\uC96C\\uC988\\uC9A4"
1665        "\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0"
1666        "\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C"
1667        "\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98"
1668        "\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94"
1669        "\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90"
1670        "\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C"
1671        "\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018\\uD034\\uD050\\uD06C\\uD088"
1672        "\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114\\uD130\\uD14C\\uD168\\uD184"
1673        "\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210\\uD22C\\uD248\\uD264\\uD280"
1674        "\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C\\uD328\\uD344\\uD360\\uD37C"
1675        "\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408\\uD424\\uD440\\uD45C\\uD478"
1676        "\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504\\uD520\\uD53C\\uD558\\uD574"
1677        "\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600\\uD61C\\uD638\\uD654\\uD670"
1678        "\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC\\uD718\\uD734\\uD750\\uD76C"
1679        "\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"
1680        "\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6D\\uFA70-"
1681        "\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB36\\uFB38-"
1682        "\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3"
1683        "-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-"
1684        "\\uFE19\\uFE20-\\uFE26\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-"
1685        "\\uFE66\\uFE68-\\uFE6B\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC"
1686        "\\uFF01-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7"
1687        "\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010"
1688        "A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010A3F\\U00011099"
1689        "\\U0001109B\\U000110A5\\U000110B9\\U000110BA\\U0001D15E-\\U0001D"
1690        "169\\U0001D16D-\\U0001D172\\U0001D17B-\\U0001D182\\U0001D185-"
1691        "\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001"
1692        "D242-\\U0001D244\\U0001D400-\\U0001D454\\U0001D456-\\U0001D49C"
1693        "\\U0001D49E\\U0001D49F\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4"
1694        "A9-\\U0001D4AC\\U0001D4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-"
1695        "\\U0001D4C3\\U0001D4C5-\\U0001D505\\U0001D507-\\U0001D50A\\U0001"
1696        "D50D-\\U0001D514\\U0001D516-\\U0001D51C\\U0001D51E-\\U0001D539"
1697        "\\U0001D53B-\\U0001D53E\\U0001D540-\\U0001D544\\U0001D546\\U0001"
1698        "D54A-\\U0001D550\\U0001D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB"
1699        "\\U0001D7CE-\\U0001D7FF\\U0001F100-\\U0001F10A\\U0001F110-"
1700        "\\U0001F12E\\U0001F131\\U0001F13D\\U0001F13F\\U0001F142\\U0001F1"
1701        "46\\U0001F14A-\\U0001F14E\\U0001F190\\U0001F200\\U0001F210-"
1702        "\\U0001F231\\U0001F240-\\U0001F248\\U0002F800-\\U0002FA1D]"
1703        , ""), errorCode);
1704}
1705
1706U_CDECL_BEGIN
1707
1708// USetAdder implementation
1709// Does not use uset.h to reduce code dependencies
1710static void U_CALLCONV
1711_set_add(USet *set, UChar32 c) {
1712    uset_add(set, c);
1713}
1714
1715static void U_CALLCONV
1716_set_addRange(USet *set, UChar32 start, UChar32 end) {
1717    uset_addRange(set, start, end);
1718}
1719
1720static void U_CALLCONV
1721_set_addString(USet *set, const UChar *str, int32_t length) {
1722    uset_addString(set, str, length);
1723}
1724
1725U_CDECL_END
1726
1727void
1728BasicNormalizerTest::TestSkippable() {
1729    UnicodeSet diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT];
1730    UnicodeString s, pattern;
1731
1732    /* build NF*Skippable sets from runtime data */
1733    IcuTestErrorCode errorCode(*this, "TestSkippable");
1734    skipSets[UNORM_NFD].applyPattern(UNICODE_STRING_SIMPLE("[:NFD_Inert:]"), errorCode);
1735    skipSets[UNORM_NFKD].applyPattern(UNICODE_STRING_SIMPLE("[:NFKD_Inert:]"), errorCode);
1736    skipSets[UNORM_NFC].applyPattern(UNICODE_STRING_SIMPLE("[:NFC_Inert:]"), errorCode);
1737    skipSets[UNORM_NFKC].applyPattern(UNICODE_STRING_SIMPLE("[:NFKC_Inert:]"), errorCode);
1738    if(errorCode.logDataIfFailureAndReset("UnicodeSet(NF..._Inert) failed")) {
1739        return;
1740    }
1741
1742    /* get expected sets from hardcoded patterns */
1743    initExpectedSkippables(expectSets);
1744
1745    for(int32_t i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
1746        if(skipSets[i]!=expectSets[i]) {
1747            errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n"
1748                  "may need to update hardcoded UnicodeSet patterns in\n"
1749                  "tstnorm.cpp/initExpectedSkippables(),\n"
1750                  "see ICU4J - unicodetools.com.ibm.text.UCD.NFSkippable\n",
1751                  i, i);
1752
1753            s=UNICODE_STRING_SIMPLE("skip-expect=");
1754            (diff=skipSets[i]).removeAll(expectSets[i]).toPattern(pattern, TRUE);
1755            s.append(pattern);
1756
1757            pattern.remove();
1758            s.append(UNICODE_STRING_SIMPLE("\n\nexpect-skip="));
1759            (diff=expectSets[i]).removeAll(skipSets[i]).toPattern(pattern, TRUE);
1760            s.append(pattern);
1761            s.append(UNICODE_STRING_SIMPLE("\n\n"));
1762
1763            errln(s);
1764        }
1765    }
1766}
1767
// A pair of C strings describing one test case.
struct StringPair {
    const char *input;     // text handed to the code under test
    const char *expected;  // text the result is compared against
};
1769
1770void
1771BasicNormalizerTest::TestCustomComp() {
1772    static const StringPair pairs[]={
1773        { "\\uD801\\uE000\\uDFFE", "" },
1774        { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
1775        { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
1776        { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" },
1777        { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
1778        { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
1779        { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
1780        { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
1781    };
1782    IcuTestErrorCode errorCode(*this, "BasicNormalizerTest/TestCustomComp");
1783    const Normalizer2 *customNorm2=
1784        Normalizer2::getInstance(loadTestData(errorCode), "testnorm",
1785                                 UNORM2_COMPOSE, errorCode);
1786    if(errorCode.logIfFailureAndReset("unable to load testdata/testnorm.nrm")) {
1787        return;
1788    }
1789    for(int32_t i=0; i<LENGTHOF(pairs); ++i) {
1790        const StringPair &pair=pairs[i];
1791        UnicodeString input=UnicodeString(pair.input, -1, US_INV).unescape();
1792        UnicodeString expected=UnicodeString(pair.expected, -1, US_INV).unescape();
1793        UnicodeString result=customNorm2->normalize(input, errorCode);
1794        if(result!=expected) {
1795            errln("custom compose Normalizer2 did not normalize input %d as expected", i);
1796        }
1797    }
1798}
1799
1800void
1801BasicNormalizerTest::TestCustomFCC() {
1802    static const StringPair pairs[]={
1803        { "\\uD801\\uE000\\uDFFE", "" },
1804        { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
1805        { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
1806        // The following expected result is different from CustomComp
1807        // because of only-contiguous composition.
1808        { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" },
1809        { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
1810        { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
1811        { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
1812        { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
1813    };
1814    IcuTestErrorCode errorCode(*this, "BasicNormalizerTest/TestCustomFCC");
1815    const Normalizer2 *customNorm2=
1816        Normalizer2::getInstance(loadTestData(errorCode), "testnorm",
1817                                 UNORM2_COMPOSE_CONTIGUOUS, errorCode);
1818    if(errorCode.logIfFailureAndReset("unable to load testdata/testnorm.nrm")) {
1819        return;
1820    }
1821    for(int32_t i=0; i<LENGTHOF(pairs); ++i) {
1822        const StringPair &pair=pairs[i];
1823        UnicodeString input=UnicodeString(pair.input, -1, US_INV).unescape();
1824        UnicodeString expected=UnicodeString(pair.expected, -1, US_INV).unescape();
1825        UnicodeString result=customNorm2->normalize(input, errorCode);
1826        if(result!=expected) {
1827            errln("custom FCC Normalizer2 did not normalize input %d as expected", i);
1828        }
1829    }
1830}
1831
1832#endif /* #if !UCONFIG_NO_NORMALIZATION */
1833