1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2009-2016, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  normalizer2.cpp
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2009nov22
16*   created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_NORMALIZATION
22
23#include "unicode/normalizer2.h"
24#include "unicode/unistr.h"
25#include "unicode/unorm.h"
26#include "cstring.h"
27#include "mutex.h"
28#include "norm2allmodes.h"
29#include "normalizer2impl.h"
30#include "uassert.h"
31#include "ucln_cmn.h"
32
33using icu::Normalizer2Impl;
34
35// NFC/NFD data machine-generated by gennorm2 --csource
36#define INCLUDED_FROM_NORMALIZER2_CPP
37#include "norm2_nfc_data.h"
38
39U_NAMESPACE_BEGIN
40
41// Public API dispatch via Normalizer2 subclasses -------------------------- ***
42
43Normalizer2::~Normalizer2() {}
44
45UBool
46Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
47    return FALSE;
48}
49
50UChar32
51Normalizer2::composePair(UChar32, UChar32) const {
52    return U_SENTINEL;
53}
54
55uint8_t
56Normalizer2::getCombiningClass(UChar32 /*c*/) const {
57    return 0;
58}
59
60// Normalizer2 implementation for the old UNORM_NONE.
61class NoopNormalizer2 : public Normalizer2 {
62    virtual ~NoopNormalizer2();
63
64    virtual UnicodeString &
65    normalize(const UnicodeString &src,
66              UnicodeString &dest,
67              UErrorCode &errorCode) const {
68        if(U_SUCCESS(errorCode)) {
69            if(&dest!=&src) {
70                dest=src;
71            } else {
72                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
73            }
74        }
75        return dest;
76    }
77    virtual UnicodeString &
78    normalizeSecondAndAppend(UnicodeString &first,
79                             const UnicodeString &second,
80                             UErrorCode &errorCode) const {
81        if(U_SUCCESS(errorCode)) {
82            if(&first!=&second) {
83                first.append(second);
84            } else {
85                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
86            }
87        }
88        return first;
89    }
90    virtual UnicodeString &
91    append(UnicodeString &first,
92           const UnicodeString &second,
93           UErrorCode &errorCode) const {
94        if(U_SUCCESS(errorCode)) {
95            if(&first!=&second) {
96                first.append(second);
97            } else {
98                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
99            }
100        }
101        return first;
102    }
103    virtual UBool
104    getDecomposition(UChar32, UnicodeString &) const {
105        return FALSE;
106    }
107    // No need to override the default getRawDecomposition().
108    virtual UBool
109    isNormalized(const UnicodeString &, UErrorCode &) const {
110        return TRUE;
111    }
112    virtual UNormalizationCheckResult
113    quickCheck(const UnicodeString &, UErrorCode &) const {
114        return UNORM_YES;
115    }
116    virtual int32_t
117    spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
118        return s.length();
119    }
120    virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
121    virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
122    virtual UBool isInert(UChar32) const { return TRUE; }
123};
124
125NoopNormalizer2::~NoopNormalizer2() {}
126
127Normalizer2WithImpl::~Normalizer2WithImpl() {}
128
129DecomposeNormalizer2::~DecomposeNormalizer2() {}
130
131ComposeNormalizer2::~ComposeNormalizer2() {}
132
133FCDNormalizer2::~FCDNormalizer2() {}
134
135// instance cache ---------------------------------------------------------- ***
136
137Norm2AllModes::~Norm2AllModes() {
138    delete impl;
139}
140
141Norm2AllModes *
142Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
143    if(U_FAILURE(errorCode)) {
144        delete impl;
145        return NULL;
146    }
147    Norm2AllModes *allModes=new Norm2AllModes(impl);
148    if(allModes==NULL) {
149        errorCode=U_MEMORY_ALLOCATION_ERROR;
150        delete impl;
151        return NULL;
152    }
153    return allModes;
154}
155
156Norm2AllModes *
157Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
158    if(U_FAILURE(errorCode)) {
159        return NULL;
160    }
161    Normalizer2Impl *impl=new Normalizer2Impl;
162    if(impl==NULL) {
163        errorCode=U_MEMORY_ALLOCATION_ERROR;
164        return NULL;
165    }
166    impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
167               norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
168    return createInstance(impl, errorCode);
169}
170
171U_CDECL_BEGIN
172static UBool U_CALLCONV uprv_normalizer2_cleanup();
173U_CDECL_END
174
175static Norm2AllModes *nfcSingleton;
176static Normalizer2   *noopSingleton;
177
178static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
179static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
180
181// UInitOnce singleton initialization functions
182static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
183    nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
184    ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
185}
186
187static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
188    if(U_FAILURE(errorCode)) {
189        return;
190    }
191    noopSingleton=new NoopNormalizer2;
192    if(noopSingleton==NULL) {
193        errorCode=U_MEMORY_ALLOCATION_ERROR;
194        return;
195    }
196    ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
197}
198
199U_CDECL_BEGIN
200
201static UBool U_CALLCONV uprv_normalizer2_cleanup() {
202    delete nfcSingleton;
203    nfcSingleton = NULL;
204    delete noopSingleton;
205    noopSingleton = NULL;
206    nfcInitOnce.reset();
207    noopInitOnce.reset();
208    return TRUE;
209}
210
211U_CDECL_END
212
213const Norm2AllModes *
214Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
215    if(U_FAILURE(errorCode)) { return NULL; }
216    umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
217    return nfcSingleton;
218}
219
220const Normalizer2 *
221Normalizer2::getNFCInstance(UErrorCode &errorCode) {
222    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
223    return allModes!=NULL ? &allModes->comp : NULL;
224}
225
226const Normalizer2 *
227Normalizer2::getNFDInstance(UErrorCode &errorCode) {
228    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
229    return allModes!=NULL ? &allModes->decomp : NULL;
230}
231
232const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
233    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
234    return allModes!=NULL ? &allModes->fcd : NULL;
235}
236
237const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
238    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
239    return allModes!=NULL ? &allModes->fcc : NULL;
240}
241
242const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
243    if(U_FAILURE(errorCode)) { return NULL; }
244    umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
245    return noopSingleton;
246}
247
248const Normalizer2Impl *
249Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
250    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
251    return allModes!=NULL ? allModes->impl : NULL;
252}
253
254const Normalizer2Impl *
255Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
256    return &((Normalizer2WithImpl *)norm2)->impl;
257}
258
259U_NAMESPACE_END
260
261// C API ------------------------------------------------------------------- ***
262
263U_NAMESPACE_USE
264
265U_CAPI const UNormalizer2 * U_EXPORT2
266unorm2_getNFCInstance(UErrorCode *pErrorCode) {
267    return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
268}
269
270U_CAPI const UNormalizer2 * U_EXPORT2
271unorm2_getNFDInstance(UErrorCode *pErrorCode) {
272    return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
273}
274
275U_CAPI void U_EXPORT2
276unorm2_close(UNormalizer2 *norm2) {
277    delete (Normalizer2 *)norm2;
278}
279
280U_CAPI int32_t U_EXPORT2
281unorm2_normalize(const UNormalizer2 *norm2,
282                 const UChar *src, int32_t length,
283                 UChar *dest, int32_t capacity,
284                 UErrorCode *pErrorCode) {
285    if(U_FAILURE(*pErrorCode)) {
286        return 0;
287    }
288    if( (src==NULL ? length!=0 : length<-1) ||
289        (dest==NULL ? capacity!=0 : capacity<0) ||
290        (src==dest && src!=NULL)
291    ) {
292        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
293        return 0;
294    }
295    UnicodeString destString(dest, 0, capacity);
296    // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
297    if(length!=0) {
298        const Normalizer2 *n2=(const Normalizer2 *)norm2;
299        const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
300        if(n2wi!=NULL) {
301            // Avoid duplicate argument checking and support NUL-terminated src.
302            ReorderingBuffer buffer(n2wi->impl, destString);
303            if(buffer.init(length, *pErrorCode)) {
304                n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
305            }
306        } else {
307            UnicodeString srcString(length<0, src, length);
308            n2->normalize(srcString, destString, *pErrorCode);
309        }
310    }
311    return destString.extract(dest, capacity, *pErrorCode);
312}
313
314static int32_t
315normalizeSecondAndAppend(const UNormalizer2 *norm2,
316                         UChar *first, int32_t firstLength, int32_t firstCapacity,
317                         const UChar *second, int32_t secondLength,
318                         UBool doNormalize,
319                         UErrorCode *pErrorCode) {
320    if(U_FAILURE(*pErrorCode)) {
321        return 0;
322    }
323    if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
324        (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
325                       (firstCapacity<0 || firstLength<-1)) ||
326        (first==second && first!=NULL)
327    ) {
328        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
329        return 0;
330    }
331    UnicodeString firstString(first, firstLength, firstCapacity);
332    firstLength=firstString.length();  // In case it was -1.
333    // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
334    if(secondLength!=0) {
335        const Normalizer2 *n2=(const Normalizer2 *)norm2;
336        const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
337        if(n2wi!=NULL) {
338            // Avoid duplicate argument checking and support NUL-terminated src.
339            UnicodeString safeMiddle;
340            {
341                ReorderingBuffer buffer(n2wi->impl, firstString);
342                if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
343                    n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
344                                             doNormalize, safeMiddle, buffer, *pErrorCode);
345                }
346            }  // The ReorderingBuffer destructor finalizes firstString.
347            if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
348                // Restore the modified suffix of the first string.
349                // This does not restore first[] array contents between firstLength and firstCapacity.
350                // (That might be uninitialized memory, as far as we know.)
351                if(first!=NULL) { /* don't dereference NULL */
352                  safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
353                  if(firstLength<firstCapacity) {
354                    first[firstLength]=0;  // NUL-terminate in case it was originally.
355                  }
356                }
357            }
358        } else {
359            UnicodeString secondString(secondLength<0, second, secondLength);
360            if(doNormalize) {
361                n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
362            } else {
363                n2->append(firstString, secondString, *pErrorCode);
364            }
365        }
366    }
367    return firstString.extract(first, firstCapacity, *pErrorCode);
368}
369
370U_CAPI int32_t U_EXPORT2
371unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
372                                UChar *first, int32_t firstLength, int32_t firstCapacity,
373                                const UChar *second, int32_t secondLength,
374                                UErrorCode *pErrorCode) {
375    return normalizeSecondAndAppend(norm2,
376                                    first, firstLength, firstCapacity,
377                                    second, secondLength,
378                                    TRUE, pErrorCode);
379}
380
381U_CAPI int32_t U_EXPORT2
382unorm2_append(const UNormalizer2 *norm2,
383              UChar *first, int32_t firstLength, int32_t firstCapacity,
384              const UChar *second, int32_t secondLength,
385              UErrorCode *pErrorCode) {
386    return normalizeSecondAndAppend(norm2,
387                                    first, firstLength, firstCapacity,
388                                    second, secondLength,
389                                    FALSE, pErrorCode);
390}
391
392U_CAPI int32_t U_EXPORT2
393unorm2_getDecomposition(const UNormalizer2 *norm2,
394                        UChar32 c, UChar *decomposition, int32_t capacity,
395                        UErrorCode *pErrorCode) {
396    if(U_FAILURE(*pErrorCode)) {
397        return 0;
398    }
399    if(decomposition==NULL ? capacity!=0 : capacity<0) {
400        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
401        return 0;
402    }
403    UnicodeString destString(decomposition, 0, capacity);
404    if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
405        return destString.extract(decomposition, capacity, *pErrorCode);
406    } else {
407        return -1;
408    }
409}
410
411U_CAPI int32_t U_EXPORT2
412unorm2_getRawDecomposition(const UNormalizer2 *norm2,
413                           UChar32 c, UChar *decomposition, int32_t capacity,
414                           UErrorCode *pErrorCode) {
415    if(U_FAILURE(*pErrorCode)) {
416        return 0;
417    }
418    if(decomposition==NULL ? capacity!=0 : capacity<0) {
419        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
420        return 0;
421    }
422    UnicodeString destString(decomposition, 0, capacity);
423    if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
424        return destString.extract(decomposition, capacity, *pErrorCode);
425    } else {
426        return -1;
427    }
428}
429
430U_CAPI UChar32 U_EXPORT2
431unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
432    return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
433}
434
435U_CAPI uint8_t U_EXPORT2
436unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
437    return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
438}
439
440U_CAPI UBool U_EXPORT2
441unorm2_isNormalized(const UNormalizer2 *norm2,
442                    const UChar *s, int32_t length,
443                    UErrorCode *pErrorCode) {
444    if(U_FAILURE(*pErrorCode)) {
445        return 0;
446    }
447    if((s==NULL && length!=0) || length<-1) {
448        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
449        return 0;
450    }
451    UnicodeString sString(length<0, s, length);
452    return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
453}
454
455U_CAPI UNormalizationCheckResult U_EXPORT2
456unorm2_quickCheck(const UNormalizer2 *norm2,
457                  const UChar *s, int32_t length,
458                  UErrorCode *pErrorCode) {
459    if(U_FAILURE(*pErrorCode)) {
460        return UNORM_NO;
461    }
462    if((s==NULL && length!=0) || length<-1) {
463        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
464        return UNORM_NO;
465    }
466    UnicodeString sString(length<0, s, length);
467    return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
468}
469
470U_CAPI int32_t U_EXPORT2
471unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
472                         const UChar *s, int32_t length,
473                         UErrorCode *pErrorCode) {
474    if(U_FAILURE(*pErrorCode)) {
475        return 0;
476    }
477    if((s==NULL && length!=0) || length<-1) {
478        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
479        return 0;
480    }
481    UnicodeString sString(length<0, s, length);
482    return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
483}
484
485U_CAPI UBool U_EXPORT2
486unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
487    return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
488}
489
490U_CAPI UBool U_EXPORT2
491unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
492    return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
493}
494
495U_CAPI UBool U_EXPORT2
496unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
497    return ((const Normalizer2 *)norm2)->isInert(c);
498}
499
500// Some properties APIs ---------------------------------------------------- ***
501
502U_CAPI uint8_t U_EXPORT2
503u_getCombiningClass(UChar32 c) {
504    UErrorCode errorCode=U_ZERO_ERROR;
505    const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
506    if(U_SUCCESS(errorCode)) {
507        return nfd->getCombiningClass(c);
508    } else {
509        return 0;
510    }
511}
512
513U_CFUNC uint16_t
514unorm_getFCD16(UChar32 c) {
515    UErrorCode errorCode=U_ZERO_ERROR;
516    const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
517    if(U_SUCCESS(errorCode)) {
518        return impl->getFCD16(c);
519    } else {
520        return 0;
521    }
522}
523
524#endif  // !UCONFIG_NO_NORMALIZATION
525