1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2009-2012, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  filterednormalizer2.cpp
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2009dec10
16*   created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_NORMALIZATION
22
23#include "unicode/normalizer2.h"
24#include "unicode/uniset.h"
25#include "unicode/unistr.h"
26#include "unicode/unorm.h"
27#include "cpputils.h"
28
29U_NAMESPACE_BEGIN
30
31FilteredNormalizer2::~FilteredNormalizer2() {}
32
33UnicodeString &
34FilteredNormalizer2::normalize(const UnicodeString &src,
35                               UnicodeString &dest,
36                               UErrorCode &errorCode) const {
37    uprv_checkCanGetBuffer(src, errorCode);
38    if(U_FAILURE(errorCode)) {
39        dest.setToBogus();
40        return dest;
41    }
42    if(&dest==&src) {
43        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
44        return dest;
45    }
46    dest.remove();
47    return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
48}
49
50// Internal: No argument checking, and appends to dest.
51// Pass as input spanCondition the one that is likely to yield a non-zero
52// span length at the start of src.
53// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
54// USET_SPAN_SIMPLE should be passed in for the start of src
55// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
56// an in-filter prefix.
57UnicodeString &
58FilteredNormalizer2::normalize(const UnicodeString &src,
59                               UnicodeString &dest,
60                               USetSpanCondition spanCondition,
61                               UErrorCode &errorCode) const {
62    UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
63    for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
64        int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
65        int32_t spanLength=spanLimit-prevSpanLimit;
66        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
67            if(spanLength!=0) {
68                dest.append(src, prevSpanLimit, spanLength);
69            }
70            spanCondition=USET_SPAN_SIMPLE;
71        } else {
72            if(spanLength!=0) {
73                // Not norm2.normalizeSecondAndAppend() because we do not want
74                // to modify the non-filter part of dest.
75                dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
76                                            tempDest, errorCode));
77                if(U_FAILURE(errorCode)) {
78                    break;
79                }
80            }
81            spanCondition=USET_SPAN_NOT_CONTAINED;
82        }
83        prevSpanLimit=spanLimit;
84    }
85    return dest;
86}
87
88UnicodeString &
89FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
90                                              const UnicodeString &second,
91                                              UErrorCode &errorCode) const {
92    return normalizeSecondAndAppend(first, second, TRUE, errorCode);
93}
94
95UnicodeString &
96FilteredNormalizer2::append(UnicodeString &first,
97                            const UnicodeString &second,
98                            UErrorCode &errorCode) const {
99    return normalizeSecondAndAppend(first, second, FALSE, errorCode);
100}
101
102UnicodeString &
103FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
104                                              const UnicodeString &second,
105                                              UBool doNormalize,
106                                              UErrorCode &errorCode) const {
107    uprv_checkCanGetBuffer(first, errorCode);
108    uprv_checkCanGetBuffer(second, errorCode);
109    if(U_FAILURE(errorCode)) {
110        return first;
111    }
112    if(&first==&second) {
113        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
114        return first;
115    }
116    if(first.isEmpty()) {
117        if(doNormalize) {
118            return normalize(second, first, errorCode);
119        } else {
120            return first=second;
121        }
122    }
123    // merge the in-filter suffix of the first string with the in-filter prefix of the second
124    int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
125    if(prefixLimit!=0) {
126        UnicodeString prefix(second.tempSubString(0, prefixLimit));
127        int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
128        if(suffixStart==0) {
129            if(doNormalize) {
130                norm2.normalizeSecondAndAppend(first, prefix, errorCode);
131            } else {
132                norm2.append(first, prefix, errorCode);
133            }
134        } else {
135            UnicodeString middle(first, suffixStart, INT32_MAX);
136            if(doNormalize) {
137                norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
138            } else {
139                norm2.append(middle, prefix, errorCode);
140            }
141            first.replace(suffixStart, INT32_MAX, middle);
142        }
143    }
144    if(prefixLimit<second.length()) {
145        UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
146        if(doNormalize) {
147            normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
148        } else {
149            first.append(rest);
150        }
151    }
152    return first;
153}
154
155UBool
156FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
157    return set.contains(c) && norm2.getDecomposition(c, decomposition);
158}
159
160UBool
161FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
162    return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
163}
164
165UChar32
166FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
167    return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
168}
169
170uint8_t
171FilteredNormalizer2::getCombiningClass(UChar32 c) const {
172    return set.contains(c) ? norm2.getCombiningClass(c) : 0;
173}
174
175UBool
176FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
177    uprv_checkCanGetBuffer(s, errorCode);
178    if(U_FAILURE(errorCode)) {
179        return FALSE;
180    }
181    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
182    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
183        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
184        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
185            spanCondition=USET_SPAN_SIMPLE;
186        } else {
187            if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
188                U_FAILURE(errorCode)
189            ) {
190                return FALSE;
191            }
192            spanCondition=USET_SPAN_NOT_CONTAINED;
193        }
194        prevSpanLimit=spanLimit;
195    }
196    return TRUE;
197}
198
199UNormalizationCheckResult
200FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
201    uprv_checkCanGetBuffer(s, errorCode);
202    if(U_FAILURE(errorCode)) {
203        return UNORM_MAYBE;
204    }
205    UNormalizationCheckResult result=UNORM_YES;
206    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
207    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
208        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
209        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
210            spanCondition=USET_SPAN_SIMPLE;
211        } else {
212            UNormalizationCheckResult qcResult=
213                norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
214            if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
215                return qcResult;
216            } else if(qcResult==UNORM_MAYBE) {
217                result=qcResult;
218            }
219            spanCondition=USET_SPAN_NOT_CONTAINED;
220        }
221        prevSpanLimit=spanLimit;
222    }
223    return result;
224}
225
226int32_t
227FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
228    uprv_checkCanGetBuffer(s, errorCode);
229    if(U_FAILURE(errorCode)) {
230        return 0;
231    }
232    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
233    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
234        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
235        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
236            spanCondition=USET_SPAN_SIMPLE;
237        } else {
238            int32_t yesLimit=
239                prevSpanLimit+
240                norm2.spanQuickCheckYes(
241                    s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
242            if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
243                return yesLimit;
244            }
245            spanCondition=USET_SPAN_NOT_CONTAINED;
246        }
247        prevSpanLimit=spanLimit;
248    }
249    return s.length();
250}
251
252UBool
253FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
254    return !set.contains(c) || norm2.hasBoundaryBefore(c);
255}
256
257UBool
258FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
259    return !set.contains(c) || norm2.hasBoundaryAfter(c);
260}
261
262UBool
263FilteredNormalizer2::isInert(UChar32 c) const {
264    return !set.contains(c) || norm2.isInert(c);
265}
266
267U_NAMESPACE_END
268
269// C API ------------------------------------------------------------------- ***
270
271U_NAMESPACE_USE
272
273U_CAPI UNormalizer2 * U_EXPORT2
274unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
275    if(U_FAILURE(*pErrorCode)) {
276        return NULL;
277    }
278    if(filterSet==NULL) {
279        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
280        return NULL;
281    }
282    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
283                                             *UnicodeSet::fromUSet(filterSet));
284    if(fn2==NULL) {
285        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
286    }
287    return (UNormalizer2 *)fn2;
288}
289
290#endif  // !UCONFIG_NO_NORMALIZATION
291