1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  filterednormalizer2.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009dec10
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/normalizer2.h"
22#include "unicode/uniset.h"
23#include "unicode/unistr.h"
24#include "unicode/unorm.h"
25#include "cpputils.h"
26
27U_NAMESPACE_BEGIN
28
29FilteredNormalizer2::~FilteredNormalizer2() {}
30
31UnicodeString &
32FilteredNormalizer2::normalize(const UnicodeString &src,
33                               UnicodeString &dest,
34                               UErrorCode &errorCode) const {
35    uprv_checkCanGetBuffer(src, errorCode);
36    if(U_FAILURE(errorCode)) {
37        dest.setToBogus();
38        return dest;
39    }
40    if(&dest==&src) {
41        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
42        return dest;
43    }
44    dest.remove();
45    return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
46}
47
48// Internal: No argument checking, and appends to dest.
49// Pass as input spanCondition the one that is likely to yield a non-zero
50// span length at the start of src.
51// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
52// USET_SPAN_SIMPLE should be passed in for the start of src
53// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
54// an in-filter prefix.
55UnicodeString &
56FilteredNormalizer2::normalize(const UnicodeString &src,
57                               UnicodeString &dest,
58                               USetSpanCondition spanCondition,
59                               UErrorCode &errorCode) const {
60    UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
61    for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
62        int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
63        int32_t spanLength=spanLimit-prevSpanLimit;
64        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
65            if(spanLength!=0) {
66                dest.append(src, prevSpanLimit, spanLength);
67            }
68            spanCondition=USET_SPAN_SIMPLE;
69        } else {
70            if(spanLength!=0) {
71                // Not norm2.normalizeSecondAndAppend() because we do not want
72                // to modify the non-filter part of dest.
73                dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
74                                            tempDest, errorCode));
75                if(U_FAILURE(errorCode)) {
76                    break;
77                }
78            }
79            spanCondition=USET_SPAN_NOT_CONTAINED;
80        }
81        prevSpanLimit=spanLimit;
82    }
83    return dest;
84}
85
86UnicodeString &
87FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
88                                              const UnicodeString &second,
89                                              UErrorCode &errorCode) const {
90    return normalizeSecondAndAppend(first, second, TRUE, errorCode);
91}
92
93UnicodeString &
94FilteredNormalizer2::append(UnicodeString &first,
95                            const UnicodeString &second,
96                            UErrorCode &errorCode) const {
97    return normalizeSecondAndAppend(first, second, FALSE, errorCode);
98}
99
100UnicodeString &
101FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
102                                              const UnicodeString &second,
103                                              UBool doNormalize,
104                                              UErrorCode &errorCode) const {
105    uprv_checkCanGetBuffer(first, errorCode);
106    uprv_checkCanGetBuffer(second, errorCode);
107    if(U_FAILURE(errorCode)) {
108        return first;
109    }
110    if(&first==&second) {
111        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
112        return first;
113    }
114    if(first.isEmpty()) {
115        if(doNormalize) {
116            return normalize(second, first, errorCode);
117        } else {
118            return first=second;
119        }
120    }
121    // merge the in-filter suffix of the first string with the in-filter prefix of the second
122    int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
123    if(prefixLimit!=0) {
124        UnicodeString prefix(second.tempSubString(0, prefixLimit));
125        int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
126        if(suffixStart==0) {
127            if(doNormalize) {
128                norm2.normalizeSecondAndAppend(first, prefix, errorCode);
129            } else {
130                norm2.append(first, prefix, errorCode);
131            }
132        } else {
133            UnicodeString middle(first, suffixStart, INT32_MAX);
134            if(doNormalize) {
135                norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
136            } else {
137                norm2.append(middle, prefix, errorCode);
138            }
139            first.replace(suffixStart, INT32_MAX, middle);
140        }
141    }
142    if(prefixLimit<second.length()) {
143        UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
144        if(doNormalize) {
145            normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
146        } else {
147            first.append(rest);
148        }
149    }
150    return first;
151}
152
153UBool
154FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
155    return set.contains(c) && norm2.getDecomposition(c, decomposition);
156}
157
158UBool
159FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
160    return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
161}
162
163UChar32
164FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
165    return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
166}
167
168uint8_t
169FilteredNormalizer2::getCombiningClass(UChar32 c) const {
170    return set.contains(c) ? norm2.getCombiningClass(c) : 0;
171}
172
173UBool
174FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
175    uprv_checkCanGetBuffer(s, errorCode);
176    if(U_FAILURE(errorCode)) {
177        return FALSE;
178    }
179    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
180    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
181        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
182        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
183            spanCondition=USET_SPAN_SIMPLE;
184        } else {
185            if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
186                U_FAILURE(errorCode)
187            ) {
188                return FALSE;
189            }
190            spanCondition=USET_SPAN_NOT_CONTAINED;
191        }
192        prevSpanLimit=spanLimit;
193    }
194    return TRUE;
195}
196
197UNormalizationCheckResult
198FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
199    uprv_checkCanGetBuffer(s, errorCode);
200    if(U_FAILURE(errorCode)) {
201        return UNORM_MAYBE;
202    }
203    UNormalizationCheckResult result=UNORM_YES;
204    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
205    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
206        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
207        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
208            spanCondition=USET_SPAN_SIMPLE;
209        } else {
210            UNormalizationCheckResult qcResult=
211                norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
212            if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
213                return qcResult;
214            } else if(qcResult==UNORM_MAYBE) {
215                result=qcResult;
216            }
217            spanCondition=USET_SPAN_NOT_CONTAINED;
218        }
219        prevSpanLimit=spanLimit;
220    }
221    return result;
222}
223
224int32_t
225FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
226    uprv_checkCanGetBuffer(s, errorCode);
227    if(U_FAILURE(errorCode)) {
228        return 0;
229    }
230    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
231    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
232        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
233        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
234            spanCondition=USET_SPAN_SIMPLE;
235        } else {
236            int32_t yesLimit=
237                prevSpanLimit+
238                norm2.spanQuickCheckYes(
239                    s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
240            if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
241                return yesLimit;
242            }
243            spanCondition=USET_SPAN_NOT_CONTAINED;
244        }
245        prevSpanLimit=spanLimit;
246    }
247    return s.length();
248}
249
250UBool
251FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
252    return !set.contains(c) || norm2.hasBoundaryBefore(c);
253}
254
255UBool
256FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
257    return !set.contains(c) || norm2.hasBoundaryAfter(c);
258}
259
260UBool
261FilteredNormalizer2::isInert(UChar32 c) const {
262    return !set.contains(c) || norm2.isInert(c);
263}
264
265U_NAMESPACE_END
266
267// C API ------------------------------------------------------------------- ***
268
269U_NAMESPACE_USE
270
271U_CAPI UNormalizer2 * U_EXPORT2
272unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
273    if(U_FAILURE(*pErrorCode)) {
274        return NULL;
275    }
276    if(filterSet==NULL) {
277        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
278        return NULL;
279    }
280    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
281                                             *UnicodeSet::fromUSet(filterSet));
282    if(fn2==NULL) {
283        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
284    }
285    return (UNormalizer2 *)fn2;
286}
287
288#endif  // !UCONFIG_NO_NORMALIZATION
289