1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  filterednormalizer2.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009dec10
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/normalizer2.h"
22#include "unicode/uniset.h"
23#include "unicode/unistr.h"
24#include "unicode/unorm.h"
25#include "cpputils.h"
26
27U_NAMESPACE_BEGIN
28
29UnicodeString &
30FilteredNormalizer2::normalize(const UnicodeString &src,
31                               UnicodeString &dest,
32                               UErrorCode &errorCode) const {
33    uprv_checkCanGetBuffer(src, errorCode);
34    if(U_FAILURE(errorCode)) {
35        dest.setToBogus();
36        return dest;
37    }
38    if(&dest==&src) {
39        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
40        return dest;
41    }
42    dest.remove();
43    return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
44}
45
46// Internal: No argument checking, and appends to dest.
47// Pass as input spanCondition the one that is likely to yield a non-zero
48// span length at the start of src.
49// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
50// USET_SPAN_SIMPLE should be passed in for the start of src
51// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
52// an in-filter prefix.
53UnicodeString &
54FilteredNormalizer2::normalize(const UnicodeString &src,
55                               UnicodeString &dest,
56                               USetSpanCondition spanCondition,
57                               UErrorCode &errorCode) const {
58    UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
59    for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
60        int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
61        int32_t spanLength=spanLimit-prevSpanLimit;
62        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
63            if(spanLength!=0) {
64                dest.append(src, prevSpanLimit, spanLength);
65            }
66            spanCondition=USET_SPAN_SIMPLE;
67        } else {
68            if(spanLength!=0) {
69                // Not norm2.normalizeSecondAndAppend() because we do not want
70                // to modify the non-filter part of dest.
71                dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
72                                            tempDest, errorCode));
73                if(U_FAILURE(errorCode)) {
74                    break;
75                }
76            }
77            spanCondition=USET_SPAN_NOT_CONTAINED;
78        }
79        prevSpanLimit=spanLimit;
80    }
81    return dest;
82}
83
84UnicodeString &
85FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
86                                              const UnicodeString &second,
87                                              UErrorCode &errorCode) const {
88    return normalizeSecondAndAppend(first, second, TRUE, errorCode);
89}
90
91UnicodeString &
92FilteredNormalizer2::append(UnicodeString &first,
93                            const UnicodeString &second,
94                            UErrorCode &errorCode) const {
95    return normalizeSecondAndAppend(first, second, FALSE, errorCode);
96}
97
98UnicodeString &
99FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
100                                              const UnicodeString &second,
101                                              UBool doNormalize,
102                                              UErrorCode &errorCode) const {
103    uprv_checkCanGetBuffer(first, errorCode);
104    uprv_checkCanGetBuffer(second, errorCode);
105    if(U_FAILURE(errorCode)) {
106        return first;
107    }
108    if(&first==&second) {
109        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
110        return first;
111    }
112    if(first.isEmpty()) {
113        if(doNormalize) {
114            return normalize(second, first, errorCode);
115        } else {
116            return first=second;
117        }
118    }
119    // merge the in-filter suffix of the first string with the in-filter prefix of the second
120    int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
121    if(prefixLimit!=0) {
122        UnicodeString prefix(second.tempSubString(0, prefixLimit));
123        int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
124        if(suffixStart==0) {
125            if(doNormalize) {
126                norm2.normalizeSecondAndAppend(first, prefix, errorCode);
127            } else {
128                norm2.append(first, prefix, errorCode);
129            }
130        } else {
131            UnicodeString middle(first, suffixStart, INT32_MAX);
132            if(doNormalize) {
133                norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
134            } else {
135                norm2.append(middle, prefix, errorCode);
136            }
137            first.replace(suffixStart, INT32_MAX, middle);
138        }
139    }
140    if(prefixLimit<second.length()) {
141        UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
142        if(doNormalize) {
143            normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
144        } else {
145            first.append(rest);
146        }
147    }
148    return first;
149}
150
151UBool
152FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
153    return set.contains(c) && norm2.getDecomposition(c, decomposition);
154}
155
156UBool
157FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
158    uprv_checkCanGetBuffer(s, errorCode);
159    if(U_FAILURE(errorCode)) {
160        return FALSE;
161    }
162    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
163    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
164        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
165        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
166            spanCondition=USET_SPAN_SIMPLE;
167        } else {
168            if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
169                U_FAILURE(errorCode)
170            ) {
171                return FALSE;
172            }
173            spanCondition=USET_SPAN_NOT_CONTAINED;
174        }
175        prevSpanLimit=spanLimit;
176    }
177    return TRUE;
178}
179
180UNormalizationCheckResult
181FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
182    uprv_checkCanGetBuffer(s, errorCode);
183    if(U_FAILURE(errorCode)) {
184        return UNORM_MAYBE;
185    }
186    UNormalizationCheckResult result=UNORM_YES;
187    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
188    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
189        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
190        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
191            spanCondition=USET_SPAN_SIMPLE;
192        } else {
193            UNormalizationCheckResult qcResult=
194                norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
195            if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
196                return qcResult;
197            } else if(qcResult==UNORM_MAYBE) {
198                result=qcResult;
199            }
200            spanCondition=USET_SPAN_NOT_CONTAINED;
201        }
202        prevSpanLimit=spanLimit;
203    }
204    return result;
205}
206
207int32_t
208FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
209    uprv_checkCanGetBuffer(s, errorCode);
210    if(U_FAILURE(errorCode)) {
211        return 0;
212    }
213    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
214    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
215        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
216        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
217            spanCondition=USET_SPAN_SIMPLE;
218        } else {
219            int32_t yesLimit=
220                prevSpanLimit+
221                norm2.spanQuickCheckYes(
222                    s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
223            if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
224                return yesLimit;
225            }
226            spanCondition=USET_SPAN_NOT_CONTAINED;
227        }
228        prevSpanLimit=spanLimit;
229    }
230    return s.length();
231}
232
233UBool
234FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
235    return !set.contains(c) || norm2.hasBoundaryBefore(c);
236}
237
238UBool
239FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
240    return !set.contains(c) || norm2.hasBoundaryAfter(c);
241}
242
243UBool
244FilteredNormalizer2::isInert(UChar32 c) const {
245    return !set.contains(c) || norm2.isInert(c);
246}
247
248U_NAMESPACE_END
249
250// C API ------------------------------------------------------------------- ***
251
252U_NAMESPACE_USE
253
254U_DRAFT UNormalizer2 * U_EXPORT2
255unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
256    if(U_FAILURE(*pErrorCode)) {
257        return NULL;
258    }
259    if(filterSet==NULL) {
260        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
261        return NULL;
262    }
263    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
264                                             *UnicodeSet::fromUSet(filterSet));
265    if(fn2==NULL) {
266        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
267    }
268    return (UNormalizer2 *)fn2;
269}
270
271#endif  // !UCONFIG_NO_NORMALIZATION
272