1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* 6* Copyright (C) 2009-2012, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************************* 10* file name: filterednormalizer2.cpp 11* encoding: US-ASCII 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2009dec10 16* created by: Markus W. Scherer 17*/ 18 19#include "unicode/utypes.h" 20 21#if !UCONFIG_NO_NORMALIZATION 22 23#include "unicode/normalizer2.h" 24#include "unicode/uniset.h" 25#include "unicode/unistr.h" 26#include "unicode/unorm.h" 27#include "cpputils.h" 28 29U_NAMESPACE_BEGIN 30 31FilteredNormalizer2::~FilteredNormalizer2() {} 32 33UnicodeString & 34FilteredNormalizer2::normalize(const UnicodeString &src, 35 UnicodeString &dest, 36 UErrorCode &errorCode) const { 37 uprv_checkCanGetBuffer(src, errorCode); 38 if(U_FAILURE(errorCode)) { 39 dest.setToBogus(); 40 return dest; 41 } 42 if(&dest==&src) { 43 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 44 return dest; 45 } 46 dest.remove(); 47 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); 48} 49 50// Internal: No argument checking, and appends to dest. 51// Pass as input spanCondition the one that is likely to yield a non-zero 52// span length at the start of src. 53// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, 54// USET_SPAN_SIMPLE should be passed in for the start of src 55// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after 56// an in-filter prefix. 57UnicodeString & 58FilteredNormalizer2::normalize(const UnicodeString &src, 59 UnicodeString &dest, 60 USetSpanCondition spanCondition, 61 UErrorCode &errorCode) const { 62 UnicodeString tempDest; // Don't throw away destination buffer between iterations. 63 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { 64 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); 65 int32_t spanLength=spanLimit-prevSpanLimit; 66 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 67 if(spanLength!=0) { 68 dest.append(src, prevSpanLimit, spanLength); 69 } 70 spanCondition=USET_SPAN_SIMPLE; 71 } else { 72 if(spanLength!=0) { 73 // Not norm2.normalizeSecondAndAppend() because we do not want 74 // to modify the non-filter part of dest. 75 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), 76 tempDest, errorCode)); 77 if(U_FAILURE(errorCode)) { 78 break; 79 } 80 } 81 spanCondition=USET_SPAN_NOT_CONTAINED; 82 } 83 prevSpanLimit=spanLimit; 84 } 85 return dest; 86} 87 88UnicodeString & 89FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 90 const UnicodeString &second, 91 UErrorCode &errorCode) const { 92 return normalizeSecondAndAppend(first, second, TRUE, errorCode); 93} 94 95UnicodeString & 96FilteredNormalizer2::append(UnicodeString &first, 97 const UnicodeString &second, 98 UErrorCode &errorCode) const { 99 return normalizeSecondAndAppend(first, second, FALSE, errorCode); 100} 101 102UnicodeString & 103FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, 104 const UnicodeString &second, 105 UBool doNormalize, 106 UErrorCode &errorCode) const { 107 uprv_checkCanGetBuffer(first, errorCode); 108 uprv_checkCanGetBuffer(second, errorCode); 109 if(U_FAILURE(errorCode)) { 110 return first; 111 } 112 if(&first==&second) { 113 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 114 return first; 115 } 116 if(first.isEmpty()) { 117 if(doNormalize) { 118 return normalize(second, first, errorCode); 119 } else { 120 return first=second; 121 } 122 } 123 // merge the in-filter suffix of the first string with the in-filter prefix of the second 124 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); 125 if(prefixLimit!=0) { 126 UnicodeString prefix(second.tempSubString(0, prefixLimit)); 127 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); 128 if(suffixStart==0) { 129 if(doNormalize) { 130 norm2.normalizeSecondAndAppend(first, prefix, errorCode); 131 } else { 132 norm2.append(first, prefix, errorCode); 133 } 134 } else { 135 UnicodeString middle(first, suffixStart, INT32_MAX); 136 if(doNormalize) { 137 norm2.normalizeSecondAndAppend(middle, prefix, errorCode); 138 } else { 139 norm2.append(middle, prefix, errorCode); 140 } 141 first.replace(suffixStart, INT32_MAX, middle); 142 } 143 } 144 if(prefixLimit<second.length()) { 145 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); 146 if(doNormalize) { 147 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); 148 } else { 149 first.append(rest); 150 } 151 } 152 return first; 153} 154 155UBool 156FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { 157 return set.contains(c) && norm2.getDecomposition(c, decomposition); 158} 159 160UBool 161FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { 162 return set.contains(c) && norm2.getRawDecomposition(c, decomposition); 163} 164 165UChar32 166FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const { 167 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL; 168} 169 170uint8_t 171FilteredNormalizer2::getCombiningClass(UChar32 c) const { 172 return set.contains(c) ? norm2.getCombiningClass(c) : 0; 173} 174 175UBool 176FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { 177 uprv_checkCanGetBuffer(s, errorCode); 178 if(U_FAILURE(errorCode)) { 179 return FALSE; 180 } 181 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 182 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 183 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 184 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 185 spanCondition=USET_SPAN_SIMPLE; 186 } else { 187 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || 188 U_FAILURE(errorCode) 189 ) { 190 return FALSE; 191 } 192 spanCondition=USET_SPAN_NOT_CONTAINED; 193 } 194 prevSpanLimit=spanLimit; 195 } 196 return TRUE; 197} 198 199UNormalizationCheckResult 200FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { 201 uprv_checkCanGetBuffer(s, errorCode); 202 if(U_FAILURE(errorCode)) { 203 return UNORM_MAYBE; 204 } 205 UNormalizationCheckResult result=UNORM_YES; 206 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 207 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 208 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 209 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 210 spanCondition=USET_SPAN_SIMPLE; 211 } else { 212 UNormalizationCheckResult qcResult= 213 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 214 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { 215 return qcResult; 216 } else if(qcResult==UNORM_MAYBE) { 217 result=qcResult; 218 } 219 spanCondition=USET_SPAN_NOT_CONTAINED; 220 } 221 prevSpanLimit=spanLimit; 222 } 223 return result; 224} 225 226int32_t 227FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { 228 uprv_checkCanGetBuffer(s, errorCode); 229 if(U_FAILURE(errorCode)) { 230 return 0; 231 } 232 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; 233 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { 234 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); 235 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 236 spanCondition=USET_SPAN_SIMPLE; 237 } else { 238 int32_t yesLimit= 239 prevSpanLimit+ 240 norm2.spanQuickCheckYes( 241 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); 242 if(U_FAILURE(errorCode) || yesLimit<spanLimit) { 243 return yesLimit; 244 } 245 spanCondition=USET_SPAN_NOT_CONTAINED; 246 } 247 prevSpanLimit=spanLimit; 248 } 249 return s.length(); 250} 251 252UBool 253FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { 254 return !set.contains(c) || norm2.hasBoundaryBefore(c); 255} 256 257UBool 258FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { 259 return !set.contains(c) || norm2.hasBoundaryAfter(c); 260} 261 262UBool 263FilteredNormalizer2::isInert(UChar32 c) const { 264 return !set.contains(c) || norm2.isInert(c); 265} 266 267U_NAMESPACE_END 268 269// C API ------------------------------------------------------------------- *** 270 271U_NAMESPACE_USE 272 273U_CAPI UNormalizer2 * U_EXPORT2 274unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { 275 if(U_FAILURE(*pErrorCode)) { 276 return NULL; 277 } 278 if(filterSet==NULL) { 279 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 280 return NULL; 281 } 282 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, 283 *UnicodeSet::fromUSet(filterSet)); 284 if(fn2==NULL) { 285 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 286 } 287 return (UNormalizer2 *)fn2; 288} 289 290#endif // !UCONFIG_NO_NORMALIZATION 291