1/* 2****************************************************************************** 3* Copyright (c) 1996-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5****************************************************************************** 6* File unorm.cpp 7* 8* Created by: Vladimir Weinstein 12052000 9* 10* Modification history : 11* 12* Date Name Description 13* 02/01/01 synwee Added normalization quickcheck enum and method. 14* 02/12/01 synwee Commented out quickcheck util api has been approved 15* Added private method for doing FCD checks 16* 02/23/01 synwee Modified quickcheck and checkFCE to run through 17* string for codepoints < 0x300 for the normalization 18* mode NFC. 19* 05/25/01+ Markus Scherer total rewrite, implement all normalization here 20* instead of just wrappers around normlzr.cpp, 21* load unorm.dat, support Unicode 3.1 with 22* supplementary code points, etc. 23* 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code 24*/ 25 26#include "unicode/utypes.h" 27 28#if !UCONFIG_NO_NORMALIZATION 29 30#include "unicode/udata.h" 31#include "unicode/ustring.h" 32#include "unicode/uiter.h" 33#include "unicode/unorm.h" 34#include "unicode/unorm2.h" 35#include "normalizer2impl.h" 36#include "unormimp.h" 37#include "uprops.h" 38#include "ustr_imp.h" 39 40#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 41 42U_NAMESPACE_USE 43 44/* quick check functions ---------------------------------------------------- */ 45 46U_CAPI UNormalizationCheckResult U_EXPORT2 47unorm_quickCheck(const UChar *src, 48 int32_t srcLength, 49 UNormalizationMode mode, 50 UErrorCode *pErrorCode) { 51 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 52 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 53} 54 55U_CAPI UNormalizationCheckResult U_EXPORT2 56unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, 57 UNormalizationMode mode, int32_t options, 58 UErrorCode *pErrorCode) { 59 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 60 if(options&UNORM_UNICODE_3_2) { 61 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 62 return unorm2_quickCheck( 63 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 64 src, srcLength, pErrorCode); 65 } else { 66 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 67 } 68} 69 70U_CAPI UBool U_EXPORT2 71unorm_isNormalized(const UChar *src, int32_t srcLength, 72 UNormalizationMode mode, 73 UErrorCode *pErrorCode) { 74 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 75 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 76} 77 78U_CAPI UBool U_EXPORT2 79unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, 80 UNormalizationMode mode, int32_t options, 81 UErrorCode *pErrorCode) { 82 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 83 if(options&UNORM_UNICODE_3_2) { 84 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 85 return unorm2_isNormalized( 86 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 87 src, srcLength, pErrorCode); 88 } else { 89 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 90 } 91} 92 93/* normalize() API ---------------------------------------------------------- */ 94 95/** Public API for normalizing. */ 96U_CAPI int32_t U_EXPORT2 97unorm_normalize(const UChar *src, int32_t srcLength, 98 UNormalizationMode mode, int32_t options, 99 UChar *dest, int32_t destCapacity, 100 UErrorCode *pErrorCode) { 101 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 102 if(options&UNORM_UNICODE_3_2) { 103 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 104 return unorm2_normalize( 105 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 106 src, srcLength, dest, destCapacity, pErrorCode); 107 } else { 108 return unorm2_normalize((const UNormalizer2 *)n2, 109 src, srcLength, dest, destCapacity, pErrorCode); 110 } 111} 112 113 114/* iteration functions ------------------------------------------------------ */ 115 116static int32_t 117unorm_iterate(UCharIterator *src, UBool forward, 118 UChar *dest, int32_t destCapacity, 119 UNormalizationMode mode, int32_t options, 120 UBool doNormalize, UBool *pNeededToNormalize, 121 UErrorCode *pErrorCode) { 122 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 123 const UnicodeSet *uni32; 124 if(options&UNORM_UNICODE_3_2) { 125 uni32=uniset_getUnicode32Instance(*pErrorCode); 126 } else { 127 uni32=NULL; // unused 128 } 129 130 if(U_FAILURE(*pErrorCode)) { 131 return 0; 132 } 133 134 FilteredNormalizer2 fn2(*n2, *uni32); 135 if(options&UNORM_UNICODE_3_2) { 136 n2=&fn2; 137 } 138 139 if( destCapacity<0 || (dest==NULL && destCapacity>0) || 140 src==NULL 141 ) { 142 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 143 return 0; 144 } 145 146 if(pNeededToNormalize!=NULL) { 147 *pNeededToNormalize=FALSE; 148 } 149 if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { 150 return u_terminateUChars(dest, destCapacity, 0, pErrorCode); 151 } 152 153 UnicodeString buffer; 154 UChar32 c; 155 if(forward) { 156 /* get one character and ignore its properties */ 157 buffer.append(uiter_next32(src)); 158 /* get all following characters until we see a boundary */ 159 while((c=uiter_next32(src))>=0) { 160 if(n2->hasBoundaryBefore(c)) { 161 /* back out the latest movement to stop at the boundary */ 162 src->move(src, -U16_LENGTH(c), UITER_CURRENT); 163 break; 164 } else { 165 buffer.append(c); 166 } 167 } 168 } else { 169 while((c=uiter_previous32(src))>=0) { 170 /* always write this character to the front of the buffer */ 171 buffer.insert(0, c); 172 /* stop if this just-copied character is a boundary */ 173 if(n2->hasBoundaryBefore(c)) { 174 break; 175 } 176 } 177 } 178 179 UnicodeString destString(dest, 0, destCapacity); 180 if(buffer.length()>0 && doNormalize) { 181 n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); 182 if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { 183 *pNeededToNormalize= destString!=buffer; 184 } 185 return destString.length(); 186 } else { 187 /* just copy the source characters */ 188 return buffer.extract(dest, destCapacity, *pErrorCode); 189 } 190} 191 192U_CAPI int32_t U_EXPORT2 193unorm_previous(UCharIterator *src, 194 UChar *dest, int32_t destCapacity, 195 UNormalizationMode mode, int32_t options, 196 UBool doNormalize, UBool *pNeededToNormalize, 197 UErrorCode *pErrorCode) { 198 return unorm_iterate(src, FALSE, 199 dest, destCapacity, 200 mode, options, 201 doNormalize, pNeededToNormalize, 202 pErrorCode); 203} 204 205U_CAPI int32_t U_EXPORT2 206unorm_next(UCharIterator *src, 207 UChar *dest, int32_t destCapacity, 208 UNormalizationMode mode, int32_t options, 209 UBool doNormalize, UBool *pNeededToNormalize, 210 UErrorCode *pErrorCode) { 211 return unorm_iterate(src, TRUE, 212 dest, destCapacity, 213 mode, options, 214 doNormalize, pNeededToNormalize, 215 pErrorCode); 216} 217 218/* Concatenation of normalized strings -------------------------------------- */ 219 220U_CAPI int32_t U_EXPORT2 221unorm_concatenate(const UChar *left, int32_t leftLength, 222 const UChar *right, int32_t rightLength, 223 UChar *dest, int32_t destCapacity, 224 UNormalizationMode mode, int32_t options, 225 UErrorCode *pErrorCode) { 226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 227 const UnicodeSet *uni32; 228 if(options&UNORM_UNICODE_3_2) { 229 uni32=uniset_getUnicode32Instance(*pErrorCode); 230 } else { 231 uni32=NULL; // unused 232 } 233 234 if(U_FAILURE(*pErrorCode)) { 235 return 0; 236 } 237 238 FilteredNormalizer2 fn2(*n2, *uni32); 239 if(options&UNORM_UNICODE_3_2) { 240 n2=&fn2; 241 } 242 243 if( destCapacity<0 || (dest==NULL && destCapacity>0) || 244 left==NULL || leftLength<-1 || 245 right==NULL || rightLength<-1 246 ) { 247 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 248 return 0; 249 } 250 251 /* check for overlapping right and destination */ 252 if( dest!=NULL && 253 ((right>=dest && right<(dest+destCapacity)) || 254 (rightLength>0 && dest>=right && dest<(right+rightLength))) 255 ) { 256 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 257 return 0; 258 } 259 260 /* allow left==dest */ 261 UnicodeString destString; 262 if(left==dest) { 263 destString.setTo(dest, leftLength, destCapacity); 264 } else { 265 destString.setTo(dest, 0, destCapacity); 266 destString.append(left, leftLength); 267 } 268 return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). 269 extract(dest, destCapacity, *pErrorCode); 270} 271 272#endif /* #if !UCONFIG_NO_NORMALIZATION */ 273