1/* 2******************************************************************************* 3* 4* Copyright (C) 1999-2009, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: unistr_cnv.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:2 12* 13* created on: 2004aug19 14* created by: Markus W. Scherer 15* 16* Character conversion functions moved here from unistr.cpp 17*/ 18 19#include "unicode/utypes.h" 20 21#if !UCONFIG_NO_CONVERSION 22 23#include "unicode/putil.h" 24#include "cstring.h" 25#include "cmemory.h" 26#include "unicode/ustring.h" 27#include "unicode/unistr.h" 28#include "unicode/ucnv.h" 29#include "ucnv_imp.h" 30#include "putilimp.h" 31#include "ustr_cnv.h" 32#include "ustr_imp.h" 33 34U_NAMESPACE_BEGIN 35 36//======================================== 37// Constructors 38//======================================== 39 40#if !U_CHARSET_IS_UTF8 41 42UnicodeString::UnicodeString(const char *codepageData) 43 : fShortLength(0), 44 fFlags(kShortString) 45{ 46 if(codepageData != 0) { 47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); 48 } 49} 50 51UnicodeString::UnicodeString(const char *codepageData, 52 int32_t dataLength) 53 : fShortLength(0), 54 fFlags(kShortString) 55{ 56 if(codepageData != 0) { 57 doCodepageCreate(codepageData, dataLength, 0); 58 } 59} 60 61// else see unistr.cpp 62#endif 63 64UnicodeString::UnicodeString(const char *codepageData, 65 const char *codepage) 66 : fShortLength(0), 67 fFlags(kShortString) 68{ 69 if(codepageData != 0) { 70 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); 71 } 72} 73 74UnicodeString::UnicodeString(const char *codepageData, 75 int32_t dataLength, 76 const char *codepage) 77 : fShortLength(0), 78 fFlags(kShortString) 79{ 80 if(codepageData != 0) { 81 doCodepageCreate(codepageData, dataLength, codepage); 82 } 83} 84 85UnicodeString::UnicodeString(const char *src, int32_t srcLength, 86 UConverter *cnv, 87 UErrorCode &errorCode) 88 : fShortLength(0), 89 fFlags(kShortString) 90{ 91 if(U_SUCCESS(errorCode)) { 92 // check arguments 93 if(src==NULL) { 94 // treat as an empty string, do nothing more 95 } else if(srcLength<-1) { 96 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 97 } else { 98 // get input length 99 if(srcLength==-1) { 100 srcLength=(int32_t)uprv_strlen(src); 101 } 102 if(srcLength>0) { 103 if(cnv!=0) { 104 // use the provided converter 105 ucnv_resetToUnicode(cnv); 106 doCodepageCreate(src, srcLength, cnv, errorCode); 107 } else { 108 // use the default converter 109 cnv=u_getDefaultConverter(&errorCode); 110 doCodepageCreate(src, srcLength, cnv, errorCode); 111 u_releaseDefaultConverter(cnv); 112 } 113 } 114 } 115 116 if(U_FAILURE(errorCode)) { 117 setToBogus(); 118 } 119 } 120} 121 122//======================================== 123// Codeset conversion 124//======================================== 125 126#if !U_CHARSET_IS_UTF8 127 128int32_t 129UnicodeString::extract(int32_t start, 130 int32_t length, 131 char *target, 132 uint32_t dstSize) const { 133 return extract(start, length, target, dstSize, 0); 134} 135 136// else see unistr.cpp 137#endif 138 139int32_t 140UnicodeString::extract(int32_t start, 141 int32_t length, 142 char *target, 143 uint32_t dstSize, 144 const char *codepage) const 145{ 146 // if the arguments are illegal, then do nothing 147 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 148 return 0; 149 } 150 151 // pin the indices to legal values 152 pinIndices(start, length); 153 154 // We need to cast dstSize to int32_t for all subsequent code. 155 // I don't know why the API was defined with uint32_t but we are stuck with it. 156 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize 157 // as a limit in some functions, it may wrap around and yield a pointer 158 // that compares less-than target. 159 int32_t capacity; 160 if(dstSize < 0x7fffffff) { 161 // Assume that the capacity is real and a limit pointer won't wrap around. 162 capacity = (int32_t)dstSize; 163 } else { 164 char *targetLimit = target + 0x7fffffff; 165 if(targetLimit < target) { 166 // Pin the capacity so that a limit pointer does not wrap around. 167 targetLimit = (char *)U_MAX_PTR(target); 168 capacity = (int32_t)(targetLimit - target); 169 } else { 170 // Pin the capacity to the maximum int32_t value. 171 capacity = 0x7fffffff; 172 } 173 } 174 175 // create the converter 176 UConverter *converter; 177 UErrorCode status = U_ZERO_ERROR; 178 179 // just write the NUL if the string length is 0 180 if(length == 0) { 181 return u_terminateChars(target, capacity, 0, &status); 182 } 183 184 // if the codepage is the default, use our cache 185 // if it is an empty string, then use the "invariant character" conversion 186 if (codepage == 0) { 187 const char *defaultName = ucnv_getDefaultName(); 188 if(UCNV_FAST_IS_UTF8(defaultName)) { 189 return toUTF8(start, length, target, capacity); 190 } 191 converter = u_getDefaultConverter(&status); 192 } else if (*codepage == 0) { 193 // use the "invariant characters" conversion 194 int32_t destLength; 195 if(length <= capacity) { 196 destLength = length; 197 } else { 198 destLength = capacity; 199 } 200 u_UCharsToChars(getArrayStart() + start, target, destLength); 201 return u_terminateChars(target, capacity, length, &status); 202 } else { 203 converter = ucnv_open(codepage, &status); 204 } 205 206 length = doExtract(start, length, target, capacity, converter, status); 207 208 // close the converter 209 if (codepage == 0) { 210 u_releaseDefaultConverter(converter); 211 } else { 212 ucnv_close(converter); 213 } 214 215 return length; 216} 217 218int32_t 219UnicodeString::extract(char *dest, int32_t destCapacity, 220 UConverter *cnv, 221 UErrorCode &errorCode) const 222{ 223 if(U_FAILURE(errorCode)) { 224 return 0; 225 } 226 227 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 228 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 229 return 0; 230 } 231 232 // nothing to do? 233 if(isEmpty()) { 234 return u_terminateChars(dest, destCapacity, 0, &errorCode); 235 } 236 237 // get the converter 238 UBool isDefaultConverter; 239 if(cnv==0) { 240 isDefaultConverter=TRUE; 241 cnv=u_getDefaultConverter(&errorCode); 242 if(U_FAILURE(errorCode)) { 243 return 0; 244 } 245 } else { 246 isDefaultConverter=FALSE; 247 ucnv_resetFromUnicode(cnv); 248 } 249 250 // convert 251 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); 252 253 // release the converter 254 if(isDefaultConverter) { 255 u_releaseDefaultConverter(cnv); 256 } 257 258 return len; 259} 260 261int32_t 262UnicodeString::doExtract(int32_t start, int32_t length, 263 char *dest, int32_t destCapacity, 264 UConverter *cnv, 265 UErrorCode &errorCode) const 266{ 267 if(U_FAILURE(errorCode)) { 268 if(destCapacity!=0) { 269 *dest=0; 270 } 271 return 0; 272 } 273 274 const UChar *src=getArrayStart()+start, *srcLimit=src+length; 275 char *originalDest=dest; 276 const char *destLimit; 277 278 if(destCapacity==0) { 279 destLimit=dest=0; 280 } else if(destCapacity==-1) { 281 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. 282 destLimit=(char*)U_MAX_PTR(dest); 283 // for NUL-termination, translate into highest int32_t 284 destCapacity=0x7fffffff; 285 } else { 286 destLimit=dest+destCapacity; 287 } 288 289 // perform the conversion 290 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 291 length=(int32_t)(dest-originalDest); 292 293 // if an overflow occurs, then get the preflighting length 294 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 295 char buffer[1024]; 296 297 destLimit=buffer+sizeof(buffer); 298 do { 299 dest=buffer; 300 errorCode=U_ZERO_ERROR; 301 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 302 length+=(int32_t)(dest-buffer); 303 } while(errorCode==U_BUFFER_OVERFLOW_ERROR); 304 } 305 306 return u_terminateChars(originalDest, destCapacity, length, &errorCode); 307} 308 309void 310UnicodeString::doCodepageCreate(const char *codepageData, 311 int32_t dataLength, 312 const char *codepage) 313{ 314 // if there's nothing to convert, do nothing 315 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 316 return; 317 } 318 if(dataLength == -1) { 319 dataLength = (int32_t)uprv_strlen(codepageData); 320 } 321 322 UErrorCode status = U_ZERO_ERROR; 323 324 // create the converter 325 // if the codepage is the default, use our cache 326 // if it is an empty string, then use the "invariant character" conversion 327 UConverter *converter; 328 if (codepage == 0) { 329 const char *defaultName = ucnv_getDefaultName(); 330 if(UCNV_FAST_IS_UTF8(defaultName)) { 331 setToUTF8(StringPiece(codepageData, dataLength)); 332 return; 333 } 334 converter = u_getDefaultConverter(&status); 335 } else if(*codepage == 0) { 336 // use the "invariant characters" conversion 337 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { 338 u_charsToUChars(codepageData, getArrayStart(), dataLength); 339 setLength(dataLength); 340 } else { 341 setToBogus(); 342 } 343 return; 344 } else { 345 converter = ucnv_open(codepage, &status); 346 } 347 348 // if we failed, set the appropriate flags and return 349 if(U_FAILURE(status)) { 350 setToBogus(); 351 return; 352 } 353 354 // perform the conversion 355 doCodepageCreate(codepageData, dataLength, converter, status); 356 if(U_FAILURE(status)) { 357 setToBogus(); 358 } 359 360 // close the converter 361 if(codepage == 0) { 362 u_releaseDefaultConverter(converter); 363 } else { 364 ucnv_close(converter); 365 } 366} 367 368void 369UnicodeString::doCodepageCreate(const char *codepageData, 370 int32_t dataLength, 371 UConverter *converter, 372 UErrorCode &status) 373{ 374 if(U_FAILURE(status)) { 375 return; 376 } 377 378 // set up the conversion parameters 379 const char *mySource = codepageData; 380 const char *mySourceEnd = mySource + dataLength; 381 UChar *array, *myTarget; 382 383 // estimate the size needed: 384 int32_t arraySize; 385 if(dataLength <= US_STACKBUF_SIZE) { 386 // try to use the stack buffer 387 arraySize = US_STACKBUF_SIZE; 388 } else { 389 // 1.25 UChar's per source byte should cover most cases 390 arraySize = dataLength + (dataLength >> 2); 391 } 392 393 // we do not care about the current contents 394 UBool doCopyArray = FALSE; 395 for(;;) { 396 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { 397 setToBogus(); 398 break; 399 } 400 401 // perform the conversion 402 array = getArrayStart(); 403 myTarget = array + length(); 404 ucnv_toUnicode(converter, &myTarget, array + getCapacity(), 405 &mySource, mySourceEnd, 0, TRUE, &status); 406 407 // update the conversion parameters 408 setLength((int32_t)(myTarget - array)); 409 410 // allocate more space and copy data, if needed 411 if(status == U_BUFFER_OVERFLOW_ERROR) { 412 // reset the error code 413 status = U_ZERO_ERROR; 414 415 // keep the previous conversion results 416 doCopyArray = TRUE; 417 418 // estimate the new size needed, larger than before 419 // try 2 UChar's per remaining source byte 420 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); 421 } else { 422 break; 423 } 424 } 425} 426 427U_NAMESPACE_END 428 429#endif 430