1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* 6* Copyright (C) 1999-2014, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************************* 10* file name: unistr_cnv.cpp 11* encoding: UTF-8 12* tab size: 8 (not used) 13* indentation:2 14* 15* created on: 2004aug19 16* created by: Markus W. Scherer 17* 18* Character conversion functions moved here from unistr.cpp 19*/ 20 21#include "unicode/utypes.h" 22 23#if !UCONFIG_NO_CONVERSION 24 25#include "unicode/putil.h" 26#include "cstring.h" 27#include "cmemory.h" 28#include "unicode/ustring.h" 29#include "unicode/unistr.h" 30#include "unicode/ucnv.h" 31#include "ucnv_imp.h" 32#include "putilimp.h" 33#include "ustr_cnv.h" 34#include "ustr_imp.h" 35 36U_NAMESPACE_BEGIN 37 38//======================================== 39// Constructors 40//======================================== 41 42#if !U_CHARSET_IS_UTF8 43 44UnicodeString::UnicodeString(const char *codepageData) { 45 fUnion.fFields.fLengthAndFlags = kShortString; 46 if(codepageData != 0) { 47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); 48 } 49} 50 51UnicodeString::UnicodeString(const char *codepageData, 52 int32_t dataLength) { 53 fUnion.fFields.fLengthAndFlags = kShortString; 54 if(codepageData != 0) { 55 doCodepageCreate(codepageData, dataLength, 0); 56 } 57} 58 59// else see unistr.cpp 60#endif 61 62UnicodeString::UnicodeString(const char *codepageData, 63 const char *codepage) { 64 fUnion.fFields.fLengthAndFlags = kShortString; 65 if(codepageData != 0) { 66 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); 67 } 68} 69 70UnicodeString::UnicodeString(const char *codepageData, 71 int32_t dataLength, 72 const char *codepage) { 73 fUnion.fFields.fLengthAndFlags = kShortString; 74 if(codepageData != 0) { 75 doCodepageCreate(codepageData, dataLength, codepage); 76 } 77} 78 79UnicodeString::UnicodeString(const char *src, int32_t srcLength, 80 UConverter *cnv, 81 UErrorCode &errorCode) { 82 fUnion.fFields.fLengthAndFlags = kShortString; 83 if(U_SUCCESS(errorCode)) { 84 // check arguments 85 if(src==NULL) { 86 // treat as an empty string, do nothing more 87 } else if(srcLength<-1) { 88 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 89 } else { 90 // get input length 91 if(srcLength==-1) { 92 srcLength=(int32_t)uprv_strlen(src); 93 } 94 if(srcLength>0) { 95 if(cnv!=0) { 96 // use the provided converter 97 ucnv_resetToUnicode(cnv); 98 doCodepageCreate(src, srcLength, cnv, errorCode); 99 } else { 100 // use the default converter 101 cnv=u_getDefaultConverter(&errorCode); 102 doCodepageCreate(src, srcLength, cnv, errorCode); 103 u_releaseDefaultConverter(cnv); 104 } 105 } 106 } 107 108 if(U_FAILURE(errorCode)) { 109 setToBogus(); 110 } 111 } 112} 113 114//======================================== 115// Codeset conversion 116//======================================== 117 118#if !U_CHARSET_IS_UTF8 119 120int32_t 121UnicodeString::extract(int32_t start, 122 int32_t length, 123 char *target, 124 uint32_t dstSize) const { 125 return extract(start, length, target, dstSize, 0); 126} 127 128// else see unistr.cpp 129#endif 130 131int32_t 132UnicodeString::extract(int32_t start, 133 int32_t length, 134 char *target, 135 uint32_t dstSize, 136 const char *codepage) const 137{ 138 // if the arguments are illegal, then do nothing 139 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 140 return 0; 141 } 142 143 // pin the indices to legal values 144 pinIndices(start, length); 145 146 // We need to cast dstSize to int32_t for all subsequent code. 147 // I don't know why the API was defined with uint32_t but we are stuck with it. 148 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize 149 // as a limit in some functions, it may wrap around and yield a pointer 150 // that compares less-than target. 151 int32_t capacity; 152 if(dstSize < 0x7fffffff) { 153 // Assume that the capacity is real and a limit pointer won't wrap around. 154 capacity = (int32_t)dstSize; 155 } else { 156 // Pin the capacity so that a limit pointer does not wrap around. 157 char *targetLimit = (char *)U_MAX_PTR(target); 158 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff 159 // greater than target and does not wrap around the top of the address space. 160 capacity = (int32_t)(targetLimit - target); 161 } 162 163 // create the converter 164 UConverter *converter; 165 UErrorCode status = U_ZERO_ERROR; 166 167 // just write the NUL if the string length is 0 168 if(length == 0) { 169 return u_terminateChars(target, capacity, 0, &status); 170 } 171 172 // if the codepage is the default, use our cache 173 // if it is an empty string, then use the "invariant character" conversion 174 if (codepage == 0) { 175 const char *defaultName = ucnv_getDefaultName(); 176 if(UCNV_FAST_IS_UTF8(defaultName)) { 177 return toUTF8(start, length, target, capacity); 178 } 179 converter = u_getDefaultConverter(&status); 180 } else if (*codepage == 0) { 181 // use the "invariant characters" conversion 182 int32_t destLength; 183 if(length <= capacity) { 184 destLength = length; 185 } else { 186 destLength = capacity; 187 } 188 u_UCharsToChars(getArrayStart() + start, target, destLength); 189 return u_terminateChars(target, capacity, length, &status); 190 } else { 191 converter = ucnv_open(codepage, &status); 192 } 193 194 length = doExtract(start, length, target, capacity, converter, status); 195 196 // close the converter 197 if (codepage == 0) { 198 u_releaseDefaultConverter(converter); 199 } else { 200 ucnv_close(converter); 201 } 202 203 return length; 204} 205 206int32_t 207UnicodeString::extract(char *dest, int32_t destCapacity, 208 UConverter *cnv, 209 UErrorCode &errorCode) const 210{ 211 if(U_FAILURE(errorCode)) { 212 return 0; 213 } 214 215 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 216 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 217 return 0; 218 } 219 220 // nothing to do? 221 if(isEmpty()) { 222 return u_terminateChars(dest, destCapacity, 0, &errorCode); 223 } 224 225 // get the converter 226 UBool isDefaultConverter; 227 if(cnv==0) { 228 isDefaultConverter=TRUE; 229 cnv=u_getDefaultConverter(&errorCode); 230 if(U_FAILURE(errorCode)) { 231 return 0; 232 } 233 } else { 234 isDefaultConverter=FALSE; 235 ucnv_resetFromUnicode(cnv); 236 } 237 238 // convert 239 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); 240 241 // release the converter 242 if(isDefaultConverter) { 243 u_releaseDefaultConverter(cnv); 244 } 245 246 return len; 247} 248 249int32_t 250UnicodeString::doExtract(int32_t start, int32_t length, 251 char *dest, int32_t destCapacity, 252 UConverter *cnv, 253 UErrorCode &errorCode) const 254{ 255 if(U_FAILURE(errorCode)) { 256 if(destCapacity!=0) { 257 *dest=0; 258 } 259 return 0; 260 } 261 262 const UChar *src=getArrayStart()+start, *srcLimit=src+length; 263 char *originalDest=dest; 264 const char *destLimit; 265 266 if(destCapacity==0) { 267 destLimit=dest=0; 268 } else if(destCapacity==-1) { 269 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. 270 destLimit=(char*)U_MAX_PTR(dest); 271 // for NUL-termination, translate into highest int32_t 272 destCapacity=0x7fffffff; 273 } else { 274 destLimit=dest+destCapacity; 275 } 276 277 // perform the conversion 278 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 279 length=(int32_t)(dest-originalDest); 280 281 // if an overflow occurs, then get the preflighting length 282 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 283 char buffer[1024]; 284 285 destLimit=buffer+sizeof(buffer); 286 do { 287 dest=buffer; 288 errorCode=U_ZERO_ERROR; 289 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 290 length+=(int32_t)(dest-buffer); 291 } while(errorCode==U_BUFFER_OVERFLOW_ERROR); 292 } 293 294 return u_terminateChars(originalDest, destCapacity, length, &errorCode); 295} 296 297void 298UnicodeString::doCodepageCreate(const char *codepageData, 299 int32_t dataLength, 300 const char *codepage) 301{ 302 // if there's nothing to convert, do nothing 303 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 304 return; 305 } 306 if(dataLength == -1) { 307 dataLength = (int32_t)uprv_strlen(codepageData); 308 } 309 310 UErrorCode status = U_ZERO_ERROR; 311 312 // create the converter 313 // if the codepage is the default, use our cache 314 // if it is an empty string, then use the "invariant character" conversion 315 UConverter *converter; 316 if (codepage == 0) { 317 const char *defaultName = ucnv_getDefaultName(); 318 if(UCNV_FAST_IS_UTF8(defaultName)) { 319 setToUTF8(StringPiece(codepageData, dataLength)); 320 return; 321 } 322 converter = u_getDefaultConverter(&status); 323 } else if(*codepage == 0) { 324 // use the "invariant characters" conversion 325 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { 326 u_charsToUChars(codepageData, getArrayStart(), dataLength); 327 setLength(dataLength); 328 } else { 329 setToBogus(); 330 } 331 return; 332 } else { 333 converter = ucnv_open(codepage, &status); 334 } 335 336 // if we failed, set the appropriate flags and return 337 if(U_FAILURE(status)) { 338 setToBogus(); 339 return; 340 } 341 342 // perform the conversion 343 doCodepageCreate(codepageData, dataLength, converter, status); 344 if(U_FAILURE(status)) { 345 setToBogus(); 346 } 347 348 // close the converter 349 if(codepage == 0) { 350 u_releaseDefaultConverter(converter); 351 } else { 352 ucnv_close(converter); 353 } 354} 355 356void 357UnicodeString::doCodepageCreate(const char *codepageData, 358 int32_t dataLength, 359 UConverter *converter, 360 UErrorCode &status) 361{ 362 if(U_FAILURE(status)) { 363 return; 364 } 365 366 // set up the conversion parameters 367 const char *mySource = codepageData; 368 const char *mySourceEnd = mySource + dataLength; 369 UChar *array, *myTarget; 370 371 // estimate the size needed: 372 int32_t arraySize; 373 if(dataLength <= US_STACKBUF_SIZE) { 374 // try to use the stack buffer 375 arraySize = US_STACKBUF_SIZE; 376 } else { 377 // 1.25 UChar's per source byte should cover most cases 378 arraySize = dataLength + (dataLength >> 2); 379 } 380 381 // we do not care about the current contents 382 UBool doCopyArray = FALSE; 383 for(;;) { 384 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { 385 setToBogus(); 386 break; 387 } 388 389 // perform the conversion 390 array = getArrayStart(); 391 myTarget = array + length(); 392 ucnv_toUnicode(converter, &myTarget, array + getCapacity(), 393 &mySource, mySourceEnd, 0, TRUE, &status); 394 395 // update the conversion parameters 396 setLength((int32_t)(myTarget - array)); 397 398 // allocate more space and copy data, if needed 399 if(status == U_BUFFER_OVERFLOW_ERROR) { 400 // reset the error code 401 status = U_ZERO_ERROR; 402 403 // keep the previous conversion results 404 doCopyArray = TRUE; 405 406 // estimate the new size needed, larger than before 407 // try 2 UChar's per remaining source byte 408 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); 409 } else { 410 break; 411 } 412 } 413} 414 415U_NAMESPACE_END 416 417#endif 418